Add tentative WIP stats module

parent d93d2c2f6e · commit 3cb2c508a0

8 changed files with 410 additions and 0 deletions

stats/.gitignore (vendored, new file, +2)

@@ -0,0 +1,2 @@
venv
elf_data

stats/README.md (new file, +11)

@@ -0,0 +1,11 @@
# Statistical scripts

Computes statistics about the call frame information (CFI/FDEs) of system ELF binaries and libraries.

## Setup

```sh
virtualenv -p python3 venv          # Do this only once
source venv/bin/activate            # Do this in every new shell running the script
pip install -r requirements.txt    # Do this only once
```
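
Editor's note, not part of the committed README: based on the command-line interface defined in stats/fde_stats.py below, a typical session would presumably look like this (the `analyze` subcommand is still a stub in this commit):

```sh
./fde_stats.py -j 0 gather -o elf_data   # scan all system ELFs, using every core
./fde_stats.py analyze elf_data          # analyze the gathered data (not implemented yet)
```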

stats/__init__.py (new file, empty)


stats/fde_stats.py (new executable file, +82)

@@ -0,0 +1,82 @@
#!/usr/bin/env python3

import gather_stats

import argparse
import sys


class Config:
    def __init__(self):
        args = self.parse_args()
        self._cores = args.cores
        self.feature = args.feature

        if args.feature == 'gather':
            self.output = args.output

        elif args.feature == 'analyze':
            self.data_file = args.data_file

    @property
    def cores(self):
        if self._cores <= 0:
            return None
        return self._cores

    def parse_args(self):
        parser = argparse.ArgumentParser(
            description="Gather statistics about system-related ELFs")

        parser.add_argument('--cores', '-j', default=1, type=int,
                            help=("Use N cores for processing. Defaults to "
                                  "1. 0 to use up all cores."))

        subparsers = parser.add_subparsers(help='Subcommands')

        # Gather stats
        parser_gather = subparsers.add_parser(
            'gather',
            help=('Gather system data into a file, to allow multiple '
                  'analyses without re-scanning the whole system.'))
        parser_gather.set_defaults(feature='gather')
        parser_gather.add_argument('--output', '-o',
                                   default='elf_data',
                                   help=('Output data to this file. Defaults '
                                         'to "elf_data".'))

        # Analyze stats
        parser_analyze = subparsers.add_parser(
            'analyze',
            help='Analyze data gathered by a previous run.')
        parser_analyze.set_defaults(feature='analyze')
        parser_analyze.add_argument('data_file',
                                    nargs='?',
                                    default='elf_data',
                                    help=('Analyze this data file. Defaults '
                                          'to "elf_data".'))
        # TODO histogram?

        out = parser.parse_args()
        if 'feature' not in out:
            print("No subcommand specified.", file=sys.stderr)
            parser.print_usage(file=sys.stderr)
            sys.exit(1)

        return out


def main():
    config = Config()

    if config.feature == 'gather':
        stats_accu = gather_stats.gather_system_files(config)
        stats_accu.serialize(config.output)

    elif config.feature == 'analyze':
        # TODO
        print("Not implemented", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

stats/gather_stats.py (new file, +60)

@@ -0,0 +1,60 @@
from pyelftools_overlay import system_elfs
import pathos.multiprocessing
import signal
import itertools

from stats_accu import StatsAccumulator


class FilesProcessor:
    def __init__(self, cores, stats_accu=None):
        self.stop_processing = False
        self._processed_counter = itertools.count()
        self.cores = cores

        if stats_accu is None:
            stats_accu = StatsAccumulator()
        self.stats_accu = stats_accu

    def stop_processing_now(self):
        self.stop_processing = True

    def next_counter(self):
        return next(self._processed_counter)

    def run(self, elf_list):
        self.elf_count = len(elf_list)
        with pathos.multiprocessing.ProcessPool(nodes=self.cores) as pool:
            pool.map(self.process_single_file, elf_list)

    def process_single_file(self, elf_path):
        if self.stop_processing:
            return

        cur_file_count = self.next_counter()
        print('> [{}/{} {:.0f}%] {}'.format(
            cur_file_count, self.elf_count,
            cur_file_count / self.elf_count * 100, elf_path))
        self.stats_accu.process_file(elf_path)


def gather_system_files(config):
    stats_accu = StatsAccumulator()
    processor = FilesProcessor(config.cores, stats_accu)

    def signal_graceful_exit(sig, frame):
        ''' Stop gracefully now '''
        nonlocal processor

        print("Stopping after this ELF…")
        processor.stop_processing_now()

    signal.signal(signal.SIGINT, signal_graceful_exit)

    elf_list = []
    for elf_path in system_elfs():
        elf_list.append(elf_path)

    processor.run(elf_list)

    return stats_accu
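
Editor's note, not part of the commit: ProcessPool workers receive their own pickled copy of the FilesProcessor, so the counts accumulated inside process_single_file never reach the stats_accu object that gather_system_files returns. A minimal sketch of one possible direction, assuming per-file accumulators are cheap to serialize and that a merge helper would be added to StatsAccumulator later:

```python
# Hypothetical sketch only: build one accumulator per file inside the worker
# and ship it back through the pool, merging in the parent afterwards.
import pathos.multiprocessing

from stats_accu import StatsAccumulator


def gather_merged(elf_list, cores):
    def process_one(elf_path):
        accu = StatsAccumulator()   # fresh accumulator, local to the worker
        accu.process_file(elf_path)
        return accu                 # returned to the parent by pool.map

    with pathos.multiprocessing.ProcessPool(nodes=cores) as pool:
        per_file = pool.map(process_one, elf_list)

    # A StatsAccumulator.merge() helper (not written yet) would fold the
    # per-file results into a single accumulator here.
    return per_file
```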

stats/pyelftools_overlay.py (new file, +76)

@@ -0,0 +1,76 @@
""" Overlay of PyElfTools for quick access to what we want here """

from elftools.elf.elffile import ELFFile
from elftools.common.exceptions import ELFError, DWARFError
import os


def get_cfi(path):
    ''' Get the CFI entries from the ELF at the provided path '''

    try:
        with open(path, 'rb') as file_handle:
            elf_file = ELFFile(file_handle)

            if not elf_file.has_dwarf_info():
                return None

            dw_info = elf_file.get_dwarf_info()
            if dw_info.has_CFI():
                cfis = dw_info.CFI_entries()
            elif dw_info.has_EH_CFI():
                cfis = dw_info.EH_CFI_entries()
            else:
                return None
    except ELFError:
        return None
    except DWARFError:
        return None
    except PermissionError:
        return None

    return cfis


def system_elfs():
    ''' Iterator over system libraries '''

    def readlink_rec(path):
        if not os.path.islink(path):
            return path

        return readlink_rec(
            os.path.join(os.path.dirname(path),
                         os.readlink(path)))

    sysbin_dirs = [
        '/lib',
        '/usr/lib',
        '/usr/local/lib',
        '/bin',
        '/usr/bin',
        '/usr/local/bin',
        '/sbin',
    ]
    to_explore = sysbin_dirs

    seen_elfs = set()

    while to_explore:
        bindir = to_explore.pop()

        if not os.path.isdir(bindir):
            continue

        for direntry in os.scandir(bindir):
            if not direntry.is_file():
                if direntry.is_dir():
                    to_explore.append(direntry.path)
                continue

            canonical_name = readlink_rec(direntry.path)
            if canonical_name in seen_elfs:
                continue

            seen_elfs.add(canonical_name)
            yield canonical_name
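
As an illustration (not part of the commit), the two helpers above already compose into a quick scan of how many system ELFs carry call frame information; gather_stats.py automates essentially this with a process pool:

```python
# Minimal sketch using only the helpers defined in pyelftools_overlay.py.
from pyelftools_overlay import system_elfs, get_cfi

total = 0
with_cfi = 0
for path in system_elfs():
    total += 1
    if get_cfi(path) is not None:   # None means no (readable) CFI data
        with_cfi += 1

print('{} of {} system ELFs expose CFI data'.format(with_cfi, total))
```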

stats/requirements.txt (new file, +2)

@@ -0,0 +1,2 @@
git+https://github.com/eliben/pyelftools
git+https://github.com/uqfoundation/pathos

stats/stats_accu.py (new file, +177)

@@ -0,0 +1,177 @@
from elftools.dwarf import callframe
from pyelftools_overlay import get_cfi
from enum import Enum, auto
import json
import subprocess
import re

from math import ceil


class ProportionFinder:
    ''' Finds figures such as the median on the original structure of a
    dictionary mapping a value to its occurrence count '''

    def __init__(self, count_per_value):
        self.cumulative = []
        prev_count = 0
        for key in sorted(count_per_value.keys()):
            n_count = prev_count + count_per_value[key]
            self.cumulative.append(
                (key, n_count))
            prev_count = n_count

        self.elem_count = prev_count

    def find_at_proportion(self, proportion):
        if not self.cumulative:  # Empty list
            return None

        low_bound = ceil(self.elem_count * proportion)

        def binsearch(beg, end):
            med = ceil((beg + end) / 2)

            if beg + 1 == end:
                return self.cumulative[beg][0]

            if self.cumulative[med - 1][1] < low_bound:
                return binsearch(med, end)
            return binsearch(beg, med)

        return binsearch(0, len(self.cumulative))


def elf_so_deps(path):
    ''' Get the list of shared object dependencies of the given ELF object.
    This is obtained by running `ldd`. '''

    deps_list = []

    try:
        ldd_output = subprocess.check_output(['/usr/bin/ldd', path]) \
            .decode('utf-8')
        ldd_re = re.compile(r'^.* => (.*) \(0x[0-9a-fA-F]*\)$')

        ldd_lines = ldd_output.strip().split('\n')
        for line in ldd_lines:
            line = line.strip()
            match = ldd_re.match(line)
            if match is None:
                continue  # Just ignore that line — it might be eg. linux-vdso
            deps_list.append(match.group(1))

        return deps_list

    except subprocess.CalledProcessError as exn:
        raise Exception(
            ("Cannot get dependencies for {}: ldd terminated with exit code "
             "{}.").format(path, exn.returncode))


class ElfType(Enum):
    ELF_LIB = auto()
    ELF_BINARY = auto()


class SingleFdeData:
    def __init__(self, path, elf_type, data):
        self.path = path
        self.elf_type = elf_type
        self.data = data

        self.gather_deps()

    def gather_deps(self):
        """ Collect ldd data on the binary """
        self.deps = elf_so_deps(self.path)


class StatsAccumulator:
    def __init__(self):
        self.elf_count = 0
        self.fde_count = 0
        self.fde_row_count = 0
        self.fde_with_n_rows = {}

    def serialize(self, path):
        ''' Save the gathered data to the file at `path` '''

        notable_fields = [
            'elf_count',
            'fde_count',
            'fde_row_count',
            'fde_with_n_rows',
        ]
        out = {}
        for field in notable_fields:
            out[field] = self.__dict__[field]

        with open(path, 'w') as stream:  # json.dump expects a text stream
            json.dump(out, stream)

    @staticmethod
    def unserialize(path):
        out = StatsAccumulator()
        with open(path, 'r') as stream:  # read back the JSON written above
            data = json.load(stream)
        for field in data:
            setattr(out, field, data[field])
        return out

    def report(self):
        ''' Report on the statistics gathered '''

        self.fde_rows_proportion = ProportionFinder(
            self.fde_with_n_rows)

        rows = [
            ("ELFs analyzed", self.elf_count),
            ("FDEs analyzed", self.fde_count),
            ("FDE rows analyzed", self.fde_row_count),
            ("Avg. rows per FDE", self.fde_row_count / self.fde_count),
            ("Median rows per FDE",
             self.fde_rows_proportion.find_at_proportion(0.5)),
            ("Max rows per FDE", max(self.fde_with_n_rows.keys())),
        ]

        title_size = max(map(lambda x: len(x[0]), rows))
        line_format = "{:<" + str(title_size + 1) + "} {}"

        for row in rows:
            print(line_format.format(row[0], row[1]))

    def process_file(self, path):
        ''' Process a single file '''

        cfi = get_cfi(path)
        if not cfi:
            return

        self.elf_count += 1

        for entry in cfi:
            if isinstance(entry, callframe.CIE):  # Is a CIE
                self.process_cie(entry)
            elif isinstance(entry, callframe.FDE):  # Is an FDE
                self.process_fde(entry)

    def incr_cell(self, table, key):
        ''' Increments table[key], or sets it to 1 if unset '''
        if key in table:
            table[key] += 1
        else:
            table[key] = 1

    def process_cie(self, cie):
        ''' Process a CIE '''
        pass  # Nothing needed from a CIE

    def process_fde(self, fde):
        ''' Process an FDE '''
        self.fde_count += 1

        decoded = fde.get_decoded()
        row_count = len(decoded.table)
        self.fde_row_count += row_count
        self.incr_cell(self.fde_with_n_rows, row_count)
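
To make ProportionFinder concrete, here is a small illustration (not part of the commit) on a toy occurrence-count table of the same shape as fde_with_n_rows:

```python
# Three FDEs with 1 row each, one FDE with 4 rows, one with 10 rows.
finder = ProportionFinder({1: 3, 4: 1, 10: 1})
print(finder.find_at_proportion(0.5))   # median rows per FDE -> 1
print(finder.find_at_proportion(0.9))   # 90th percentile -> 10
```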