Add tentative WIP stats module

This commit is contained in:
Théophile Bastian 2018-07-10 14:41:33 +02:00
parent d93d2c2f6e
commit 3cb2c508a0
8 changed files with 410 additions and 0 deletions

2
stats/.gitignore vendored Normal file
View file

@@ -0,0 +1,2 @@
venv
elf_data

11
stats/README.md Normal file
View file

@@ -0,0 +1,11 @@
# Statistical scripts
Computes statistics about the DWARF call frame information (CFI/FDE entries) found in the ELF binaries and shared libraries installed on the system.
## Setup
```sh
virtualenv -p python3 venv # Do this only once
source venv/bin/activate  # Do this in every new shell that runs the script
pip install -r requirements.txt # Do this only once
```
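## Usage
The entry point is `fde_stats.py`. Based on its argument parser (below), a typical session might look like this; note that `analyze` is still a stub:
```sh
./fde_stats.py -j 4 gather -o elf_data   # scan system ELFs using 4 cores
./fde_stats.py analyze elf_data          # report on gathered data (not implemented yet)
```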

0
stats/__init__.py Normal file
View file

82
stats/fde_stats.py Executable file
View file

@@ -0,0 +1,82 @@
#!/usr/bin/env python3
import gather_stats
import argparse
import sys
class Config:
def __init__(self):
args = self.parse_args()
self._cores = args.cores
self.feature = args.feature
if args.feature == 'gather':
self.output = args.output
elif args.feature == 'analyze':
self.data_file = args.data_file
@property
def cores(self):
if self._cores <= 0:
return None
return self._cores
def parse_args(self):
parser = argparse.ArgumentParser(
description="Gather statistics about system-related ELFs")
        parser.add_argument('--cores', '-j', default=1, type=int,
                            help=("Use N cores for processing. Defaults to "
                                  "1. Use 0 to use all available cores."))
subparsers = parser.add_subparsers(help='Subcommands')
# Gather stats
parser_gather = subparsers.add_parser(
'gather',
help=('Gather system data into a file, to allow multiple '
'analyses without re-scanning the whole system.'))
parser_gather.set_defaults(feature='gather')
parser_gather.add_argument('--output', '-o',
default='elf_data',
help=('Output data to this file. Defaults '
'to "elf_data"'))
# Analyze stats
parser_analyze = subparsers.add_parser(
'analyze',
help='Analyze data gathered by a previous run.')
parser_analyze.set_defaults(feature='analyze')
        parser_analyze.add_argument('data_file',
                                    nargs='?',
                                    default='elf_data',
                                    help=('Analyze this data file. Defaults '
                                          'to "elf_data".'))
# TODO histogram?
out = parser.parse_args()
if 'feature' not in out:
print("No subcommand specified.", file=sys.stderr)
parser.print_usage(file=sys.stderr)
sys.exit(1)
return out
def main():
config = Config()
if config.feature == 'gather':
stats_accu = gather_stats.gather_system_files(config)
stats_accu.serialize(config.output)
elif config.feature == 'analyze':
# TODO
print("Not implemented", file=sys.stderr)
sys.exit(1)
if __name__ == '__main__':
main()

60
stats/gather_stats.py Normal file
View file

@@ -0,0 +1,60 @@
from pyelftools_overlay import system_elfs
import pathos.multiprocessing
import signal
import itertools
from stats_accu import StatsAccumulator
class FilesProcessor:
def __init__(self, cores, stats_accu=None):
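        # NOTE: pathos runs process_single_file in separate worker processes,
        # each holding its own copy of this object, so the flag and counter
        # below are per-worker state rather than shared across the pool.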
self.stop_processing = False
self._processed_counter = itertools.count()
self.cores = cores
if stats_accu is None:
stats_accu = StatsAccumulator()
self.stats_accu = stats_accu
def stop_processing_now(self):
self.stop_processing = True
def next_counter(self):
return self._processed_counter.__next__()
def run(self, elf_list):
self.elf_count = len(elf_list)
with pathos.multiprocessing.ProcessPool(nodes=self.cores) as pool:
pool.map(self.process_single_file, elf_list)
def process_single_file(self, elf_path):
if self.stop_processing:
return
cur_file_count = self.next_counter()
print('> [{}/{} {:.0f}%] {}'.format(
cur_file_count, self.elf_count,
cur_file_count / self.elf_count * 100, elf_path))
self.stats_accu.process_file(elf_path)
def gather_system_files(config):
stats_accu = StatsAccumulator()
processor = FilesProcessor(config.cores, stats_accu)
def signal_graceful_exit(sig, frame):
''' Stop gracefully now '''
nonlocal processor
print("Stopping after this ELF…")
processor.stop_processing_now()
signal.signal(signal.SIGINT, signal_graceful_exit)
elf_list = []
for elf_path in system_elfs():
elf_list.append(elf_path)
processor.run(elf_list)
return stats_accu

76
stats/pyelftools_overlay.py Normal file
View file

@@ -0,0 +1,76 @@
""" Overlay of PyElfTools for quick access to what we want here """
from elftools.elf.elffile import ELFFile
from elftools.common.exceptions import ELFError, DWARFError
import os
def get_cfi(path):
''' Get the CFI entries from the ELF at the provided path '''
try:
with open(path, 'rb') as file_handle:
elf_file = ELFFile(file_handle)
if not elf_file.has_dwarf_info():
return None
dw_info = elf_file.get_dwarf_info()
if dw_info.has_CFI():
cfis = dw_info.CFI_entries()
elif dw_info.has_EH_CFI():
cfis = dw_info.EH_CFI_entries()
else:
return None
except ELFError:
return None
except DWARFError:
return None
except PermissionError:
return None
return cfis
def system_elfs():
    ''' Iterator over the system's binaries and shared libraries (every
    regular file found under the usual bin/ and lib/ directories) '''
def readlink_rec(path):
if not os.path.islink(path):
return path
return readlink_rec(
os.path.join(os.path.dirname(path),
os.readlink(path)))
sysbin_dirs = [
'/lib',
'/usr/lib',
'/usr/local/lib',
'/bin',
'/usr/bin',
'/usr/local/bin',
'/sbin',
]
to_explore = sysbin_dirs
seen_elfs = set()
while to_explore:
bindir = to_explore.pop()
if not os.path.isdir(bindir):
continue
for direntry in os.scandir(bindir):
if not direntry.is_file():
if direntry.is_dir():
to_explore.append(direntry.path)
continue
canonical_name = readlink_rec(direntry.path)
if canonical_name in seen_elfs:
continue
seen_elfs.add(canonical_name)
yield canonical_name

2
stats/requirements.txt Normal file
View file

@@ -0,0 +1,2 @@
git+https://github.com/eliben/pyelftools
git+https://github.com/uqfoundation/pathos

177
stats/stats_accu.py Normal file
View file

@@ -0,0 +1,177 @@
from elftools.dwarf import callframe
from pyelftools_overlay import get_cfi
from enum import Enum, auto
import json
import subprocess
import re
from math import ceil
class ProportionFinder:
    ''' Computes order statistics (median, percentiles, …) from a dictionary
    mapping each value to its occurrence count '''
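    # Example: ProportionFinder({1: 3, 2: 1}) represents the multiset
    # [1, 1, 1, 2]; find_at_proportion(0.5) then returns 1, the value at
    # the 50% mark.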
def __init__(self, count_per_value):
self.cumulative = []
prev_count = 0
for key in sorted(count_per_value.keys()):
n_count = prev_count + count_per_value[key]
self.cumulative.append(
(key, n_count))
prev_count = n_count
self.elem_count = prev_count
def find_at_proportion(self, proportion):
if not self.cumulative: # Empty list
return None
low_bound = ceil(self.elem_count * proportion)
def binsearch(beg, end):
med = ceil((beg + end) / 2)
if beg + 1 == end:
return self.cumulative[beg][0]
if self.cumulative[med - 1][1] < low_bound:
return binsearch(med, end)
return binsearch(beg, med)
return binsearch(0, len(self.cumulative))
def elf_so_deps(path):
    ''' Get the list of shared object dependencies of the given ELF object,
    as reported by `ldd`. '''
deps_list = []
try:
ldd_output = subprocess.check_output(['/usr/bin/ldd', path]) \
.decode('utf-8')
ldd_re = re.compile(r'^.* => (.*) \(0x[0-9a-fA-F]*\)$')
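        # A typical matching ldd line looks like (illustrative):
        #   "libfoo.so.1 => /usr/lib/libfoo.so.1 (0x00007f...)"
        # from which group(1) extracts the resolved path.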
ldd_lines = ldd_output.strip().split('\n')
for line in ldd_lines:
line = line.strip()
match = ldd_re.match(line)
if match is None:
                continue  # Ignore this line; it might be e.g. linux-vdso
deps_list.append(match.group(1))
return deps_list
except subprocess.CalledProcessError as exn:
raise Exception(
("Cannot get dependencies for {}: ldd terminated with exit code "
"{}.").format(path, exn.returncode))
class ElfType(Enum):
ELF_LIB = auto()
ELF_BINARY = auto()
class SingleFdeData:
def __init__(self, path, elf_type, data):
self.path = path
self.elf_type = elf_type
self.data = data
self.gather_deps()
def gather_deps(self):
""" Collect ldd data on the binary """
self.deps = elf_so_deps(self.path)
class StatsAccumulator:
def __init__(self):
self.elf_count = 0
self.fde_count = 0
self.fde_row_count = 0
self.fde_with_n_rows = {}
def serialize(self, path):
        ''' Save the gathered data as JSON to the file at `path` '''
notable_fields = [
'elf_count',
'fde_count',
'fde_row_count',
'fde_with_n_rows',
]
out = {}
for field in notable_fields:
out[field] = self.__dict__[field]
        with open(path, 'w') as stream:
            json.dump(out, stream)
@staticmethod
def unserialize(path):
out = StatsAccumulator()
        with open(path, 'r') as stream:
            data = json.load(stream)
        for field in data:
            # Note: JSON object keys are strings, so e.g. fde_with_n_rows
            # comes back with string keys here.
            setattr(out, field, data[field])
return out
def report(self):
''' Report on the statistics gathered '''
self.fde_rows_proportion = ProportionFinder(
self.fde_with_n_rows)
rows = [
("ELFs analyzed", self.elf_count),
("FDEs analyzed", self.fde_count),
("FDE rows analyzed", self.fde_row_count),
("Avg. rows per FDE", self.fde_row_count / self.fde_count),
("Median rows per FDE",
self.fde_rows_proportion.find_at_proportion(0.5)),
("Max rows per FDE", max(self.fde_with_n_rows.keys())),
]
title_size = max(map(lambda x: len(x[0]), rows))
line_format = "{:<" + str(title_size + 1) + "} {}"
for row in rows:
print(line_format.format(row[0], row[1]))
def process_file(self, path):
''' Process a single file '''
cfi = get_cfi(path)
if not cfi:
return
self.elf_count += 1
for entry in cfi:
if isinstance(entry, callframe.CIE): # Is a CIE
self.process_cie(entry)
elif isinstance(entry, callframe.FDE): # Is a FDE
self.process_fde(entry)
def incr_cell(self, table, key):
''' Increments table[key], or sets it to 1 if unset '''
if key in table:
table[key] += 1
else:
table[key] = 1
def process_cie(self, cie):
''' Process a CIE '''
pass # Nothing needed from a CIE
def process_fde(self, fde):
''' Process a FDE '''
self.fde_count += 1
decoded = fde.get_decoded()
row_count = len(decoded.table)
self.fde_row_count += row_count
self.incr_cell(self.fde_with_n_rows, row_count)