From 3cb2c508a08ca45fc358cc28f523ff63bc3e846e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Tue, 10 Jul 2018 14:41:33 +0200
Subject: [PATCH] Add tentative WIP stats module

---
 stats/.gitignore            |   2 +
 stats/README.md             |  11 +++
 stats/__init__.py           |   0
 stats/fde_stats.py          |  82 +++++++++++++++++
 stats/gather_stats.py       |  60 ++++++++++++
 stats/pyelftools_overlay.py |  76 ++++++++++++++++
 stats/requirements.txt      |   2 +
 stats/stats_accu.py         | 179 ++++++++++++++++++++++++++++++++++++
 8 files changed, 412 insertions(+)
 create mode 100644 stats/.gitignore
 create mode 100644 stats/README.md
 create mode 100644 stats/__init__.py
 create mode 100755 stats/fde_stats.py
 create mode 100644 stats/gather_stats.py
 create mode 100644 stats/pyelftools_overlay.py
 create mode 100644 stats/requirements.txt
 create mode 100644 stats/stats_accu.py

diff --git a/stats/.gitignore b/stats/.gitignore
new file mode 100644
index 0000000..7a12b44
--- /dev/null
+++ b/stats/.gitignore
@@ -0,0 +1,2 @@
+venv
+elf_data
diff --git a/stats/README.md b/stats/README.md
new file mode 100644
index 0000000..33fef94
--- /dev/null
+++ b/stats/README.md
@@ -0,0 +1,11 @@
+# Statistical scripts
+
+Computes statistics about the call frame information (CFI) of system ELFs.
+
+## Setup
+
+```sh
+virtualenv -p python3 venv        # Do this only once
+source venv/bin/activate          # Do this in every new shell running these scripts
+pip install -r requirements.txt   # Do this only once
+```
diff --git a/stats/__init__.py b/stats/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/stats/fde_stats.py b/stats/fde_stats.py
new file mode 100755
index 0000000..38d384c
--- /dev/null
+++ b/stats/fde_stats.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+
+import gather_stats
+
+import argparse
+import sys
+
+
+class Config:
+    def __init__(self):
+        args = self.parse_args()
+        self._cores = args.cores
+        self.feature = args.feature
+
+        if args.feature == 'gather':
+            self.output = args.output
+
+        elif args.feature == 'analyze':
+            self.data_file = args.data_file
+
+    @property
+    def cores(self):
+        if self._cores <= 0:
+            return None
+        return self._cores
+
+    def parse_args(self):
+        parser = argparse.ArgumentParser(
+            description="Gather statistics about system-related ELFs")
+
+        parser.add_argument('--cores', '-j', default=1, type=int,
+                            help=("Use N cores for processing. Defaults to "
+                                  "1. Use 0 to use all available cores."))
+
+        subparsers = parser.add_subparsers(help='Subcommands')
+
+        # Gather stats
+        parser_gather = subparsers.add_parser(
+            'gather',
+            help=('Gather system data into a file, to allow multiple '
+                  'analyses without re-scanning the whole system.'))
+        parser_gather.set_defaults(feature='gather')
+        parser_gather.add_argument('--output', '-o',
+                                   default='elf_data',
+                                   help=('Output data to this file. Defaults '
+                                         'to "elf_data".'))
+
+        # Analyze stats
+        parser_analyze = subparsers.add_parser(
+            'analyze',
+            help='Analyze data gathered by a previous run.')
+        parser_analyze.set_defaults(feature='analyze')
+        parser_analyze.add_argument('data_file', nargs='?',
+                                    default='elf_data',
+                                    help=('Analyze this data file. Defaults '
+                                          'to "elf_data".'))
+        # TODO histogram?
+
+        out = parser.parse_args()
+        if 'feature' not in out:
+            print("No subcommand specified.", file=sys.stderr)
+            parser.print_usage(file=sys.stderr)
+            sys.exit(1)
+
+        return out
+
+
+def main():
+    config = Config()
+
+    if config.feature == 'gather':
+        stats_accu = gather_stats.gather_system_files(config)
+        stats_accu.serialize(config.output)
+
+    elif config.feature == 'analyze':
+        # TODO
+        print("Not implemented", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
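The `analyze` subcommand above is still a stub (it prints `Not implemented`).
A minimal sketch of what its branch of `main()` could look like, reusing
`StatsAccumulator.unserialize` and `report` from `stats_accu.py` below; this
wiring is an assumption, not part of the patch:

```python
# Hypothetical 'analyze' branch for main() in fde_stats.py.
# Assumes config.data_file was produced by a previous 'gather' run.
from stats_accu import StatsAccumulator

stats_accu = StatsAccumulator.unserialize(config.data_file)
stats_accu.report()  # Print the aggregated FDE statistics
```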
diff --git a/stats/gather_stats.py b/stats/gather_stats.py
new file mode 100644
index 0000000..cc63197
--- /dev/null
+++ b/stats/gather_stats.py
@@ -0,0 +1,60 @@
+from pyelftools_overlay import system_elfs
+import pathos.multiprocessing
+import signal
+import itertools
+
+from stats_accu import StatsAccumulator
+
+
+class FilesProcessor:
+    def __init__(self, cores, stats_accu=None):
+        self.stop_processing = False
+        self._processed_counter = itertools.count()
+        self.cores = cores
+
+        if stats_accu is None:
+            stats_accu = StatsAccumulator()
+        self.stats_accu = stats_accu
+
+    def stop_processing_now(self):
+        self.stop_processing = True
+
+    def next_counter(self):
+        return next(self._processed_counter)
+
+    def run(self, elf_list):
+        self.elf_count = len(elf_list)
+        with pathos.multiprocessing.ProcessPool(nodes=self.cores) as pool:
+            pool.map(self.process_single_file, elf_list)
+
+    def process_single_file(self, elf_path):
+        if self.stop_processing:
+            return
+
+        cur_file_count = self.next_counter()
+        print('> [{}/{} {:.0f}%] {}'.format(
+            cur_file_count, self.elf_count,
+            cur_file_count / self.elf_count * 100, elf_path))
+        self.stats_accu.process_file(elf_path)
+
+
+def gather_system_files(config):
+    stats_accu = StatsAccumulator()
+    processor = FilesProcessor(config.cores, stats_accu)
+
+    def signal_graceful_exit(sig, frame):
+        ''' Request a graceful stop after the current ELF '''
+        nonlocal processor
+
+        print("Stopping after this ELF…")
+        processor.stop_processing_now()
+
+    signal.signal(signal.SIGINT, signal_graceful_exit)
+
+    elf_list = []
+    for elf_path in system_elfs():
+        elf_list.append(elf_path)
+
+    processor.run(elf_list)
+
+    return stats_accu
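A caveat on the pool above: `ProcessPool.map` runs `process_single_file` in
separate worker processes, so each worker updates its own copy of
`stats_accu` (and of the `stop_processing` flag), and the accumulator handed
back by `gather_system_files` is never filled in by the workers. One possible
shape of a fix is sketched below: workers return a per-file accumulator and
the parent merges them. `StatsAccumulator.merge` is hypothetical here and
would still have to be written:

```python
# Sketch under the assumption that a StatsAccumulator.merge(other)
# helper exists (it does not in this patch): workers return fresh
# accumulators instead of mutating state that never leaves the worker.
def process_single_file(self, elf_path):
    accu = StatsAccumulator()
    accu.process_file(elf_path)  # Runs in the worker process
    return accu

def run(self, elf_list):
    self.elf_count = len(elf_list)
    with pathos.multiprocessing.ProcessPool(nodes=self.cores) as pool:
        for partial in pool.map(self.process_single_file, elf_list):
            self.stats_accu.merge(partial)  # Parent-side aggregation
```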
diff --git a/stats/pyelftools_overlay.py b/stats/pyelftools_overlay.py
new file mode 100644
index 0000000..76c7d04
--- /dev/null
+++ b/stats/pyelftools_overlay.py
@@ -0,0 +1,76 @@
+""" Overlay of pyelftools for quick access to what we want here """
+
+from elftools.elf.elffile import ELFFile
+from elftools.common.exceptions import ELFError, DWARFError
+import os
+
+
+def get_cfi(path):
+    ''' Get the CFI entries from the ELF at the provided path '''
+
+    try:
+        with open(path, 'rb') as file_handle:
+            elf_file = ELFFile(file_handle)
+
+            if not elf_file.has_dwarf_info():
+                return None
+
+            dw_info = elf_file.get_dwarf_info()
+            if dw_info.has_CFI():
+                cfis = dw_info.CFI_entries()
+            elif dw_info.has_EH_CFI():
+                cfis = dw_info.EH_CFI_entries()
+            else:
+                return None
+    except ELFError:
+        return None
+    except DWARFError:
+        return None
+    except PermissionError:
+        return None
+
+    return cfis
+
+
+def system_elfs():
+    ''' Iterator over system binaries and libraries '''
+
+    def readlink_rec(path):
+        if not os.path.islink(path):
+            return path
+
+        return readlink_rec(
+            os.path.join(os.path.dirname(path),
+                         os.readlink(path)))
+
+    sysbin_dirs = [
+        '/lib',
+        '/usr/lib',
+        '/usr/local/lib',
+        '/bin',
+        '/usr/bin',
+        '/usr/local/bin',
+        '/sbin',
+    ]
+    to_explore = sysbin_dirs
+
+    seen_elfs = set()
+
+    while to_explore:
+        bindir = to_explore.pop()
+
+        if not os.path.isdir(bindir):
+            continue
+
+        for direntry in os.scandir(bindir):
+            if not direntry.is_file():
+                if direntry.is_dir():
+                    to_explore.append(direntry.path)
+                continue
+
+            canonical_name = readlink_rec(direntry.path)
+            if canonical_name in seen_elfs:
+                continue
+
+            seen_elfs.add(canonical_name)
+            yield canonical_name
diff --git a/stats/requirements.txt b/stats/requirements.txt
new file mode 100644
index 0000000..545f4bd
--- /dev/null
+++ b/stats/requirements.txt
@@ -0,0 +1,2 @@
+git+https://github.com/eliben/pyelftools
+git+https://github.com/uqfoundation/pathos
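Both helpers above are importable on their own; a small standalone check of
`get_cfi` on a single binary (the `/bin/ls` path is only an illustration):

```python
# Count CIE and FDE entries in one ELF, using the overlay above.
from elftools.dwarf import callframe
from pyelftools_overlay import get_cfi

cfi = get_cfi('/bin/ls')  # Any ELF path will do
if cfi is not None:
    cies = [entry for entry in cfi if isinstance(entry, callframe.CIE)]
    fdes = [entry for entry in cfi if isinstance(entry, callframe.FDE)]
    print('{} CIEs, {} FDEs'.format(len(cies), len(fdes)))
```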
diff --git a/stats/stats_accu.py b/stats/stats_accu.py
new file mode 100644
index 0000000..3a2995c
--- /dev/null
+++ b/stats/stats_accu.py
@@ -0,0 +1,179 @@
+from elftools.dwarf import callframe
+from pyelftools_overlay import get_cfi
+from enum import Enum, auto
+import json
+import subprocess
+import re
+
+from math import ceil
+
+
+class ProportionFinder:
+    ''' Finds quantile values (median, etc.) on a dictionary mapping a value
+    to its occurrence count '''
+
+    def __init__(self, count_per_value):
+        self.cumulative = []
+        prev_count = 0
+        for key in sorted(count_per_value.keys()):
+            n_count = prev_count + count_per_value[key]
+            self.cumulative.append((key, n_count))
+            prev_count = n_count
+
+        self.elem_count = prev_count
+
+    def find_at_proportion(self, proportion):
+        if not self.cumulative:  # Empty list
+            return None
+
+        low_bound = ceil(self.elem_count * proportion)
+
+        def binsearch(beg, end):
+            med = ceil((beg + end) / 2)
+
+            if beg + 1 == end:
+                return self.cumulative[beg][0]
+
+            if self.cumulative[med - 1][1] < low_bound:
+                return binsearch(med, end)
+            return binsearch(beg, med)
+
+        return binsearch(0, len(self.cumulative))
+
+
+def elf_so_deps(path):
+    ''' Get the list of shared object dependencies of the given ELF object.
+    This is obtained by running `ldd`. '''
+
+    deps_list = []
+
+    try:
+        ldd_output = subprocess.check_output(['/usr/bin/ldd', path]) \
+            .decode('utf-8')
+        ldd_re = re.compile(r'^.* => (.*) \(0x[0-9a-fA-F]*\)$')
+
+        ldd_lines = ldd_output.strip().split('\n')
+        for line in ldd_lines:
+            line = line.strip()
+            match = ldd_re.match(line)
+            if match is None:
+                continue  # Ignore this line: it may be e.g. linux-vdso
+            deps_list.append(match.group(1))
+
+        return deps_list
+
+    except subprocess.CalledProcessError as exn:
+        raise Exception(
+            ("Cannot get dependencies for {}: ldd terminated with exit code "
+             "{}.").format(path, exn.returncode))
+
+
+class ElfType(Enum):
+    ELF_LIB = auto()
+    ELF_BINARY = auto()
+
+
+class SingleFdeData:
+    def __init__(self, path, elf_type, data):
+        self.path = path
+        self.elf_type = elf_type
+        self.data = data
+
+        self.gather_deps()
+
+    def gather_deps(self):
+        """ Collect ldd data on the binary """
+        self.deps = elf_so_deps(self.path)
+
+
+class StatsAccumulator:
+    def __init__(self):
+        self.elf_count = 0
+        self.fde_count = 0
+        self.fde_row_count = 0
+        self.fde_with_n_rows = {}
+
+    def serialize(self, path):
+        ''' Save the gathered data as JSON to the file at `path` '''
+
+        notable_fields = [
+            'elf_count',
+            'fde_count',
+            'fde_row_count',
+            'fde_with_n_rows',
+        ]
+        out = {}
+        for field in notable_fields:
+            out[field] = self.__dict__[field]
+
+        with open(path, 'w') as stream:
+            json.dump(out, stream)
+
+    @staticmethod
+    def unserialize(path):
+        out = StatsAccumulator()
+        with open(path, 'r') as stream:
+            data = json.load(stream)
+        for field in data:
+            setattr(out, field, data[field])
+        # JSON stringifies int dict keys; map them back to ints
+        out.fde_with_n_rows = {
+            int(key): count for key, count in out.fde_with_n_rows.items()}
+        return out
+
+    def report(self):
+        ''' Report on the statistics gathered '''
+
+        fde_rows_proportion = ProportionFinder(
+            self.fde_with_n_rows)
+
+        rows = [
+            ("ELFs analyzed", self.elf_count),
+            ("FDEs analyzed", self.fde_count),
+            ("FDE rows analyzed", self.fde_row_count),
+            ("Avg. rows per FDE", self.fde_row_count / self.fde_count),
+            ("Median rows per FDE",
+             fde_rows_proportion.find_at_proportion(0.5)),
+            ("Max rows per FDE", max(self.fde_with_n_rows.keys())),
+        ]
+
+        title_size = max(len(row[0]) for row in rows)
+        line_format = "{:<" + str(title_size + 1) + "} {}"
+
+        for row in rows:
+            print(line_format.format(row[0], row[1]))
+
+    def process_file(self, path):
+        ''' Process a single ELF file '''
+
+        cfi = get_cfi(path)
+        if not cfi:
+            return
+
+        self.elf_count += 1
+
+        for entry in cfi:
+            if isinstance(entry, callframe.CIE):
+                self.process_cie(entry)
+            elif isinstance(entry, callframe.FDE):
+                self.process_fde(entry)
+
+    def incr_cell(self, table, key):
+        ''' Increment table[key], or set it to 1 if unset '''
+        if key in table:
+            table[key] += 1
+        else:
+            table[key] = 1
+
+    def process_cie(self, cie):
+        ''' Process a CIE '''
+        pass  # Nothing needed from a CIE
+
+    def process_fde(self, fde):
+        ''' Process an FDE '''
+        self.fde_count += 1
+
+        decoded = fde.get_decoded()
+        row_count = len(decoded.table)
+        self.fde_row_count += row_count
+        self.incr_cell(self.fde_with_n_rows, row_count)
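`ProportionFinder` computes quantiles directly on the histogram-shaped
`fde_with_n_rows` mapping, without flattening it into one entry per FDE. A
quick worked example with made-up counts:

```python
from stats_accu import ProportionFinder

# 3 FDEs with 1 row, 2 with 4 rows, 1 with 10 rows: 6 values in all,
# i.e. the sorted flat list would be [1, 1, 1, 4, 4, 10].
finder = ProportionFinder({1: 3, 4: 2, 10: 1})
print(finder.find_at_proportion(0.5))  # -> 1  (3rd of 6 values)
print(finder.find_at_proportion(0.9))  # -> 10 (6th of 6 values)
```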