Add tentative WIP stats module

2018-07-10 14:41:33 +02:00 · 2018-07-10 14:41:33 +02:00 · 3cb2c508a0
commit 3cb2c508a0
parent d93d2c2f6e
8 changed files with 410 additions and 0 deletions
--- a/stats/.gitignore
+++ b/stats/.gitignore
@ -0,0 +1,2 @@
 venv
 elf_data
--- a/stats/README.md
+++ b/stats/README.md
@ -0,0 +1,11 @@
 # Statistical scripts
 Computes statistics about the ELF files installed on the system (for now: number of ELFs, number of FDEs, and number of rows per FDE).
 ## Setup
 ```sh
  virtualenv -p python3 venv  # Do this only once
  source venv/bin/activate  # Do this in every new shell used to run the script
  pip install -r requirements.txt  # Do this only once
 ```
--- a/stats/init.py
+++ b/stats/init.py
--- a/stats/fde_stats.py
+++ b/stats/fde_stats.py
@ -0,0 +1,82 @@
 #!/usr/bin/env python3
 import gather_stats
 import argparse
 import sys
 class Config:
    ''' Command-line configuration of the statistics script.

    Attributes set after construction:
      feature    -- 'gather' or 'analyze', the selected subcommand
      output     -- (gather only) path of the file to write
      data_file  -- (analyze only) path of the file to read
      cores      -- see the `cores` property
    '''

    def __init__(self, argv=None):
        ''' Parse the command line and store the selected options.

        argv: list of arguments to parse; when None, argparse falls back to
        the process' own command line. '''
        args = self.parse_args(argv)
        self._cores = args.cores
        self.feature = args.feature
        if args.feature == 'gather':
            self.output = args.output
        elif args.feature == 'analyze':
            self.data_file = args.data_file

    @property
    def cores(self):
        ''' Number of cores requested, or None when the user asked for a
        non-positive number (documented as "use up all cores"). '''
        if self._cores <= 0:
            return None
        return self._cores

    def parse_args(self, argv=None):
        ''' Build the argument parser and run it on `argv`.

        Returns the argparse namespace. Prints the usage on stderr and exits
        with status 1 when no subcommand was given. '''
        parser = argparse.ArgumentParser(
            description="Gather statistics about system-related ELFs")
        parser.add_argument('--cores', '-j', default=1, type=int,
                            help=("Use N cores for processing. Defaults to "
                                  "1. 0 to use up all cores."))
        subparsers = parser.add_subparsers(help='Subcommands')
        # Gather stats
        parser_gather = subparsers.add_parser(
            'gather',
            help=('Gather system data into a file, to allow multiple '
                  'analyses without re-scanning the whole system.'))
        parser_gather.set_defaults(feature='gather')
        parser_gather.add_argument('--output', '-o',
                                   default='elf_data',
                                   help=('Output data to this file. Defaults '
                                         'to "elf_data"'))
        # Analyze stats
        parser_analyze = subparsers.add_parser(
            'analyze',
            help='Analyze data gathered by a previous run.')
        parser_analyze.set_defaults(feature='analyze')
        # nargs='?' is what makes the positional optional: without it,
        # argparse requires the argument and `default` is never used.
        parser_analyze.add_argument('data_file',
                                    nargs='?',
                                    default='elf_data',
                                    help=('Analyze this data file. Defaults '
                                          'to "elf_data".'))
        # TODO histogram?
        out = parser.parse_args(argv)
        # `feature` is only set through the subparsers' set_defaults, so its
        # absence means that no subcommand was selected.
        if 'feature' not in out:
            print("No subcommand specified.", file=sys.stderr)
            parser.print_usage(file=sys.stderr)
            sys.exit(1)
        return out
 def main():
    ''' Script entry point: read the configuration, then run the selected
    subcommand. 'gather' scans the system and writes the accumulated
    statistics to the configured output file; 'analyze' is not implemented
    yet and exits with status 1. '''
    settings = Config()
    selected = settings.feature
    if selected == 'analyze':
        # TODO
        print("Not implemented", file=sys.stderr)
        sys.exit(1)
    if selected == 'gather':
        gathered = gather_stats.gather_system_files(settings)
        gathered.serialize(settings.output)
 # Run the command-line entry point only when this file is executed as a
 # script, not when it is imported.
 if __name__ == '__main__':
    main()
--- a/stats/gather_stats.py
+++ b/stats/gather_stats.py
@ -0,0 +1,60 @@
 from pyelftools_overlay import system_elfs
 import pathos
 import signal
 import itertools
 from stats_accu import StatsAccumulator
 class FilesProcessor:
    ''' Feeds a list of files to a statistics accumulator through a pool of
    worker processes, printing a progress line for each file.

    NOTE(review): `run` hands a bound method to a process pool. Each worker
    presumably works on its own copy of this object, in which case the
    counter, the stop flag and the accumulator updated in the workers are not
    the ones held by the caller -- confirm, and merge results if needed.
    '''

    def __init__(self, cores, stats_accu=None):
        ''' cores: number of pool nodes, passed as-is to the pool.
        stats_accu: accumulator to fill; a fresh StatsAccumulator is created
        when None. '''
        self.stop_processing = False
        self._processed_counter = itertools.count()
        self.cores = cores
        # Overwritten by `run`; defined here so that `process_single_file`
        # can be called on its own without raising AttributeError.
        self.elf_count = 0
        if stats_accu is None:
            stats_accu = StatsAccumulator()
        self.stats_accu = stats_accu

    def stop_processing_now(self):
        ''' Ask for the remaining files to be skipped. '''
        self.stop_processing = True

    def next_counter(self):
        ''' Return the next value of the processed-files counter (from 0). '''
        return next(self._processed_counter)

    def run(self, elf_list):
        ''' Process every path of `elf_list` using a pool of `cores` nodes. '''
        self.elf_count = len(elf_list)
        with pathos.multiprocessing.ProcessPool(nodes=self.cores) as pool:
            pool.map(self.process_single_file, elf_list)

    def process_single_file(self, elf_path):
        ''' Print a progress line for `elf_path`, then hand it to the
        accumulator. Does nothing once a stop has been requested. '''
        if self.stop_processing:
            return

        cur_file_count = self.next_counter()
        # The total is unknown (0) when called outside of `run`: report 0%
        # rather than dividing by zero.
        if self.elf_count:
            percent = cur_file_count / self.elf_count * 100
        else:
            percent = 0
        print('> [{}/{} {:.0f}%] {}'.format(
            cur_file_count, self.elf_count, percent, elf_path))
        self.stats_accu.process_file(elf_path)
 def gather_system_files(config):
    ''' Run a fresh StatsAccumulator over every path yielded by
    `system_elfs`, using `config.cores` pool nodes, and return that
    accumulator. A SIGINT handler is installed that asks the processor to
    stop. '''
    accumulator = StatsAccumulator()
    runner = FilesProcessor(config.cores, accumulator)

    def on_interrupt(sig, frame):
        ''' SIGINT handler: request a stop instead of dying right away '''
        print("Stopping after this ELF…")
        runner.stop_processing_now()

    signal.signal(signal.SIGINT, on_interrupt)

    # The whole list is needed up front: the processor reports progress
    # against its length.
    runner.run(list(system_elfs()))
    return accumulator
--- a/stats/pyelftools_overlay.py
+++ b/stats/pyelftools_overlay.py
@ -0,0 +1,76 @@
 """ Overlay of PyElfTools for quick access to what we want here """
 from elftools.elf.elffile import ELFFile
 from elftools.common.exceptions import ELFError, DWARFError
 import os
 def get_cfi(path):
    ''' Read the CFI entries of the ELF found at `path`.

    Returns `CFI_entries()` when the DWARF info reports `has_CFI()`, else
    `EH_CFI_entries()` when it reports `has_EH_CFI()`. Returns None when the
    file has no DWARF info, has neither kind of CFI, cannot be parsed
    (ELFError, DWARFError) or cannot be opened (PermissionError). '''
    try:
        with open(path, 'rb') as stream:
            elf = ELFFile(stream)
            if not elf.has_dwarf_info():
                return None

            dwarf = elf.get_dwarf_info()
            if dwarf.has_CFI():
                return dwarf.CFI_entries()
            if dwarf.has_EH_CFI():
                return dwarf.EH_CFI_entries()
            return None
    except (ELFError, DWARFError, PermissionError):
        return None
 def system_elfs(roots=None):
    ''' Iterator over the files found under the system library and binary
    directories.

    Directories are explored recursively. Every regular file (or symlink to
    one) is yielded once, as its canonical path: symlinks are resolved and
    the path is normalised, so that a file reachable through several links
    or through a symlinked directory is reported a single time. No check is
    made here that the file actually is an ELF.

    roots: optional iterable of directories to explore; defaults to the
    usual system library and binary directories. '''
    if roots is None:
        roots = [
            '/lib',
            '/usr/lib',
            '/usr/local/lib',
            '/bin',
            '/usr/bin',
            '/usr/local/bin',
            '/sbin',
        ]

    # Work on a copy: the exploration stack is consumed as we go.
    to_explore = list(roots)
    seen_dirs = set()
    seen_elfs = set()

    while to_explore:
        # Canonical directory names keep us from scanning twice a directory
        # reached through a symlink, and from looping on symlink cycles.
        bindir = os.path.realpath(to_explore.pop())
        if bindir in seen_dirs or not os.path.isdir(bindir):
            continue
        seen_dirs.add(bindir)

        try:
            # Read the whole listing at once so the directory handle is
            # closed before the generator is suspended on `yield`.
            with os.scandir(bindir) as scan:
                direntries = list(scan)
        except OSError:
            continue  # Unreadable directory: skip it, keep exploring

        for direntry in direntries:
            try:
                is_file = direntry.is_file()
                is_dir = not is_file and direntry.is_dir()
            except OSError:
                continue  # Entry cannot be examined: ignore it
            if is_dir:
                to_explore.append(direntry.path)
            if not is_file:
                continue

            canonical_name = os.path.realpath(direntry.path)
            if canonical_name in seen_elfs:
                continue
            seen_elfs.add(canonical_name)
            yield canonical_name
--- a/stats/requirements.txt
+++ b/stats/requirements.txt
@ -0,0 +1,2 @@
 git+https://github.com/eliben/pyelftools
 git+https://github.com/uqfoundation/pathos
--- a/stats/stats_accu.py
+++ b/stats/stats_accu.py
@ -0,0 +1,177 @@
 from elftools.dwarf import callframe
 from pyelftools_overlay import get_cfi
 from enum import Enum
 import json
 import subprocess
 import re
 from math import ceil
 class ProportionFinder:
    ''' Answers "which value sits at proportion p" questions (median, etc.)
    from a dictionary mapping each value to its occurrence count. '''

    def __init__(self, count_per_value):
        ''' Build the cumulative table: one (value, number of elements
        lower than or equal to value) pair per value, in increasing order. '''
        running_total = 0
        table = []
        for value in sorted(count_per_value.keys()):
            running_total += count_per_value[value]
            table.append((value, running_total))
        self.cumulative = table
        self.elem_count = running_total

    def find_at_proportion(self, proportion):
        ''' Return the first value whose cumulative count reaches
        ceil(elem_count * proportion) -- the last value if none does --, or
        None when the table is empty. '''
        if len(self.cumulative) == 0:
            return None

        threshold = ceil(self.elem_count * proportion)

        # Binary search over the half-open window [lo, hi)
        lo = 0
        hi = len(self.cumulative)
        while lo + 1 != hi:
            mid = (lo + hi + 1) // 2  # Middle of the window, rounded up
            if self.cumulative[mid - 1][1] < threshold:
                lo = mid
            else:
                hi = mid
        return self.cumulative[lo][0]
 def elf_so_deps(path):
    ''' List the shared objects the ELF at `path` depends on, as reported by
    `/usr/bin/ldd`.

    Only the lines of the form `name => target (0xaddress)` are used, and
    the `target` part is returned; every other line is ignored. Raises an
    Exception when ldd exits with a non-zero status. '''
    try:
        raw_output = subprocess.check_output(['/usr/bin/ldd', path])
        ldd_output = raw_output.decode('utf-8')
    except subprocess.CalledProcessError as exn:
        raise Exception(
            ("Cannot get dependencies for {}: ldd terminated with exit code "
             "{}.").format(path, exn.returncode))

    dep_pattern = re.compile(r'^.* => (.*) \(0x[0-9a-fA-F]*\)$')
    candidates = (
        dep_pattern.match(line.strip())
        for line in ldd_output.strip().split('\n')
    )
    # Lines that do not match are just ignored — it might be eg. linux-vdso
    return [found.group(1) for found in candidates if found is not None]
 class ElfType(Enum):
    ''' Kind of ELF object: library or binary. '''
    # Explicit values instead of `auto()`: `auto` is not imported in this
    # module, so using it raised a NameError as soon as the module was
    # loaded. These are the values `auto()` would have assigned.
    ELF_LIB = 1
    ELF_BINARY = 2
 class SingleFdeData:
    ''' Bundles a path, an ELF type and a data payload, plus the list of
    shared objects the file at that path depends on. '''

    def __init__(self, path, elf_type, data):
        ''' Store the three values, then collect the dependencies right
        away. '''
        self.path, self.elf_type, self.data = path, elf_type, data
        self.gather_deps()

    def gather_deps(self):
        """ Store in `deps` what `elf_so_deps` reports for `path` """
        dependencies = elf_so_deps(self.path)
        self.deps = dependencies
 class StatsAccumulator:
    ''' Accumulates statistics over the CFI entries of the processed files:
    number of ELFs, number of FDEs, number of FDE rows, and a histogram
    mapping a row count to the number of FDEs having that many rows. '''

    # Fields written by `serialize` and restored by `unserialize`
    _NOTABLE_FIELDS = (
        'elf_count',
        'fde_count',
        'fde_row_count',
        'fde_with_n_rows',
    )

    def __init__(self):
        self.elf_count = 0
        self.fde_count = 0
        self.fde_row_count = 0
        self.fde_with_n_rows = {}

    def serialize(self, path):
        ''' Save the gathered data, as JSON, to the file at `path` '''
        out = {field: getattr(self, field) for field in self._NOTABLE_FIELDS}
        # json.dump writes str objects: the file must be opened in text mode
        with open(path, 'w') as stream:
            json.dump(out, stream)

    @staticmethod
    def unserialize(path):
        ''' Load an accumulator from a file written by `serialize`.

        Only the known fields are restored; a field missing from the file
        keeps its initial value. '''
        out = StatsAccumulator()
        with open(path, 'r') as stream:
            data = json.load(stream)
        for field in StatsAccumulator._NOTABLE_FIELDS:
            if field in data:
                setattr(out, field, data[field])
        # JSON object keys are always strings: restore the integer row
        # counts so that the histogram sorts and compares numerically.
        out.fde_with_n_rows = {
            int(row_count): fde_count
            for row_count, fde_count in out.fde_with_n_rows.items()
        }
        return out

    def report(self):
        ''' Print the gathered statistics on the standard output. Figures
        that cannot be computed because no FDE was seen are shown as None. '''
        self.fde_rows_proportion = ProportionFinder(
            self.fde_with_n_rows)

        if self.fde_count:
            avg_rows = self.fde_row_count / self.fde_count
        else:
            avg_rows = None  # No FDE: avoid dividing by zero

        rows = [
            ("ELFs analyzed", self.elf_count),
            ("FDEs analyzed", self.fde_count),
            ("FDE rows analyzed", self.fde_row_count),
            ("Avg. rows per FDE", avg_rows),
            ("Median rows per FDE",
             self.fde_rows_proportion.find_at_proportion(0.5)),
            ("Max rows per FDE", max(self.fde_with_n_rows, default=None)),
        ]

        # Left-align the titles on the longest one
        title_size = max(map(lambda x: len(x[0]), rows))
        line_format = "{:<" + str(title_size + 1) + "} {}"

        for row in rows:
            print(line_format.format(row[0], row[1]))

    def process_file(self, path):
        ''' Process a single file: count it and process each of its CFI
        entries. Files for which `get_cfi` returns nothing are ignored. '''
        cfi = get_cfi(path)
        if not cfi:
            return

        self.elf_count += 1

        for entry in cfi:
            if isinstance(entry, callframe.CIE):  # Is a CIE
                self.process_cie(entry)
            elif isinstance(entry, callframe.FDE):  # Is a FDE
                self.process_fde(entry)

    def incr_cell(self, table, key):
        ''' Increments table[key], or sets it to 1 if unset '''
        table[key] = table.get(key, 0) + 1

    def process_cie(self, cie):
        ''' Process a CIE '''
        pass  # Nothing needed from a CIE

    def process_fde(self, fde):
        ''' Process a FDE: count it, and record the number of rows of its
        decoded table. '''
        self.fde_count += 1
        decoded = fde.get_decoded()
        row_count = len(decoded.table)
        self.fde_row_count += row_count
        self.incr_cell(self.fde_with_n_rows, row_count)
		`@ -0,0 +1,2 @@`
							`git+https://github.com/eliben/pyelftools`
							`git+https://github.com/uqfoundation/pathos`