From 3cb2c508a08ca45fc358cc28f523ff63bc3e846e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Tue, 10 Jul 2018 14:41:33 +0200
Subject: [PATCH] Add tentative WIP stats module

---
 stats/.gitignore            |   2 +
 stats/README.md             |  11 +++
 stats/__init__.py           |   0
 stats/fde_stats.py          |  82 +++++++++++++++++
 stats/gather_stats.py       |  60 ++++++++++++
 stats/pyelftools_overlay.py |  76 ++++++++++++++++
 stats/requirements.txt      |   2 +
 stats/stats_accu.py         | 179 ++++++++++++++++++++++++++++++++++++
 8 files changed, 412 insertions(+)
 create mode 100644 stats/.gitignore
 create mode 100644 stats/README.md
 create mode 100644 stats/__init__.py
 create mode 100755 stats/fde_stats.py
 create mode 100644 stats/gather_stats.py
 create mode 100644 stats/pyelftools_overlay.py
 create mode 100644 stats/requirements.txt
 create mode 100644 stats/stats_accu.py

diff --git a/stats/.gitignore b/stats/.gitignore
new file mode 100644
index 0000000..7a12b44
--- /dev/null
+++ b/stats/.gitignore
@@ -0,0 +1,2 @@
+venv
+elf_data
diff --git a/stats/README.md b/stats/README.md
new file mode 100644
index 0000000..33fef94
--- /dev/null
+++ b/stats/README.md
@@ -0,0 +1,11 @@
+# Statistical scripts
+
+Computes statistics about the call frame information (CFI) of system ELFs.
+
+## Setup
+
+```sh
+virtualenv -p python3 venv        # Do this only once
+source venv/bin/activate          # Do this in every new shell running these scripts
+pip install -r requirements.txt   # Do this only once
+```
diff --git a/stats/__init__.py b/stats/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/stats/fde_stats.py b/stats/fde_stats.py
new file mode 100755
index 0000000..38d384c
--- /dev/null
+++ b/stats/fde_stats.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+
+import gather_stats
+
+import argparse
+import sys
+
+
+class Config:
+    def __init__(self):
+        args = self.parse_args()
+        self._cores = args.cores
+        self.feature = args.feature
+
+        if args.feature == 'gather':
+            self.output = args.output
+
+        elif args.feature == 'analyze':
+            self.data_file = args.data_file
+
+    @property
+    def cores(self):
+        if self._cores <= 0:
+            return None
+        return self._cores
+
+    def parse_args(self):
+        parser = argparse.ArgumentParser(
+            description="Gather statistics about system-related ELFs")
+
+        parser.add_argument('--cores', '-j', default=1, type=int,
+                            help=("Use N cores for processing. Defaults to "
+                                  "1. Use 0 to use all available cores."))
+
+        subparsers = parser.add_subparsers(help='Subcommands')
+
+        # Gather stats
+        parser_gather = subparsers.add_parser(
+            'gather',
+            help=('Gather system data into a file, to allow multiple '
+                  'analyses without re-scanning the whole system.'))
+        parser_gather.set_defaults(feature='gather')
+        parser_gather.add_argument('--output', '-o',
+                                   default='elf_data',
+                                   help=('Output data to this file. Defaults '
+                                         'to "elf_data".'))
+
+        # Analyze stats
+        parser_analyze = subparsers.add_parser(
+            'analyze',
+            help='Analyze data gathered by a previous run.')
+        parser_analyze.set_defaults(feature='analyze')
+        parser_analyze.add_argument('data_file', nargs='?',
+                                    default='elf_data',
+                                    help=('Analyze this data file. Defaults '
+                                          'to "elf_data".'))
+        # TODO histogram?
+
+        out = parser.parse_args()
+        if 'feature' not in out:
+            print("No subcommand specified.", file=sys.stderr)
+            parser.print_usage(file=sys.stderr)
+            sys.exit(1)
+
+        return out
+
+
+def main():
+    config = Config()
+
+    if config.feature == 'gather':
+        stats_accu = gather_stats.gather_system_files(config)
+        stats_accu.serialize(config.output)
+
+    elif config.feature == 'analyze':
+        # TODO
+        print("Not implemented", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
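The `analyze` subcommand above is still a stub (it prints `Not implemented`).
A minimal sketch of what its branch of `main()` could look like, reusing
`StatsAccumulator.unserialize` and `report` from `stats_accu.py` below; this
wiring is an assumption, not part of the patch:

```python
# Hypothetical 'analyze' branch for main() in fde_stats.py.
# Assumes config.data_file was produced by a previous 'gather' run.
from stats_accu import StatsAccumulator

stats_accu = StatsAccumulator.unserialize(config.data_file)
stats_accu.report()  # Print the aggregated FDE statistics
```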
diff --git a/stats/gather_stats.py b/stats/gather_stats.py
new file mode 100644
index 0000000..cc63197
--- /dev/null
+++ b/stats/gather_stats.py
@@ -0,0 +1,60 @@
+from pyelftools_overlay import system_elfs
+import pathos.multiprocessing
+import signal
+import itertools
+
+from stats_accu import StatsAccumulator
+
+
+class FilesProcessor:
+    def __init__(self, cores, stats_accu=None):
+        self.stop_processing = False
+        self._processed_counter = itertools.count()
+        self.cores = cores
+
+        if stats_accu is None:
+            stats_accu = StatsAccumulator()
+        self.stats_accu = stats_accu
+
+    def stop_processing_now(self):
+        self.stop_processing = True
+
+    def next_counter(self):
+        return next(self._processed_counter)
+
+    def run(self, elf_list):
+        self.elf_count = len(elf_list)
+        with pathos.multiprocessing.ProcessPool(nodes=self.cores) as pool:
+            pool.map(self.process_single_file, elf_list)
+
+    def process_single_file(self, elf_path):
+        if self.stop_processing:
+            return
+
+        cur_file_count = self.next_counter()
+        print('> [{}/{} {:.0f}%] {}'.format(
+            cur_file_count, self.elf_count,
+            cur_file_count / self.elf_count * 100, elf_path))
+        self.stats_accu.process_file(elf_path)
+
+
+def gather_system_files(config):
+    stats_accu = StatsAccumulator()
+    processor = FilesProcessor(config.cores, stats_accu)
+
+    def signal_graceful_exit(sig, frame):
+        ''' Request a graceful stop after the current ELF '''
+        nonlocal processor
+
+        print("Stopping after this ELF…")
+        processor.stop_processing_now()
+
+    signal.signal(signal.SIGINT, signal_graceful_exit)
+
+    elf_list = []
+    for elf_path in system_elfs():
+        elf_list.append(elf_path)
+
+    processor.run(elf_list)
+
+    return stats_accu
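A caveat on the pool above: `ProcessPool.map` runs `process_single_file` in
separate worker processes, so each worker updates its own copy of
`stats_accu` (and of the `stop_processing` flag), and the accumulator handed
back by `gather_system_files` is never filled in by the workers. One possible
shape of a fix is sketched below: workers return a per-file accumulator and
the parent merges them. `StatsAccumulator.merge` is hypothetical here and
would still have to be written:

```python
# Sketch under the assumption that a StatsAccumulator.merge(other)
# helper exists (it does not in this patch): workers return fresh
# accumulators instead of mutating state that never leaves the worker.
def process_single_file(self, elf_path):
    accu = StatsAccumulator()
    accu.process_file(elf_path)  # Runs in the worker process
    return accu

def run(self, elf_list):
    self.elf_count = len(elf_list)
    with pathos.multiprocessing.ProcessPool(nodes=self.cores) as pool:
        for partial in pool.map(self.process_single_file, elf_list):
            self.stats_accu.merge(partial)  # Parent-side aggregation
```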
diff --git a/stats/pyelftools_overlay.py b/stats/pyelftools_overlay.py
new file mode 100644
index 0000000..76c7d04
--- /dev/null
+++ b/stats/pyelftools_overlay.py
@@ -0,0 +1,76 @@
+""" Overlay of pyelftools for quick access to what we want here """
+
+from elftools.elf.elffile import ELFFile
+from elftools.common.exceptions import ELFError, DWARFError
+import os
+
+
+def get_cfi(path):
+    ''' Get the CFI entries from the ELF at the provided path '''
+
+    try:
+        with open(path, 'rb') as file_handle:
+            elf_file = ELFFile(file_handle)
+
+            if not elf_file.has_dwarf_info():
+                return None
+
+            dw_info = elf_file.get_dwarf_info()
+            if dw_info.has_CFI():
+                cfis = dw_info.CFI_entries()
+            elif dw_info.has_EH_CFI():
+                cfis = dw_info.EH_CFI_entries()
+            else:
+                return None
+    except ELFError:
+        return None
+    except DWARFError:
+        return None
+    except PermissionError:
+        return None
+
+    return cfis
+
+
+def system_elfs():
+    ''' Iterator over system binaries and libraries '''
+
+    def readlink_rec(path):
+        if not os.path.islink(path):
+            return path
+
+        return readlink_rec(
+            os.path.join(os.path.dirname(path),
+                         os.readlink(path)))
+
+    sysbin_dirs = [
+        '/lib',
+        '/usr/lib',
+        '/usr/local/lib',
+        '/bin',
+        '/usr/bin',
+        '/usr/local/bin',
+        '/sbin',
+    ]
+    to_explore = sysbin_dirs
+
+    seen_elfs = set()
+
+    while to_explore:
+        bindir = to_explore.pop()
+
+        if not os.path.isdir(bindir):
+            continue
+
+        for direntry in os.scandir(bindir):
+            if not direntry.is_file():
+                if direntry.is_dir():
+                    to_explore.append(direntry.path)
+                continue
+
+            canonical_name = readlink_rec(direntry.path)
+            if canonical_name in seen_elfs:
+                continue
+
+            seen_elfs.add(canonical_name)
+            yield canonical_name
diff --git a/stats/requirements.txt b/stats/requirements.txt
new file mode 100644
index 0000000..545f4bd
--- /dev/null
+++ b/stats/requirements.txt
@@ -0,0 +1,2 @@
+git+https://github.com/eliben/pyelftools
+git+https://github.com/uqfoundation/pathos
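Both helpers above are importable on their own; a small standalone check of
`get_cfi` on a single binary (the `/bin/ls` path is only an illustration):

```python
# Count CIE and FDE entries in one ELF, using the overlay above.
from elftools.dwarf import callframe
from pyelftools_overlay import get_cfi

cfi = get_cfi('/bin/ls')  # Any ELF path will do
if cfi is not None:
    cies = [entry for entry in cfi if isinstance(entry, callframe.CIE)]
    fdes = [entry for entry in cfi if isinstance(entry, callframe.FDE)]
    print('{} CIEs, {} FDEs'.format(len(cies), len(fdes)))
```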
diff --git a/stats/stats_accu.py b/stats/stats_accu.py
new file mode 100644
index 0000000..3a2995c
--- /dev/null
+++ b/stats/stats_accu.py
@@ -0,0 +1,179 @@
+from elftools.dwarf import callframe
+from pyelftools_overlay import get_cfi
+from enum import Enum, auto
+import json
+import subprocess
+import re
+
+from math import ceil
+
+
+class ProportionFinder:
+    ''' Finds quantile values (median, etc.) on a dictionary mapping a value
+    to its occurrence count '''
+
+    def __init__(self, count_per_value):
+        self.cumulative = []
+        prev_count = 0
+        for key in sorted(count_per_value.keys()):
+            n_count = prev_count + count_per_value[key]
+            self.cumulative.append((key, n_count))
+            prev_count = n_count
+
+        self.elem_count = prev_count
+
+    def find_at_proportion(self, proportion):
+        if not self.cumulative:  # Empty list
+            return None
+
+        low_bound = ceil(self.elem_count * proportion)
+
+        def binsearch(beg, end):
+            med = ceil((beg + end) / 2)
+
+            if beg + 1 == end:
+                return self.cumulative[beg][0]
+
+            if self.cumulative[med - 1][1] < low_bound:
+                return binsearch(med, end)
+            return binsearch(beg, med)
+
+        return binsearch(0, len(self.cumulative))
+
+
+def elf_so_deps(path):
+    ''' Get the list of shared object dependencies of the given ELF object.
+    This is obtained by running `ldd`. '''
+
+    deps_list = []
+
+    try:
+        ldd_output = subprocess.check_output(['/usr/bin/ldd', path]) \
+            .decode('utf-8')
+        ldd_re = re.compile(r'^.* => (.*) \(0x[0-9a-fA-F]*\)$')
+
+        ldd_lines = ldd_output.strip().split('\n')
+        for line in ldd_lines:
+            line = line.strip()
+            match = ldd_re.match(line)
+            if match is None:
+                continue  # Ignore this line: it may be e.g. linux-vdso
+            deps_list.append(match.group(1))
+
+        return deps_list
+
+    except subprocess.CalledProcessError as exn:
+        raise Exception(
+            ("Cannot get dependencies for {}: ldd terminated with exit code "
+             "{}.").format(path, exn.returncode))
+
+
+class ElfType(Enum):
+    ELF_LIB = auto()
+    ELF_BINARY = auto()
+
+
+class SingleFdeData:
+    def __init__(self, path, elf_type, data):
+        self.path = path
+        self.elf_type = elf_type
+        self.data = data
+
+        self.gather_deps()
+
+    def gather_deps(self):
+        """ Collect ldd data on the binary """
+        self.deps = elf_so_deps(self.path)
+
+
+class StatsAccumulator:
+    def __init__(self):
+        self.elf_count = 0
+        self.fde_count = 0
+        self.fde_row_count = 0
+        self.fde_with_n_rows = {}
+
+    def serialize(self, path):
+        ''' Save the gathered data as JSON to the file at `path` '''
+
+        notable_fields = [
+            'elf_count',
+            'fde_count',
+            'fde_row_count',
+            'fde_with_n_rows',
+        ]
+        out = {}
+        for field in notable_fields:
+            out[field] = self.__dict__[field]
+
+        with open(path, 'w') as stream:
+            json.dump(out, stream)
+
+    @staticmethod
+    def unserialize(path):
+        out = StatsAccumulator()
+        with open(path, 'r') as stream:
+            data = json.load(stream)
+        for field in data:
+            setattr(out, field, data[field])
+        # JSON stringifies int dict keys; map them back to ints
+        out.fde_with_n_rows = {
+            int(key): count for key, count in out.fde_with_n_rows.items()}
+        return out
+
+    def report(self):
+        ''' Report on the statistics gathered '''
+
+        fde_rows_proportion = ProportionFinder(
+            self.fde_with_n_rows)
+
+        rows = [
+            ("ELFs analyzed", self.elf_count),
+            ("FDEs analyzed", self.fde_count),
+            ("FDE rows analyzed", self.fde_row_count),
+            ("Avg. rows per FDE", self.fde_row_count / self.fde_count),
+            ("Median rows per FDE",
+             fde_rows_proportion.find_at_proportion(0.5)),
+            ("Max rows per FDE", max(self.fde_with_n_rows.keys())),
+        ]
+
+        title_size = max(len(row[0]) for row in rows)
+        line_format = "{:<" + str(title_size + 1) + "} {}"
+
+        for row in rows:
+            print(line_format.format(row[0], row[1]))
+
+    def process_file(self, path):
+        ''' Process a single ELF file '''
+
+        cfi = get_cfi(path)
+        if not cfi:
+            return
+
+        self.elf_count += 1
+
+        for entry in cfi:
+            if isinstance(entry, callframe.CIE):
+                self.process_cie(entry)
+            elif isinstance(entry, callframe.FDE):
+                self.process_fde(entry)
+
+    def incr_cell(self, table, key):
+        ''' Increment table[key], or set it to 1 if unset '''
+        if key in table:
+            table[key] += 1
+        else:
+            table[key] = 1
+
+    def process_cie(self, cie):
+        ''' Process a CIE '''
+        pass  # Nothing needed from a CIE
+
+    def process_fde(self, fde):
+        ''' Process an FDE '''
+        self.fde_count += 1
+
+        decoded = fde.get_decoded()
+        row_count = len(decoded.table)
+        self.fde_row_count += row_count
+        self.incr_cell(self.fde_with_n_rows, row_count)
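`ProportionFinder` computes quantiles directly on the histogram-shaped
`fde_with_n_rows` mapping, without flattening it into one entry per FDE. A
quick worked example with made-up counts:

```python
from stats_accu import ProportionFinder

# 3 FDEs with 1 row, 2 with 4 rows, 1 with 10 rows: 6 values in all,
# i.e. the sorted flat list would be [1, 1, 1, 4, 4, 10].
finder = ProportionFinder({1: 3, 4: 2, 10: 1})
print(finder.find_at_proportion(0.5))  # -> 1  (3rd of 6 values)
print(finder.find_at_proportion(0.9))  # -> 10 (6th of 6 values)
```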