stats: various modifications

2018-08-08 14:31:05 +02:00 · 2018-08-08 14:31:05 +02:00 · 6629de9a3e
commit 6629de9a3e
parent 216e442f5b
7 changed files with 369 additions and 142 deletions
--- a/stats/.gitignore
+++ b/stats/.gitignore
@ -1,2 +1,3 @@
 venv
-elf_data
+elf_data*
+gathered
--- a/stats/fde_stats.py
+++ b/stats/fde_stats.py
@ -18,6 +18,7 @@ class Config:

        elif args.feature == 'sample':
            self.size = int(args.size)
+            self.output = args.output

        elif args.feature == 'analyze':
            self.data_file = args.data_file
@ -93,9 +94,9 @@ def main():
        stats_accu = gather_stats.gather_system_files(
            config,
            sample_size=config.size)
+        stats_accu.dump(config.output)

    elif config.feature == 'analyze':
-        # TODO
        print("Not implemented", file=sys.stderr)
        stats_accu = StatsAccumulator.load(config.data_file)
        sys.exit(1)
--- a/stats/gather_stats.py
+++ b/stats/gather_stats.py
@ -1,122 +1,98 @@
+from elftools.common.exceptions import DWARFError
 from pyelftools_overlay import system_elfs, get_cfi
 from elftools.dwarf import callframe
-import multiprocessing
-import signal
+import concurrent.futures
 import random

+
 from stats_accu import \
-    StatsAccumulator, SingleFdeData, \
-    RegsList, FdeData, DwarfInstr
+    StatsAccumulator, SingleFdeData, FdeData, DwarfInstr


-class FilesProcessor(multiprocessing.Process):
-    def __init__(self, elf_list, shared_queue):
-        super().__init__()
-        self.stop_processing = False
-        self.processed_counter = 0
-        self.elf_list = elf_list
-        self.shared_queue = shared_queue
+class ProcessWrapper:
+    def __init__(self, fct):
+        self._fct = fct

-    def stop_processing_now(self):
-        self.stop_processing = True
+    def __call__(self, elf_descr):
+        try:
+            path, elftype = elf_descr

-    def run(self):
-        pos = 0
-        for descr in self.elf_list:
-            if self.stop_processing:
-                break
-            self.process_single_file(descr, pos)
-            pos += 1
+            print("Processing {}…".format(path))

-        print("=== Finished {} ===".format(self.name))
-        return 0
+            cfi = get_cfi(path)
+            if not cfi:
+                return None

-    def process_single_file(self, elf_descr, pos_in_list):
-        if self.stop_processing:
-            return
-
-        elf_path, elf_type = elf_descr
-
-        self.processed_counter += 1
-        print('[{}, {}/{}] {}'.format(
-            self.shared_queue.qsize(),
-            pos_in_list + 1,
-            len(self.elf_list),
-            elf_path))
-        self.process_file(elf_path, elf_type)
-
-    def process_file(self, path, elftype):
-        ''' Process a single file '''
-
-        cfi = get_cfi(path)
-        if not cfi:
+            return self._fct(path, elftype, cfi)
+        except DWARFError:
            return None

-        data = FdeData()

-        for entry in cfi:
-            if isinstance(entry, callframe.CIE):  # Is a CIE
-                self.process_cie(entry, data)
-            elif isinstance(entry, callframe.FDE):  # Is a FDE
-                self.process_fde(entry, data)
+def process_wrapper(fct):
+    ''' Decorator-style helper: wrap `fct` into a `ProcessWrapper` '''
+    wrapped = ProcessWrapper(fct)
+    return wrapped

-        out = SingleFdeData(path, elftype, data)
-        self.shared_queue.put(out)

-    def incr_cell(self, table, key):
-        ''' Increments table[key], or sets it to 1 if unset '''
-        if key in table:
-            table[key] += 1
+def _process_elf(path, elftype, cfi):
+    ''' Process a single file: feed every CIE and FDE found in `cfi` to
+    `process_cie` / `process_fde`, and return the gathered data as a
+    `SingleFdeData` for the ELF at `path` '''
+
+    data = FdeData()
+
+    for entry in cfi:
+        if isinstance(entry, callframe.CIE):  # Is a CIE
+            process_cie(entry, data)
+        elif isinstance(entry, callframe.FDE):  # Is a FDE
+            process_fde(entry, data)
+
+    return SingleFdeData(path, elftype, data)
+
+
+# Wrapped by plain assignment instead of `@process_wrapper`: functions are
+# pickled by qualified name, so the wrapped function must stay reachable under
+# its own module-level name for the wrapper to be sent to worker processes.
+process_elf = process_wrapper(_process_elf)
+
+
+def incr_cell(table, key):
+    ''' Adds one to table[key], treating a missing key as 0 '''
+    table[key] = table.get(key, 0) + 1
+
+
+def process_cie(cie, data):
+    ''' Process a CIE: intentionally a no-op, nothing is needed from a CIE '''
+    return None
+
+
+def process_fde(fde, data):
+    ''' Account for one FDE in `data`: bump the FDE count, record its number
+    of table rows, then process the CFA and every register column of each
+    row '''
+    data.fde_count += 1
+
+    table = fde.get_decoded().table
+    incr_cell(data.fde_with_lines, len(table))
+
+    for row in table:
+        process_reg(data.regs.cfa, row['cfa'])
+        # Register columns are the integer keys of the row
+        for column in row:
+            if isinstance(column, int):
+                process_reg(data.regs.regs[column], row[column])
+
+
+def process_reg(out_reg, reg_def):
+    ''' Process a register '''
+    if isinstance(reg_def, callframe.CFARule):
+        if reg_def.reg is not None:
+            out_reg.regs[reg_def.reg] += 1
        else:
-            table[key] = 1
-
-    def process_cie(self, cie, data):
-        ''' Process a CIE '''
-        pass  # Nothing needed from a CIE
-
-    def process_fde(self, fde, data):
-        ''' Process a FDE '''
-        data.fde_count += 1
-
-        decoded = fde.get_decoded()
-        row_count = len(decoded.table)
-        self.incr_cell(data.fde_with_lines, row_count)
-
-        for row in decoded.table:
-            self.process_reg(data.regs.cfa, row['cfa'])
-            for entry in row:
-                if isinstance(entry, int):
-                    self.process_reg(data.regs.regs[entry], row[entry])
-
-    def process_reg(self, out_reg, reg_def):
-        ''' Process a register '''
-        if isinstance(reg_def, callframe.CFARule):
-            if reg_def.reg is not None:
-                out_reg.regs[reg_def.reg] += 1
-            else:
-                pass  # TODO exprs
-        else:
-            self.incr_cell(out_reg.instrs, DwarfInstr.of_pyelf(reg_def.type))
-            if reg_def.type == callframe.RegisterRule.REGISTER:
-                out_reg.regs[reg_def.arg] += 1
-            elif (reg_def.type == callframe.RegisterRule.EXPRESSION) \
-                    or (reg_def.type == callframe.RegisterRule.VAL_EXPRESSION):
-                pass  # TODO exprs
+            pass  # TODO exprs
+    else:
+        incr_cell(out_reg.instrs, DwarfInstr.of_pyelf(reg_def.type))
+        if reg_def.type == callframe.RegisterRule.REGISTER:
+            out_reg.regs[reg_def.arg] += 1
+        elif (reg_def.type == callframe.RegisterRule.EXPRESSION) \
+                or (reg_def.type == callframe.RegisterRule.VAL_EXPRESSION):
+            pass  # TODO exprs


 def gather_system_files(config, sample_size=None):
    stats_accu = StatsAccumulator()
-    processors = []
-
-    def signal_graceful_exit(sig, frame):
-        ''' Stop gracefully now '''
-        nonlocal processors
-
-        print("Stopping after this ELF…")
-        for processor in processors:
-            processor.stop_processing_now()
-
-    signal.signal(signal.SIGINT, signal_graceful_exit)

    elf_list = []
    for elf_path in system_elfs():
@ -126,46 +102,46 @@ def gather_system_files(config, sample_size=None):
        elf_list_sampled = random.sample(elf_list, sample_size)
        elf_list = elf_list_sampled

-    elf_count = len(elf_list)
-    elf_per_process = elf_count // config.cores
-    elf_list_slices = []
-    for i in range(config.cores - 1):
-        elf_list_slices.append(
-            elf_list[i * elf_per_process : (i+1) * elf_per_process])
-    elf_list_slices.append(
-        elf_list[(config.cores - 1) * elf_per_process
-                 : config.cores * elf_per_process])
-
-    shared_queue = multiprocessing.Queue(elf_count)
-
-    for elf_range in elf_list_slices:
-        processors.append(FilesProcessor(elf_range, shared_queue))
-
    if config.cores > 1:
-        for processor in processors:
-            processor.start()
-
-        while True:
-            for processor in processors:
-                if processor.is_alive():
-                    print("== Waiting {} ({} {}) ==".format(
-                        processor.name, processor.exitcode,
-                        processor.is_alive()))
-                    processor.join(timeout=1)
-                    if processor.exitcode is None:
-                        break  # Loop around
-                print("== Joined {} ==".format(processor.name))
-
-            terminated = True
-            for processor in processors:
-                if processor.exitcode is None:
-                    terminated = False
-            if terminated:
-                break
+        with concurrent.futures.ProcessPoolExecutor(max_workers=config.cores)\
+                as executor:
+            for fde in executor.map(process_elf, elf_list):
+                stats_accu.add_fde(fde)
    else:
-        processors[0].run()  # run(), not start(): in the same thread
-
-    while not shared_queue.empty():  # Reliable because everything is joined
-        stats_accu.add_fde(shared_queue.get_nowait())
+        for elf in elf_list:
+            stats_accu.add_fde(process_elf(elf))

    return stats_accu
+
+
+def map_system_files(mapper, sample_size=None, cores=None, include=None,
+                     elflist=None):
+    ''' Apply `mapper` to a list of ELF files.
+
+    `mapper` must take (path, elf_type, cfi); it is wrapped with
+    `process_wrapper` before being applied.
+
+    If `elflist` is given, it is used as is. Otherwise, the list is made of
+    the system ELFs (sampled down to `sample_size` if provided), followed by
+    every path of `include`, paired with a `None` ELF type.
+
+    Runs on `cores` processes (default 1, in the current process).
+
+    Returns `(out, elf_list)`: an iterable of the results, and the list of
+    ELF descriptors that was processed. '''
+    cores = 1 if cores is None else cores
+    include = [] if include is None else include
+
+    wrapped = process_wrapper(mapper)
+
+    if elflist is not None:
+        elf_list = elflist
+    else:
+        elf_list = list(system_elfs())
+
+        if sample_size is not None:
+            elf_list = random.sample(elf_list, sample_size)
+
+        elf_list += [(path, None) for path in include]
+
+    if cores > 1:
+        with concurrent.futures.ProcessPoolExecutor(max_workers=cores)\
+                as executor:
+            out = executor.map(wrapped, elf_list)
+    else:
+        out = map(wrapped, elf_list)
+
+    return out, elf_list
--- a/stats/helpers.py
+++ b/stats/helpers.py
@ -0,0 +1,228 @@
+from elftools.dwarf import callframe
+import gather_stats
+import itertools
+import functools
+
+# Register names, indexed by register ID
+# NOTE(review): presumably the x86_64 DWARF register numbering -- confirm
+ID_TO_REG = [
+    'RAX', 'RDX', 'RCX', 'RBX',
+    'RSI', 'RDI', 'RBP', 'RSP',
+    'R8', 'R9', 'R10', 'R11',
+    'R12', 'R13', 'R14', 'R15',
+    'RIP',
+]
+
+# Reverse mapping: register name -> register ID
+REGS_IDS = {name: reg_id for reg_id, name in enumerate(ID_TO_REG)}
+
+# IDs of the registers considered as handled
+HANDLED_REGS = [REGS_IDS[name] for name in ('RIP', 'RSP', 'RBP', 'RBX')]
+
+ONLY_HANDLED_REGS = True  # only analyze the columns of handled registers
+
+# The one fixed expression accepted as handled by `is_handled_expr`
+PLT_EXPR = [119, 8, 128, 0, 63, 26, 59, 42, 51, 36, 34]
+
+
+def accumulate_regs(reg_list):
+    ''' Sum position-wise the lists of `reg_list` into a list of 17
+    counters '''
+    totals = [0] * 17
+    for counts in reg_list:
+        for reg_id, count in enumerate(counts):
+            totals[reg_id] += count
+
+    return totals
+
+
+def filter_none(lst):
+    ''' Lazily yield the elements of `lst` that are truthy (this drops
+    `None`, but also any other falsy value) '''
+    yield from filter(None, lst)
+
+
+def deco_filter_none(fct):
+    ''' Decorator: the decorated function receives its single iterable
+    argument stripped of its falsy elements (see `filter_none`) '''
+    @functools.wraps(fct)  # keep the name and docstring of `fct`
+    def wrap(lst):
+        return fct(filter_none(lst))
+    return wrap
+
+
+class FdeProcessor:
+    ''' Callable applying a function to every FDE of a CFI, optionally
+    reducing the results to a single value '''
+
+    def __init__(self, fct, reducer=None):
+        ''' `fct` is called as fct(path, fde, decoded_fde); `reducer`, if any,
+        takes the list of results of `fct` '''
+        self._fct = fct
+        self._reducer = reducer
+
+    def __call__(self, path, elftype, cfi):
+        ''' Return the list of the results of `fct` on each FDE of `cfi`. If
+        a reducer is set and there are at least two results, return instead
+        a one-element list holding the reduced value. '''
+        results = [
+            self._fct(path, entry, entry.get_decoded())
+            for entry in cfi
+            if isinstance(entry, callframe.FDE)
+        ]
+        if self._reducer is None or len(results) < 2:
+            return results
+        return [self._reducer(results)]
+
+
+class FdeProcessorReduced:
+    ''' Decorator factory: holds a reducer, and turns the function it is
+    called on into a `FdeProcessor` using this reducer '''
+
+    def __init__(self, reducer):
+        self._reducer = reducer
+
+    def __call__(self, fct):
+        processor = FdeProcessor(fct, self._reducer)
+        return processor
+
+
+def fde_processor(fct):
+    ''' Decorator: turn `fct` into a `FdeProcessor`, without reducer '''
+    processor = FdeProcessor(fct)
+    return processor
+
+
+def fde_processor_reduced(reducer):
+    ''' Decorator with argument: build a `FdeProcessorReduced` that will
+    wrap the decorated function into a `FdeProcessor` using `reducer` '''
+    decorator = FdeProcessorReduced(reducer)
+    return decorator
+
+
+def is_handled_expr(expr):
+    ''' Tell whether `expr` (a list of bytes) is handled: it is either
+    exactly `PLT_EXPR`, or a two-bytes expression whose first byte lies in
+    0x70..0x89 and designates, relative to 0x70, a handled register '''
+    if expr == PLT_EXPR:
+        return True
+
+    # NOTE(review): 0x70 looks like the first DW_OP_breg opcode -- confirm
+    in_opcode_range = len(expr) == 2 and 0x70 <= expr[0] <= 0x89
+    return in_opcode_range and (expr[0] - 0x70) in HANDLED_REGS
+
+
+# @fde_processor
+def find_non_cfa(path, fde, decoded):
+    ''' Count, over every row of the decoded FDE `decoded`, the rules seen
+    and those that are not handled.
+
+    Returns a tuple `(regs_seen, non_handled_regs, non_handled_exp,
+    rule_type, cfa_dat, problematic_paths)`, where `rule_type` maps each
+    register rule to its number of occurrences, `cfa_dat` is
+    `[CFA entries seen, CFA entries defined by an expression]`, and
+    `problematic_paths` holds `path` as soon as something was not handled.
+
+    Side effect: every non-handled register expression is appended to
+    `/tmp/exprs`, along with `path` and the offset of `fde`. '''
+    rules = callframe.RegisterRule
+    offset_rules = (rules.OFFSET, rules.VAL_OFFSET)
+    expr_rules = (rules.EXPRESSION, rules.VAL_EXPRESSION)
+
+    regs_seen = 0
+    non_handled_regs = 0
+    non_handled_exp = 0
+    cfa_dat = [0, 0]  # Seen, expr
+    rule_type = {
+        rule: 0 for rule in (
+            rules.UNDEFINED,
+            rules.SAME_VALUE,
+            rules.OFFSET,
+            rules.VAL_OFFSET,
+            rules.REGISTER,
+            rules.EXPRESSION,
+            rules.VAL_EXPRESSION,
+            rules.ARCHITECTURAL,
+        )
+    }
+    problematic_paths = set()
+
+    for row in decoded.table:
+        for entry in row:
+            reg_def = row[entry]
+
+            if entry == 'cfa':
+                cfa_dat[0] += 1
+                if reg_def.expr:
+                    cfa_dat[1] += 1
+                    if not is_handled_expr(reg_def.expr):
+                        non_handled_exp += 1
+                        problematic_paths.add(path)
+                elif reg_def and reg_def.reg not in HANDLED_REGS:
+                    non_handled_regs += 1
+                    problematic_paths.add(path)
+
+            # Only integer keys are register columns (others: CFA or PC);
+            # among those, optionally keep only the handled registers
+            if not isinstance(entry, int) \
+                    or (ONLY_HANDLED_REGS and entry not in HANDLED_REGS):
+                continue
+
+            reg_rule = reg_def.type
+            rule_type[reg_rule] += 1
+
+            if reg_rule in offset_rules:
+                regs_seen += 1  # offset-based rule
+            elif reg_rule == rules.REGISTER:
+                regs_seen += 1
+                if reg_def.arg not in HANDLED_REGS:
+                    problematic_paths.add(path)
+                    non_handled_regs += 1
+            elif reg_rule in expr_rules:
+                expr = reg_def.arg
+                if not is_handled_expr(expr):
+                    problematic_paths.add(path)
+                    with open('/tmp/exprs', 'a') as handle:
+                        handle.write('[{} - {}] {}\n'.format(
+                            path, fde.offset, ', '.join(map(hex, expr))))
+                    non_handled_exp += 1
+
+    return (regs_seen, non_handled_regs, non_handled_exp, rule_type, cfa_dat,
+            problematic_paths)
+
+
+def reduce_non_cfa(lst):
+    ''' Merge an iterable of result tuples shaped like `find_non_cfa`'s
+    `(regs, non_handled_regs, non_handled_exp, rule_type, cfa_dat, paths)`
+    into a single such tuple: counters are summed, dicts and lists are
+    summed member-wise, and path sets are united.
+
+    The input tuples are left untouched. A single-element input is returned
+    as is; an empty input raises `TypeError` (from `functools.reduce`). '''
+
+    def merge_dict(d1, d2):
+        # Build a fresh dict: updating `d1` in place would silently alter
+        # the first element of `lst`, which seeds the accumulator
+        return {key: d1[key] + d2[key] for key in d1}
+
+    def merge_list(l1, l2):
+        # Implicit assumption len(l1) == len(l2)
+        return [x + y for x, y in zip(l1, l2)]
+
+    def merge_elts(accu, elt):
+        accu_regs, accu_nh, accu_exp, accu_rt, accu_cfa, accu_paths = accu
+        elt_regs, elt_nh, elt_exp, elt_rt, elt_cfa, elt_paths = elt
+        return (
+            accu_regs + elt_regs,
+            accu_nh + elt_nh,
+            accu_exp + elt_exp,
+            merge_dict(accu_rt, elt_rt),
+            merge_list(accu_cfa, elt_cfa),
+            accu_paths.union(elt_paths),
+        )
+
+    return functools.reduce(merge_elts, lst)
+
+
+@deco_filter_none
+def flatten_non_cfa(result):
+    ''' Flatten an iterable of lists of `find_non_cfa`-shaped results, merge
+    them all with `reduce_non_cfa`, and return the summary tuple:
+
+    `(regs_seen,
+      (non handled regs, regs_seen + CFA entries without expression),
+      (non handled expressions, EXPRESSION rules + CFA expressions),
+      rule_type, CFA summary dict, problematic paths)` '''
+    merged = reduce_non_cfa(itertools.chain.from_iterable(result))
+    regs_seen, nh_regs, nh_exprs, rule_types, cfa_counts, paths = merged
+
+    cfa_summary = {
+        'seen': cfa_counts[0],
+        'expr': cfa_counts[1],
+        'offset': cfa_counts[0] - cfa_counts[1],
+    }
+    return (
+        regs_seen,
+        (nh_regs, regs_seen + cfa_summary['offset']),
+        (nh_exprs, rule_types['EXPRESSION'] + cfa_summary['expr']),
+        rule_types,
+        cfa_summary,
+        paths,
+    )
--- a/stats/pyelftools_overlay.py
+++ b/stats/pyelftools_overlay.py
@ -6,6 +6,11 @@ from stats_accu import ElfType
 import os


+ELF_BLACKLIST = [
+    '/usr/lib/libavcodec.so',
+]
+
+
 def get_cfi(path):
    ''' Get the CFI entries from the ELF at the provided path '''

@ -14,6 +19,7 @@ def get_cfi(path):
            elf_file = ELFFile(file_handle)

            if not elf_file.has_dwarf_info():
+                print("No DWARF")
                return None

            dw_info = elf_file.get_dwarf_info()
@ -22,12 +28,19 @@ def get_cfi(path):
            elif dw_info.has_EH_CFI():
                cfis = dw_info.EH_CFI_entries()
            else:
+                print("No CFI")
                return None
    except ELFError:
+        print("ELF Error")
        return None
    except DWARFError:
+        print("DWARF Error")
        return None
    except PermissionError:
+        print("Permission Error")
+        return None
+    except KeyError:
+        print("Key Error")
        return None

    return cfis
@ -70,6 +83,9 @@ def system_elfs():
                continue

            canonical_name = readlink_rec(direntry.path)
+            for blacked in ELF_BLACKLIST:
+                if canonical_name.startswith(blacked):
+                    continue
            if canonical_name in seen_elfs:
                continue

@ -79,10 +95,16 @@ def system_elfs():
                    magic_bytes = handle.read(4)
                    if magic_bytes != b'\x7fELF':
                        valid_elf = False
+                    elf_class = handle.read(1)
+                    if elf_class != b'\x02':  # ELF64
+                        valid_elf = False
            except Exception:
                continue
            if not valid_elf:
                continue

+            if not os.path.isfile(canonical_name):
+                continue
+
            seen_elfs.add(canonical_name)
            yield (canonical_name, elftype)
--- a/stats/requirements.txt
+++ b/stats/requirements.txt
@ -1,2 +1 @@
 git+https://github.com/eliben/pyelftools
-git+https://github.com/uqfoundation/pathos
--- a/stats/stats_accu.py
+++ b/stats/stats_accu.py
@ -239,7 +239,8 @@ class StatsAccumulator:
        self.fdes = []

    def add_fde(self, fde_data):
-        self.fdes.append(fde_data)
+        if fde_data:
+            self.fdes.append(fde_data)

    def get_fdes(self):
        return self.fdes
@ -250,7 +251,6 @@ class StatsAccumulator:

    def dump(self, path):
        dict_form = [fde.dump() for fde in self.fdes]
-        print(dict_form)
        with open(path, 'w') as handle:
            handle.write(json.dumps(dict_form))