From 216e442f5bcb3823e93b872f45875b54522ac67e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Tue, 17 Jul 2018 11:36:56 +0200
Subject: [PATCH] Tentative progress in stats

Run per-file CFI extraction in multiprocessing workers that push their
results onto a shared queue, tag each ELF with its type (library or
binary), make the gathered statistics JSON-serializable, and add a
`sample` subcommand gathering stats on a random subset of files.

---
 stats/fde_stats.py          |  27 +++-
 stats/gather_stats.py       | 173 +++++++++++++++++++----
 stats/pyelftools_overlay.py |  32 +++--
 stats/stats_accu.py         | 259 ++++++++++++++++++++++------------
 4 files changed, 364 insertions(+), 127 deletions(-)

diff --git a/stats/fde_stats.py b/stats/fde_stats.py
index 38d384c..05bad0d 100755
--- a/stats/fde_stats.py
+++ b/stats/fde_stats.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 
+from stats_accu import StatsAccumulator
 import gather_stats
 import argparse
 
@@ -15,6 +16,10 @@ class Config:
         if args.feature == 'gather':
             self.output = args.output
 
+        elif args.feature == 'sample':
+            self.size = int(args.size)
+            self.output = args.output
+
         elif args.feature == 'analyze':
             self.data_file = args.data_file
 
@@ -34,6 +39,19 @@ class Config:
 
         subparsers = parser.add_subparsers(help='Subcommands')
 
+        # Sample stats
+        parser_sample = subparsers.add_parser(
+            'sample',
+            help='Same as gather, but for a random subset of files')
+        parser_sample.set_defaults(feature='sample')
+        parser_sample.add_argument('--size', '-n',
+                                   default=1000,
+                                   help=('Pick this number of files'))
+        parser_sample.add_argument('--output', '-o',
+                                   default='elf_data',
+                                   help=('Output data to this file. Defaults '
+                                         'to "elf_data"'))
+
         # Gather stats
         parser_gather = subparsers.add_parser(
             'gather',
@@ -70,11 +88,18 @@ def main():
 
     if config.feature == 'gather':
         stats_accu = gather_stats.gather_system_files(config)
-        stats_accu.serialize(config.output)
+        stats_accu.dump(config.output)
+
+    elif config.feature == 'sample':
+        stats_accu = gather_stats.gather_system_files(
+            config,
+            sample_size=config.size)
+        stats_accu.dump(config.output)
 
     elif config.feature == 'analyze':
         # TODO
         print("Not implemented", file=sys.stderr)
+        stats_accu = StatsAccumulator.load(config.data_file)
         sys.exit(1)
 
 
diff --git a/stats/gather_stats.py b/stats/gather_stats.py
index cc63197..9dd32d0 100644
--- a/stats/gather_stats.py
+++ b/stats/gather_stats.py
@@ -1,53 +1,120 @@
-from pyelftools_overlay import system_elfs
-import pathos
+from pyelftools_overlay import system_elfs, get_cfi
+from elftools.dwarf import callframe
+import multiprocessing
 import signal
-import itertools
+import random
 
-from stats_accu import StatsAccumulator
+from stats_accu import \
+    StatsAccumulator, SingleFdeData, \
+    RegsList, FdeData, DwarfInstr
 
 
-class FilesProcessor:
-    def __init__(self, cores, stats_accu=None):
+class FilesProcessor(multiprocessing.Process):
+    def __init__(self, elf_list, shared_queue):
+        super().__init__()
         self.stop_processing = False
-        self._processed_counter = itertools.count()
-        self.cores = cores
-
-        if stats_accu is None:
-            stats_accu = StatsAccumulator()
-        self.stats_accu = stats_accu
+        self.processed_counter = 0
+        self.elf_list = elf_list
+        self.shared_queue = shared_queue
 
     def stop_processing_now(self):
         self.stop_processing = True
 
-    def next_counter(self):
-        return self._processed_counter.__next__()
+    def run(self):
+        pos = 0
+        for descr in self.elf_list:
+            if self.stop_processing:
+                break
+            self.process_single_file(descr, pos)
+            pos += 1
 
-    def run(self, elf_list):
-        self.elf_count = len(elf_list)
-        with pathos.multiprocessing.ProcessPool(nodes=self.cores) as pool:
-            pool.map(self.process_single_file, elf_list)
+        print("=== Finished {} ===".format(self.name))
+        return 0
 
-    def process_single_file(self, elf_path):
+    def process_single_file(self, elf_descr, pos_in_list):
         if self.stop_processing:
             return
 
-        cur_file_count = self.next_counter()
-        print('> [{}/{} {:.0f}%] {}'.format(
-            cur_file_count, self.elf_count,
-            cur_file_count / self.elf_count * 100, elf_path))
-        self.stats_accu.process_file(elf_path)
+        elf_path, elf_type = elf_descr
+
+        self.processed_counter += 1
+        print('[{}, {}/{}] {}'.format(
+            self.shared_queue.qsize(),
+            pos_in_list + 1,
+            len(self.elf_list),
+            elf_path))
+        self.process_file(elf_path, elf_type)
+
+    def process_file(self, path, elftype):
+        ''' Process a single file '''
+
+        cfi = get_cfi(path)
+        if not cfi:
+            return None
+
+        data = FdeData()
+
+        for entry in cfi:
+            if isinstance(entry, callframe.CIE):  # Is a CIE
+                self.process_cie(entry, data)
+            elif isinstance(entry, callframe.FDE):  # Is a FDE
+                self.process_fde(entry, data)
+
+        out = SingleFdeData(path, elftype, data)
+        self.shared_queue.put(out)
+
+    def incr_cell(self, table, key):
+        ''' Increments table[key], or sets it to 1 if unset '''
+        if key in table:
+            table[key] += 1
+        else:
+            table[key] = 1
+
+    def process_cie(self, cie, data):
+        ''' Process a CIE '''
+        pass  # Nothing needed from a CIE
+
+    def process_fde(self, fde, data):
+        ''' Process a FDE '''
+        data.fde_count += 1
+
+        decoded = fde.get_decoded()
+        row_count = len(decoded.table)
+        self.incr_cell(data.fde_with_lines, row_count)
+
+        for row in decoded.table:
+            self.process_reg(data.regs.cfa, row['cfa'])
+            for entry in row:
+                if isinstance(entry, int):
+                    self.process_reg(data.regs.regs[entry], row[entry])
+
+    def process_reg(self, out_reg, reg_def):
+        ''' Process a register '''
+        if isinstance(reg_def, callframe.CFARule):
+            if reg_def.reg is not None:
+                out_reg.regs[reg_def.reg] += 1
+            else:
+                pass  # TODO exprs
+        else:
+            self.incr_cell(out_reg.instrs, DwarfInstr.of_pyelf(reg_def.type))
+            if reg_def.type == callframe.RegisterRule.REGISTER:
+                out_reg.regs[reg_def.arg] += 1
+            elif (reg_def.type == callframe.RegisterRule.EXPRESSION) \
+                    or (reg_def.type == callframe.RegisterRule.VAL_EXPRESSION):
+                pass  # TODO exprs
 
 
-def gather_system_files(config):
+def gather_system_files(config, sample_size=None):
     stats_accu = StatsAccumulator()
-    processor = FilesProcessor(config.cores, stats_accu)
+    processors = []
 
     def signal_graceful_exit(sig, frame):
         ''' Stop gracefully now '''
-        nonlocal processor
+        nonlocal processors
         print("Stopping after this ELF…")
-        processor.stop_processing_now()
+        for processor in processors:
+            processor.stop_processing_now()
 
     signal.signal(signal.SIGINT, signal_graceful_exit)
 
@@ -55,6 +122,54 @@ def gather_system_files(config):
     for elf_path in system_elfs():
         elf_list.append(elf_path)
 
-    processor.run(elf_list)
+    if sample_size is not None:
+        # Guard against requesting more files than are available
+        elf_list = random.sample(elf_list, min(sample_size, len(elf_list)))
+
+    elf_count = len(elf_list)
+    elf_per_process = elf_count // config.cores
+    elf_list_slices = []
+    for i in range(config.cores - 1):
+        elf_list_slices.append(
+            elf_list[i * elf_per_process : (i+1) * elf_per_process])
+    elf_list_slices.append(
+        elf_list[(config.cores - 1) * elf_per_process:])
+
+    shared_queue = multiprocessing.Queue(elf_count)
+
+    for elf_range in elf_list_slices:
+        processors.append(FilesProcessor(elf_range, shared_queue))
+
+    if config.cores > 1:
+        for processor in processors:
+            processor.start()
+
+        while True:
+            # Drain the queue while waiting: a worker cannot exit while
+            # its results still sit in the queue's pipe buffer.
+            while not shared_queue.empty():
+                stats_accu.add_fde(shared_queue.get_nowait())
+
+            for processor in processors:
+                if processor.is_alive():
+                    print("== Waiting {} ({} {}) ==".format(
+                        processor.name, processor.exitcode,
+                        processor.is_alive()))
+                    processor.join(timeout=1)
+                    if processor.exitcode is None:
+                        break  # Loop around
+                    print("== Joined {} ==".format(processor.name))
+
+            terminated = True
+            for processor in processors:
+                if processor.exitcode is None:
+                    terminated = False
+            if terminated:
+                break
+    else:
+        processors[0].run()  # run(), not start(): in the same thread
+
+    while not shared_queue.empty():  # Reliable because everything is joined
+        stats_accu.add_fde(shared_queue.get_nowait())
 
     return stats_accu
diff --git a/stats/pyelftools_overlay.py b/stats/pyelftools_overlay.py
index 76c7d04..7a27422 100644
--- a/stats/pyelftools_overlay.py
+++ b/stats/pyelftools_overlay.py
@@ -2,6 +2,7 @@
 
 from elftools.elf.elffile import ELFFile
 from elftools.common.exceptions import ELFError, DWARFError
+from stats_accu import ElfType
 import os
 
 
@@ -44,20 +45,20 @@ def system_elfs():
                                     os.readlink(path)))
 
     sysbin_dirs = [
-        '/lib',
-        '/usr/lib',
-        '/usr/local/lib',
-        '/bin',
-        '/usr/bin',
-        '/usr/local/bin',
-        '/sbin',
+        ('/lib', ElfType.ELF_LIB),
+        ('/usr/lib', ElfType.ELF_LIB),
+        ('/usr/local/lib', ElfType.ELF_LIB),
+        ('/bin', ElfType.ELF_BINARY),
+        ('/usr/bin', ElfType.ELF_BINARY),
+        ('/usr/local/bin', ElfType.ELF_BINARY),
+        ('/sbin', ElfType.ELF_BINARY),
     ]
     to_explore = sysbin_dirs
 
     seen_elfs = set()
 
     while to_explore:
-        bindir = to_explore.pop()
+        bindir, elftype = to_explore.pop()
 
         if not os.path.isdir(bindir):
             continue
@@ -65,12 +66,23 @@ def system_elfs():
         for direntry in os.scandir(bindir):
             if not direntry.is_file():
                 if direntry.is_dir():
-                    to_explore.append(direntry.path)
+                    to_explore.append((direntry.path, elftype))
                 continue
 
             canonical_name = readlink_rec(direntry.path)
             if canonical_name in seen_elfs:
                 continue
 
+            valid_elf = True
+            try:
+                with open(canonical_name, 'rb') as handle:
+                    magic_bytes = handle.read(4)
+                    if magic_bytes != b'\x7fELF':
+                        valid_elf = False
+            except Exception:
+                continue
+            if not valid_elf:
+                continue
+
             seen_elfs.add(canonical_name)
-            yield canonical_name
+            yield (canonical_name, elftype)
diff --git a/stats/stats_accu.py b/stats/stats_accu.py
index 3a2995c..f1f7651 100644
--- a/stats/stats_accu.py
+++ b/stats/stats_accu.py
@@ -1,9 +1,9 @@
 from elftools.dwarf import callframe
-from pyelftools_overlay import get_cfi
-from enum import Enum
-import json
+import enum
 import subprocess
 import re
+import json
+import collections
 
 from math import ceil
 
@@ -69,109 +69,194 @@ def elf_so_deps(path):
                             "{}.").format(path, exn.returncode))
 
 
-class ElfType(Enum):
-    ELF_LIB = auto()
-    ELF_BINARY = auto()
+class ElfType(enum.Enum):
+    ELF_LIB = enum.auto()
+    ELF_BINARY = enum.auto()
+
+
+class DwarfInstr(enum.Enum):
+    @staticmethod
+    def of_pyelf(val):
+        _table = {
+            callframe.RegisterRule.UNDEFINED: DwarfInstr.INSTR_UNDEF,
+            callframe.RegisterRule.SAME_VALUE: DwarfInstr.INSTR_SAME_VALUE,
+            callframe.RegisterRule.OFFSET: DwarfInstr.INSTR_OFFSET,
+            callframe.RegisterRule.VAL_OFFSET: DwarfInstr.INSTR_VAL_OFFSET,
+            callframe.RegisterRule.REGISTER: DwarfInstr.INSTR_REGISTER,
+            callframe.RegisterRule.EXPRESSION: DwarfInstr.INSTR_EXPRESSION,
+            callframe.RegisterRule.VAL_EXPRESSION:
+                DwarfInstr.INSTR_VAL_EXPRESSION,
+            callframe.RegisterRule.ARCHITECTURAL:
+                DwarfInstr.INSTR_ARCHITECTURAL,
+        }
+        return _table[val]
+
+    INSTR_UNDEF = enum.auto()
+    INSTR_SAME_VALUE = enum.auto()
+    INSTR_OFFSET = enum.auto()
+    INSTR_VAL_OFFSET = enum.auto()
+    INSTR_REGISTER = enum.auto()
+    INSTR_EXPRESSION = enum.auto()
+    INSTR_VAL_EXPRESSION = enum.auto()
+    INSTR_ARCHITECTURAL = enum.auto()
+
+
+def intify_dict(d):
+    out = {}
+    for key in d:
+        try:
+            nKey = int(key)
+        except Exception:
+            nKey = key
+
+        try:
+            out[nKey] = int(d[key])
+        except ValueError:
+            out[nKey] = d[key]
+    return out
+
+
+class RegData:
+    def __init__(self, instrs=None, regs=None, exprs=None):
+        if instrs is None:
+            instrs = {}
+        if regs is None:
+            regs = [0]*17
+        if exprs is None:
+            exprs = {}
+        self.instrs = intify_dict(instrs)
+        self.regs = regs
+        self.exprs = intify_dict(exprs)
+
+    @staticmethod
+    def map_dict_keys(fnc, dic):
+        out = {}
+        for key in dic:
+            out[fnc(key)] = dic[key]
+        return out
+
+    def dump(self):
+        return {
+            'instrs': RegData.map_dict_keys(lambda x: x.value, self.instrs),
+            'regs': self.regs,
+            'exprs': self.exprs,
+        }
+
+    @staticmethod
+    def load(data):
+        return RegData(
+            instrs=RegData.map_dict_keys(
+                lambda x: DwarfInstr(int(x)),
+                data['instrs']),
+            regs=data['regs'],
+            exprs=data['exprs'],
+        )
+
+
+class RegsList:
+    def __init__(self, cfa=None, regs=None):
+        if cfa is None:
+            cfa = RegsList.fresh_reg()
+        if regs is None:
+            regs = [RegsList.fresh_reg() for _ in range(17)]
+        self.cfa = cfa
+        self.regs = regs
+
+    @staticmethod
+    def fresh_reg():
+        return RegData()
+
+    def dump(self):
+        return {
+            'cfa': RegData.dump(self.cfa),
+            'regs': [RegData.dump(r) for r in self.regs],
+        }
+
+    @staticmethod
+    def load(data):
+        return RegsList(
+            cfa=RegData.load(data['cfa']),
+            regs=[RegData.load(r) for r in data['regs']],
+        )
+
+
+class FdeData:
+    def __init__(self, fde_count=0, fde_with_lines=None, regs=None):
+        if fde_with_lines is None:
+            fde_with_lines = {}
+        if regs is None:
+            regs = RegsList()
+
+        self.fde_count = fde_count
+        self.fde_with_lines = intify_dict(fde_with_lines)
+        self.regs = regs
+
+    def dump(self):
+        return {
+            'fde_count': self.fde_count,
+            'fde_with_lines': self.fde_with_lines,
+            'regs': self.regs.dump(),
+        }
+
+    @staticmethod
+    def load(data):
+        return FdeData(
+            fde_count=int(data['fde_count']),
+            fde_with_lines=data['fde_with_lines'],
+            regs=RegsList.load(data['regs']))
 
 
 class SingleFdeData:
     def __init__(self, path, elf_type, data):
         self.path = path
         self.elf_type = elf_type
-        self.data = data
+        self.data = data  # of type FdeData
 
         self.gather_deps()
 
     def gather_deps(self):
         """ Collect ldd data on the binary """
-        self.deps = elf_so_deps(self.path)
+        # self.deps = elf_so_deps(self.path)
+        self.deps = []
+
+    def dump(self):
+        return {
+            'path': self.path,
+            'elf_type': self.elf_type.value,
+            'data': self.data.dump()
+        }
+
+    @staticmethod
+    def load(data):
+        return SingleFdeData(
+            data['path'],
+            ElfType(int(data['elf_type'])),
+            FdeData.load(data['data']))
 
 
 class StatsAccumulator:
     def __init__(self):
-        self.elf_count = 0
-        self.fde_count = 0
-        self.fde_row_count = 0
-        self.fde_with_n_rows = {}
+        self.fdes = []
 
-    def serialize(self, path):
-        ''' Save the gathered data to `stream` '''
+    def add_fde(self, fde_data):
+        self.fdes.append(fde_data)
 
-        notable_fields = [
-            'elf_count',
-            'fde_count',
-            'fde_row_count',
-            'fde_with_n_rows',
-        ]
-        out = {}
-        for field in notable_fields:
-            out[field] = self.__dict__[field]
+    def get_fdes(self):
+        return self.fdes
 
-        with open(path, 'wb') as stream:
-            json.dump(out, stream)
+    def add_stats_accu(self, stats_accu):
+        for fde in stats_accu.get_fdes():
+            self.add_fde(fde)
+
+    def dump(self, path):
+        dict_form = [fde.dump() for fde in self.fdes]
+        with open(path, 'w') as handle:
+            handle.write(json.dumps(dict_form))
 
     @staticmethod
-    def unserialize(path):
+    def load(path):
+        with open(path, 'r') as handle:
+            text = handle.read()
         out = StatsAccumulator()
-        with open(path, 'wb') as stream:
-            data = json.load(stream)
-        for field in data:
-            out.field = data[field]
+        out.fdes = [SingleFdeData.load(data) for data in json.loads(text)]
         return out
-
-    def report(self):
-        ''' Report on the statistics gathered '''
-
-        self.fde_rows_proportion = ProportionFinder(
-            self.fde_with_n_rows)
-
-        rows = [
-            ("ELFs analyzed", self.elf_count),
-            ("FDEs analyzed", self.fde_count),
-            ("FDE rows analyzed", self.fde_row_count),
-            ("Avg. rows per FDE", self.fde_row_count / self.fde_count),
-            ("Median rows per FDE",
-             self.fde_rows_proportion.find_at_proportion(0.5)),
-            ("Max rows per FDE", max(self.fde_with_n_rows.keys())),
-        ]
-
-        title_size = max(map(lambda x: len(x[0]), rows))
-        line_format = "{:<" + str(title_size + 1) + "} {}"
-
-        for row in rows:
-            print(line_format.format(row[0], row[1]))
-
-    def process_file(self, path):
-        ''' Process a single file '''
-
-        cfi = get_cfi(path)
-        if not cfi:
-            return
-
-        self.elf_count += 1
-
-        for entry in cfi:
-            if isinstance(entry, callframe.CIE):  # Is a CIE
-                self.process_cie(entry)
-            elif isinstance(entry, callframe.FDE):  # Is a FDE
-                self.process_fde(entry)
-
-    def incr_cell(self, table, key):
-        ''' Increments table[key], or sets it to 1 if unset '''
-        if key in table:
-            table[key] += 1
-        else:
-            table[key] = 1
-
-    def process_cie(self, cie):
-        ''' Process a CIE '''
-        pass  # Nothing needed from a CIE
-
-    def process_fde(self, fde):
-        ''' Process a FDE '''
-        self.fde_count += 1
-
-        decoded = fde.get_decoded()
-        row_count = len(decoded.table)
-        self.fde_row_count += row_count
-        self.incr_cell(self.fde_with_n_rows, row_count)