diff --git a/stats/.gitignore b/stats/.gitignore
index 7a12b44..b9f1756 100644
--- a/stats/.gitignore
+++ b/stats/.gitignore
@@ -1,2 +1,3 @@
 venv
-elf_data
+elf_data*
+gathered
diff --git a/stats/fde_stats.py b/stats/fde_stats.py
index 05bad0d..f9e9aef 100755
--- a/stats/fde_stats.py
+++ b/stats/fde_stats.py
@@ -18,6 +18,7 @@ class Config:
 
         elif args.feature == 'sample':
             self.size = int(args.size)
+            self.output = args.output
 
         elif args.feature == 'analyze':
             self.data_file = args.data_file
@@ -93,9 +94,9 @@ def main():
         stats_accu = gather_stats.gather_system_files(
             config, sample_size=config.size)
+        stats_accu.dump(config.output)
 
     elif config.feature == 'analyze':
-        # TODO
         print("Not implemented", file=sys.stderr)
         stats_accu = StatsAccumulator.load(config.data_file)
         sys.exit(1)
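With the two hunks above, `sample` now persists what it gathers instead of discarding it, and `analyze` starts from that dump (the analysis itself still bails out). A minimal sketch of the intended round-trip; the CLI flag spellings are an assumption, since the argparse wiring is outside this diff:

```python
# Hypothetical usage (flag names assumed, not shown in this diff):
#   ./fde_stats.py sample --size 500 --output elf_data.json
#   ./fde_stats.py analyze elf_data.json
from stats_accu import StatsAccumulator

stats_accu = StatsAccumulator.load('elf_data.json')  # re-reads dump()'s JSON
print(len(stats_accu.get_fdes()))                    # one entry per sampled ELF
```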
diff --git a/stats/gather_stats.py b/stats/gather_stats.py
index 9dd32d0..c891b6b 100644
--- a/stats/gather_stats.py
+++ b/stats/gather_stats.py
@@ -1,122 +1,98 @@
+from elftools.common.exceptions import DWARFError
 from pyelftools_overlay import system_elfs, get_cfi
 from elftools.dwarf import callframe
-import multiprocessing
-import signal
+import concurrent.futures
 import random
+
 from stats_accu import \
-    StatsAccumulator, SingleFdeData, \
-    RegsList, FdeData, DwarfInstr
+    StatsAccumulator, SingleFdeData, FdeData, DwarfInstr
 
 
-class FilesProcessor(multiprocessing.Process):
-    def __init__(self, elf_list, shared_queue):
-        super().__init__()
-        self.stop_processing = False
-        self.processed_counter = 0
-        self.elf_list = elf_list
-        self.shared_queue = shared_queue
+class ProcessWrapper:
+    def __init__(self, fct):
+        self._fct = fct
 
-    def stop_processing_now(self):
-        self.stop_processing = True
+    def __call__(self, elf_descr):
+        try:
+            path, elftype = elf_descr
 
-    def run(self):
-        pos = 0
-        for descr in self.elf_list:
-            if self.stop_processing:
-                break
-            self.process_single_file(descr, pos)
-            pos += 1
+            print("Processing {}…".format(path))
 
-        print("=== Finished {} ===".format(self.name))
-        return 0
+            cfi = get_cfi(path)
+            if not cfi:
+                return None
 
-    def process_single_file(self, elf_descr, pos_in_list):
-        if self.stop_processing:
-            return
-
-        elf_path, elf_type = elf_descr
-
-        self.processed_counter += 1
-        print('[{}, {}/{}] {}'.format(
-            self.shared_queue.qsize(),
-            pos_in_list + 1,
-            len(self.elf_list),
-            elf_path))
-        self.process_file(elf_path, elf_type)
-
-    def process_file(self, path, elftype):
-        ''' Process a single file '''
-
-        cfi = get_cfi(path)
-        if not cfi:
+            return self._fct(path, elftype, cfi)
+        except DWARFError:
             return None
 
-        data = FdeData()
-        for entry in cfi:
-            if isinstance(entry, callframe.CIE):  # Is a CIE
-                self.process_cie(entry, data)
-            elif isinstance(entry, callframe.FDE):  # Is a FDE
-                self.process_fde(entry, data)
 
+def process_wrapper(fct):
+    return ProcessWrapper(fct)
 
-        out = SingleFdeData(path, elftype, data)
-        self.shared_queue.put(out)
 
-    def incr_cell(self, table, key):
-        ''' Increments table[key], or sets it to 1 if unset '''
-        if key in table:
-            table[key] += 1
+@process_wrapper
+def process_elf(path, elftype, cfi):
+    ''' Process a single file '''
+
+    data = FdeData()
+
+    for entry in cfi:
+        if isinstance(entry, callframe.CIE):  # Is a CIE
+            process_cie(entry, data)
+        elif isinstance(entry, callframe.FDE):  # Is a FDE
+            process_fde(entry, data)
+
+    return SingleFdeData(path, elftype, data)
+
+
+def incr_cell(table, key):
+    ''' Increments table[key], or sets it to 1 if unset '''
+    if key in table:
+        table[key] += 1
+    else:
+        table[key] = 1
+
+
+def process_cie(cie, data):
+    ''' Process a CIE '''
+    pass  # Nothing needed from a CIE
+
+
+def process_fde(fde, data):
+    ''' Process a FDE '''
+    data.fde_count += 1
+
+    decoded = fde.get_decoded()
+    row_count = len(decoded.table)
+    incr_cell(data.fde_with_lines, row_count)
+
+    for row in decoded.table:
+        process_reg(data.regs.cfa, row['cfa'])
+        for entry in row:
+            if isinstance(entry, int):
+                process_reg(data.regs.regs[entry], row[entry])
+
+
+def process_reg(out_reg, reg_def):
+    ''' Process a register '''
+    if isinstance(reg_def, callframe.CFARule):
+        if reg_def.reg is not None:
+            out_reg.regs[reg_def.reg] += 1
         else:
-            table[key] = 1
-
-    def process_cie(self, cie, data):
-        ''' Process a CIE '''
-        pass  # Nothing needed from a CIE
-
-    def process_fde(self, fde, data):
-        ''' Process a FDE '''
-        data.fde_count += 1
-
-        decoded = fde.get_decoded()
-        row_count = len(decoded.table)
-        self.incr_cell(data.fde_with_lines, row_count)
-
-        for row in decoded.table:
-            self.process_reg(data.regs.cfa, row['cfa'])
-            for entry in row:
-                if isinstance(entry, int):
-                    self.process_reg(data.regs.regs[entry], row[entry])
-
-    def process_reg(self, out_reg, reg_def):
-        ''' Process a register '''
-        if isinstance(reg_def, callframe.CFARule):
-            if reg_def.reg is not None:
-                out_reg.regs[reg_def.reg] += 1
-            else:
-                pass  # TODO exprs
-        else:
-            self.incr_cell(out_reg.instrs, DwarfInstr.of_pyelf(reg_def.type))
-            if reg_def.type == callframe.RegisterRule.REGISTER:
-                out_reg.regs[reg_def.arg] += 1
-            elif (reg_def.type == callframe.RegisterRule.EXPRESSION) \
-                    or (reg_def.type == callframe.RegisterRule.VAL_EXPRESSION):
-                pass  # TODO exprs
+            pass  # TODO exprs
+    else:
+        incr_cell(out_reg.instrs, DwarfInstr.of_pyelf(reg_def.type))
+        if reg_def.type == callframe.RegisterRule.REGISTER:
+            out_reg.regs[reg_def.arg] += 1
+        elif (reg_def.type == callframe.RegisterRule.EXPRESSION) \
+                or (reg_def.type == callframe.RegisterRule.VAL_EXPRESSION):
+            pass  # TODO exprs
 
 
 def gather_system_files(config, sample_size=None):
     stats_accu = StatsAccumulator()
-    processors = []
-
-    def signal_graceful_exit(sig, frame):
-        ''' Stop gracefully now '''
-        nonlocal processors
-
-        print("Stopping after this ELF…")
-        for processor in processors:
-            processor.stop_processing_now()
-
-    signal.signal(signal.SIGINT, signal_graceful_exit)
 
     elf_list = []
     for elf_path in system_elfs():
@@ -126,46 +102,46 @@ def gather_system_files(config, sample_size=None):
         elf_list_sampled = random.sample(elf_list, sample_size)
         elf_list = elf_list_sampled
 
-    elf_count = len(elf_list)
-    elf_per_process = elf_count // config.cores
-    elf_list_slices = []
-    for i in range(config.cores - 1):
-        elf_list_slices.append(
-            elf_list[i * elf_per_process : (i+1) * elf_per_process])
-    elf_list_slices.append(
-        elf_list[(config.cores - 1) * elf_per_process
-                 : config.cores * elf_per_process])
-
-    shared_queue = multiprocessing.Queue(elf_count)
-
-    for elf_range in elf_list_slices:
-        processors.append(FilesProcessor(elf_range, shared_queue))
-
     if config.cores > 1:
-        for processor in processors:
-            processor.start()
-
-        while True:
-            for processor in processors:
-                if processor.is_alive():
-                    print("== Waiting {} ({} {}) ==".format(
-                        processor.name, processor.exitcode,
-                        processor.is_alive()))
-                    processor.join(timeout=1)
-                    if processor.exitcode is None:
-                        break  # Loop around
-                print("== Joined {} ==".format(processor.name))
-
-            terminated = True
-            for processor in processors:
-                if processor.exitcode is None:
-                    terminated = False
-            if terminated:
-                break
+        with concurrent.futures.ProcessPoolExecutor(max_workers=config.cores)\
+                as executor:
+            for fde in executor.map(process_elf, elf_list):
+                stats_accu.add_fde(fde)
     else:
-        processors[0].run()  # run(), not start(): in the same thread
-
-    while not shared_queue.empty():  # Reliable because everything is joined
-        stats_accu.add_fde(shared_queue.get_nowait())
+        for elf in elf_list:
+            stats_accu.add_fde(process_elf(elf))
 
     return stats_accu
+
+
+def map_system_files(mapper, sample_size=None, cores=None, include=None,
+                     elflist=None):
+    ''' `mapper` must take (path, elf_type, cfi) '''
+    if cores is None:
+        cores = 1
+    if include is None:
+        include = []
+
+    mapper = process_wrapper(mapper)
+
+    if elflist is None:
+        elf_list = []
+        for elf_path in system_elfs():
+            elf_list.append(elf_path)
+
+        if sample_size is not None:
+            elf_list_sampled = random.sample(elf_list, sample_size)
+            elf_list = elf_list_sampled
+
+        elf_list += list(map(lambda x: (x, None), include))
+    else:
+        elf_list = elflist
+
+    if cores > 1:
+        with concurrent.futures.ProcessPoolExecutor(max_workers=cores)\
+                as executor:
+            out = executor.map(mapper, elf_list)
+    else:
+        out = map(mapper, elf_list)
+
+    return out, elf_list
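This rewrite trades the hand-rolled `multiprocessing.Process` subclass, shared queue and SIGINT plumbing for `concurrent.futures.ProcessPoolExecutor`, which handles work distribution, result ordering and joining on its own. One subtlety: the executor pickles the callable it ships to workers, so `process_wrapper` is written as a class producing a `ProcessWrapper` instance rather than an ordinary decorator closure, which would not pickle. A wrinkle worth double-checking: pickle resolves functions by qualified name, so rebinding the name `process_elf` to the wrapper instance can trip pickle's identity check when the pool ships the callable; this may be why `@fde_processor` is left commented out on `find_non_cfa` in helpers.py. A self-contained sketch of the same pattern, with illustrative names that are not part of the patch:

```python
import concurrent.futures


class SafeCall:
    ''' Picklable stand-in for ProcessWrapper: turns errors into None. '''
    def __init__(self, fct):
        self._fct = fct

    def __call__(self, path):
        try:
            return self._fct(path)
        except OSError:
            return None


def file_size(path):
    ''' Stand-in for process_elf: any top-level, picklable callable works. '''
    with open(path, 'rb') as handle:
        return len(handle.read())


if __name__ == '__main__':
    paths = ['/bin/ls', '/bin/cat', '/nonexistent']
    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
        # map() yields results in input order; failures surface here as None
        for size in executor.map(SafeCall(file_size), paths):
            print(size)
```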
diff --git a/stats/helpers.py b/stats/helpers.py
new file mode 100644
index 0000000..b93eb0d
--- /dev/null
+++ b/stats/helpers.py
@@ -0,0 +1,228 @@
+from elftools.dwarf import callframe
+import gather_stats
+import itertools
+import functools
+
+REGS_IDS = {
+    'RAX': 0,
+    'RDX': 1,
+    'RCX': 2,
+    'RBX': 3,
+    'RSI': 4,
+    'RDI': 5,
+    'RBP': 6,
+    'RSP': 7,
+    'R8': 8,
+    'R9': 9,
+    'R10': 10,
+    'R11': 11,
+    'R12': 12,
+    'R13': 13,
+    'R14': 14,
+    'R15': 15,
+    'RIP': 16
+}
+
+ID_TO_REG = [
+    'RAX',
+    'RDX',
+    'RCX',
+    'RBX',
+    'RSI',
+    'RDI',
+    'RBP',
+    'RSP',
+    'R8',
+    'R9',
+    'R10',
+    'R11',
+    'R12',
+    'R13',
+    'R14',
+    'R15',
+    'RIP',
+]
+
+HANDLED_REGS = list(map(lambda x: REGS_IDS[x], [
+    'RIP',
+    'RSP',
+    'RBP',
+    'RBX',
+]))
+
+ONLY_HANDLED_REGS = True  # only analyze the handled regs' columns
+
+PLT_EXPR = [119, 8, 128, 0, 63, 26, 59, 42, 51, 36, 34]  # Handled expression
+
+
+def accumulate_regs(reg_list):
+    out = [0] * 17
+    for lst in reg_list:
+        for pos in range(len(lst)):
+            out[pos] += lst[pos]
+
+    return out
+
+
+def filter_none(lst):
+    for x in lst:
+        if x:
+            yield x
+
+
+def deco_filter_none(fct):
+    def wrap(lst):
+        return fct(filter_none(lst))
+    return wrap
+
+
+class FdeProcessor:
+    def __init__(self, fct, reducer=None):
+        self._fct = fct
+        self._reducer = reducer
+
+    def __call__(self, path, elftype, cfi):
+        out = []
+        for entry in cfi:
+            if isinstance(entry, callframe.FDE):
+                decoded = entry.get_decoded()
+                out.append(self._fct(path, entry, decoded))
+                if self._reducer is not None and len(out) >= 2:
+                    out = [self._reducer(out)]
+        return out
+
+
+class FdeProcessorReduced:
+    def __init__(self, reducer):
+        self._reducer = reducer
+
+    def __call__(self, fct):
+        return FdeProcessor(fct, self._reducer)
+
+
+def fde_processor(fct):
+    return FdeProcessor(fct)
+
+
+def fde_processor_reduced(reducer):
+    return FdeProcessorReduced(reducer)
+
+
+def is_handled_expr(expr):
+    if expr == PLT_EXPR:
+        return True
+
+    if len(expr) == 2 and 0x70 <= expr[0] <= 0x89:
+        if expr[0] - 0x70 in HANDLED_REGS:
+            return True
+    return False
+
+
+# @fde_processor
+def find_non_cfa(path, fde, decoded):
+    regs_seen = 0
+    non_handled_regs = 0
+    non_handled_exp = 0
+    cfa_dat = [0, 0]  # Seen, expr
+    rule_type = {
+        callframe.RegisterRule.UNDEFINED: 0,
+        callframe.RegisterRule.SAME_VALUE: 0,
+        callframe.RegisterRule.OFFSET: 0,
+        callframe.RegisterRule.VAL_OFFSET: 0,
+        callframe.RegisterRule.REGISTER: 0,
+        callframe.RegisterRule.EXPRESSION: 0,
+        callframe.RegisterRule.VAL_EXPRESSION: 0,
+        callframe.RegisterRule.ARCHITECTURAL: 0,
+    }
+    problematic_paths = set()
+
+    for row in decoded.table:
+        for entry in row:
+            reg_def = row[entry]
+
+            if entry == 'cfa':
+                cfa_dat[0] += 1
+                if reg_def.expr:
+                    cfa_dat[1] += 1
+                    if not is_handled_expr(reg_def.expr):
+                        non_handled_exp += 1
+                        problematic_paths.add(path)
+                elif reg_def:
+                    if reg_def.reg not in HANDLED_REGS:
+                        non_handled_regs += 1
+                        problematic_paths.add(path)
+
+            if not isinstance(entry, int):  # CFA or PC
+                continue
+
+            if ONLY_HANDLED_REGS and entry not in HANDLED_REGS:
+                continue
+
+            rule_type[reg_def.type] += 1
+            reg_rule = reg_def.type
+
+            if reg_rule in [callframe.RegisterRule.OFFSET,
+                            callframe.RegisterRule.VAL_OFFSET]:
+                regs_seen += 1  # CFA-relative
+            elif reg_rule == callframe.RegisterRule.REGISTER:
+                regs_seen += 1
+                if reg_def.arg not in HANDLED_REGS:
+                    problematic_paths.add(path)
+                    non_handled_regs += 1
+            elif reg_rule in [callframe.RegisterRule.EXPRESSION,
+                              callframe.RegisterRule.VAL_EXPRESSION]:
+                expr = reg_def.arg
+                if not is_handled_expr(reg_def.arg):
+                    problematic_paths.add(path)
+                    with open('/tmp/exprs', 'a') as handle:
+                        handle.write('[{} - {}] {}\n'.format(
+                            path, fde.offset,
+                            ', '.join(map(lambda x: hex(x), expr))))
+                    non_handled_exp += 1
+
+    return (regs_seen, non_handled_regs, non_handled_exp, rule_type, cfa_dat,
+            problematic_paths)
+
+
+def reduce_non_cfa(lst):
+    def merge_dict(d1, d2):
+        for x in d1:
+            d1[x] += d2[x]
+        return d1
+
+    def merge_list(l1, l2):
+        out = []
+        for pos in range(len(l1)):  # Implicit assumption len(l1) == len(l2)
+            out.append(l1[pos] + l2[pos])
+        return out
+
+    def merge_elts(accu, elt):
+        accu_regs, accu_nh, accu_exp, accu_rt, accu_cfa, accu_paths = accu
+        elt_regs, elt_nh, elt_exp, elt_rt, elt_cfa, elt_paths = elt
+        return (
+            accu_regs + elt_regs,
+            accu_nh + elt_nh,
+            accu_exp + elt_exp,
+            merge_dict(accu_rt, elt_rt),
+            merge_list(accu_cfa, elt_cfa),
+            accu_paths.union(elt_paths),
+        )
+
+    return functools.reduce(merge_elts, lst)
+
+
+@deco_filter_none
+def flatten_non_cfa(result):
+    flat = itertools.chain.from_iterable(result)
+    out = reduce_non_cfa(flat)
+    out_cfa = {
+        'seen': out[4][0],
+        'expr': out[4][1],
+        'offset': out[4][0] - out[4][1],
+    }
+    out = (out[0],
+           (out[1], out[0] + out_cfa['offset']),
+           (out[2], out[3]['EXPRESSION'] + out_cfa['expr']),
+           out[3],
+           out_cfa,
+           out[5])
+    return out
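`helpers.py` is the analysis-side counterpart of `map_system_files`: `fde_processor`/`fde_processor_reduced` turn a per-FDE function into a per-ELF mapper, the optional reducer keeps per-worker memory bounded by folding results pairwise as they accumulate, and `flatten_non_cfa` merges all per-ELF results into one summary. A hedged driver sketch, assuming the two modules are importable side by side (this glue is not part of the patch):

```python
# Hypothetical driver: sample 100 system ELFs on 4 cores and summarise
# which register rules fall outside the handled set.
from gather_stats import map_system_files
import helpers

mapper = helpers.fde_processor_reduced(helpers.reduce_non_cfa)(
    helpers.find_non_cfa)
results, elf_list = map_system_files(mapper, sample_size=100, cores=4)

summary = helpers.flatten_non_cfa(results)  # drops None results itself
# summary: (regs seen, (non-handled regs, total), (non-handled exprs, total),
#           rule-type counts, CFA stats, set of problematic paths)
```

For `is_handled_expr`, the `0x70 <= expr[0] <= 0x89` test matches two-byte `DW_OP_bregN` expressions (0x70 is `DW_OP_breg0`, so `expr[0] - 0x70` is the register number, checked against `HANDLED_REGS`), while `PLT_EXPR` appears to be the one canned CFA expression emitted for PLT entries, hence the name.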
diff --git a/stats/pyelftools_overlay.py b/stats/pyelftools_overlay.py
index 7a27422..7b1c1ef 100644
--- a/stats/pyelftools_overlay.py
+++ b/stats/pyelftools_overlay.py
@@ -6,6 +6,11 @@ from stats_accu import ElfType
 import os
 
 
+ELF_BLACKLIST = [
+    '/usr/lib/libavcodec.so',
+]
+
+
 def get_cfi(path):
     ''' Get the CFI entries from the ELF at the provided path '''
 
@@ -14,6 +19,7 @@ def get_cfi(path):
         elf_file = ELFFile(file_handle)
 
         if not elf_file.has_dwarf_info():
+            print("No DWARF")
             return None
 
         dw_info = elf_file.get_dwarf_info()
@@ -22,12 +28,19 @@ def get_cfi(path):
         elif dw_info.has_EH_CFI():
             cfis = dw_info.EH_CFI_entries()
         else:
+            print("No CFI")
            return None
     except ELFError:
+        print("ELF Error")
        return None
    except DWARFError:
+        print("DWARF Error")
        return None
    except PermissionError:
+        print("Permission Error")
+        return None
+    except KeyError:
+        print("Key Error")
        return None
 
    return cfis
@@ -70,6 +83,9 @@ def system_elfs():
                 continue
 
             canonical_name = readlink_rec(direntry.path)
+            if any(canonical_name.startswith(blacked)
+                   for blacked in ELF_BLACKLIST):
+                continue
             if canonical_name in seen_elfs:
                 continue
@@ -79,10 +95,16 @@ def system_elfs():
                     magic_bytes = handle.read(4)
                     if magic_bytes != b'\x7fELF':
                         valid_elf = False
+                    elf_class = handle.read(1)
+                    if elf_class != b'\x02':  # ELF64
+                        valid_elf = False
             except Exception:
                 continue
             if not valid_elf:
                 continue
 
+            if not os.path.isfile(canonical_name):
+                continue
+
             seen_elfs.add(canonical_name)
             yield (canonical_name, elftype)
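The header probe above reads five bytes in total: offsets 0 to 3 are the ELF magic, and offset 4 is `EI_CLASS`, where 2 means a 64-bit ELF, so the single extra `read(1)` after the magic is enough to keep only ELF64 binaries. A standalone illustration of the same check (any Linux binary will do):

```python
# e_ident layout: EI_MAG0..EI_MAG3 (b'\x7fELF'), then EI_CLASS
# (1 = ELF32, 2 = ELF64).
with open('/bin/ls', 'rb') as handle:
    ident = handle.read(5)

assert ident[:4] == b'\x7fELF'
print('ELF64' if ident[4] == 2 else 'not ELF64')
```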
diff --git a/stats/requirements.txt b/stats/requirements.txt
index 545f4bd..6df6fc4 100644
--- a/stats/requirements.txt
+++ b/stats/requirements.txt
@@ -1,2 +1 @@
 git+https://github.com/eliben/pyelftools
-git+https://github.com/uqfoundation/pathos
diff --git a/stats/stats_accu.py b/stats/stats_accu.py
index f1f7651..021491b 100644
--- a/stats/stats_accu.py
+++ b/stats/stats_accu.py
@@ -239,7 +239,8 @@ class StatsAccumulator:
         self.fdes = []
 
     def add_fde(self, fde_data):
-        self.fdes.append(fde_data)
+        if fde_data:
+            self.fdes.append(fde_data)
 
     def get_fdes(self):
         return self.fdes
@@ -250,7 +251,6 @@ class StatsAccumulator:
 
     def dump(self, path):
         dict_form = [fde.dump() for fde in self.fdes]
-        print(dict_form)
         with open(path, 'w') as handle:
             handle.write(json.dumps(dict_form))
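Dropping `pathos` from `requirements.txt` fits the move to the standard library's `concurrent.futures`. The new guard in `add_fde` is what makes the executor-based pipeline hold together: workers return `None` for ELFs they could not process, `executor.map` yields those `None`s verbatim, and the accumulator now filters them in one place. An illustrative check, using a plain string where real code would pass a `SingleFdeData`:

```python
from stats_accu import StatsAccumulator

stats_accu = StatsAccumulator()
for fde_data in [None, 'fde-stub', None]:  # 'fde-stub' stands in for SingleFdeData
    stats_accu.add_fde(fde_data)

assert stats_accu.get_fdes() == ['fde-stub']  # the Nones were dropped
```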