stats: various modifications

Théophile Bastian 2018-08-08 14:31:05 +02:00
parent 216e442f5b
commit 6629de9a3e
7 changed files with 369 additions and 142 deletions

stats/.gitignore (vendored): 3 changed lines

@@ -1,2 +1,3 @@
 venv
-elf_data
+elf_data*
+gathered

Changed file (name not captured; the command-line entry point with Config and main()):

@@ -18,6 +18,7 @@ class Config:
         elif args.feature == 'sample':
             self.size = int(args.size)
+            self.output = args.output
         elif args.feature == 'analyze':
             self.data_file = args.data_file
@@ -93,9 +94,9 @@ def main():
         stats_accu = gather_stats.gather_system_files(
             config,
             sample_size=config.size)
+        stats_accu.dump(config.output)
     elif config.feature == 'analyze':
-        # TODO
         print("Not implemented", file=sys.stderr)
         stats_accu = StatsAccumulator.load(config.data_file)
         sys.exit(1)
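The 'sample' feature now persists its results and 'analyze' loads them back. A minimal sketch of that round trip, using only the API this commit touches (the path is an example, not from the repo):

    from stats_accu import StatsAccumulator

    stats_accu = StatsAccumulator()          # filled by gather_system_files
    stats_accu.dump('gathered/sample.json')  # what the sample feature now does
    restored = StatsAccumulator.load('gathered/sample.json')  # the analyze side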

Changed file (name not captured; the stats-gathering module, imported elsewhere as gather_stats):

@@ -1,94 +1,81 @@
+from elftools.common.exceptions import DWARFError
 from pyelftools_overlay import system_elfs, get_cfi
 from elftools.dwarf import callframe
-import multiprocessing
-import signal
+import concurrent.futures
 import random
 from stats_accu import \
-    StatsAccumulator, SingleFdeData, \
-    RegsList, FdeData, DwarfInstr
+    StatsAccumulator, SingleFdeData, FdeData, DwarfInstr


-class FilesProcessor(multiprocessing.Process):
-    def __init__(self, elf_list, shared_queue):
-        super().__init__()
-        self.stop_processing = False
-        self.processed_counter = 0
-        self.elf_list = elf_list
-        self.shared_queue = shared_queue
-
-    def stop_processing_now(self):
-        self.stop_processing = True
-
-    def run(self):
-        pos = 0
-        for descr in self.elf_list:
-            if self.stop_processing:
-                break
-            self.process_single_file(descr, pos)
-            pos += 1
-        print("=== Finished {} ===".format(self.name))
-        return 0
-
-    def process_single_file(self, elf_descr, pos_in_list):
-        if self.stop_processing:
-            return
-        elf_path, elf_type = elf_descr
-        self.processed_counter += 1
-        print('[{}, {}/{}] {}'.format(
-            self.shared_queue.qsize(),
-            pos_in_list + 1,
-            len(self.elf_list),
-            elf_path))
-        self.process_file(elf_path, elf_type)
-
-    def process_file(self, path, elftype):
-        ''' Process a single file '''
-        cfi = get_cfi(path)
-        if not cfi:
-            return None
+class ProcessWrapper:
+    def __init__(self, fct):
+        self._fct = fct
+
+    def __call__(self, elf_descr):
+        try:
+            path, elftype = elf_descr
+            print("Processing {}".format(path))
+
+            cfi = get_cfi(path)
+            if not cfi:
+                return None
+            return self._fct(path, elftype, cfi)
+        except DWARFError:
+            return None
+
+
+def process_wrapper(fct):
+    return ProcessWrapper(fct)
+
+
+@process_wrapper
+def process_elf(path, elftype, cfi):
+    ''' Process a single file '''
     data = FdeData()
     for entry in cfi:
         if isinstance(entry, callframe.CIE):  # Is a CIE
-            self.process_cie(entry, data)
+            process_cie(entry, data)
         elif isinstance(entry, callframe.FDE):  # Is a FDE
-            self.process_fde(entry, data)
-        out = SingleFdeData(path, elftype, data)
-        self.shared_queue.put(out)
+            process_fde(entry, data)
+    return SingleFdeData(path, elftype, data)

-    def incr_cell(self, table, key):
+def incr_cell(table, key):
     ''' Increments table[key], or sets it to 1 if unset '''
     if key in table:
         table[key] += 1
     else:
         table[key] = 1


-    def process_cie(self, cie, data):
+def process_cie(cie, data):
     ''' Process a CIE '''
     pass  # Nothing needed from a CIE


-    def process_fde(self, fde, data):
+def process_fde(fde, data):
     ''' Process a FDE '''
     data.fde_count += 1
     decoded = fde.get_decoded()
     row_count = len(decoded.table)
-    self.incr_cell(data.fde_with_lines, row_count)
+    incr_cell(data.fde_with_lines, row_count)
     for row in decoded.table:
-        self.process_reg(data.regs.cfa, row['cfa'])
+        process_reg(data.regs.cfa, row['cfa'])
         for entry in row:
             if isinstance(entry, int):
-                self.process_reg(data.regs.regs[entry], row[entry])
+                process_reg(data.regs.regs[entry], row[entry])


-    def process_reg(self, out_reg, reg_def):
+def process_reg(out_reg, reg_def):
     ''' Process a register '''
     if isinstance(reg_def, callframe.CFARule):
         if reg_def.reg is not None:
@@ -96,7 +83,7 @@ class FilesProcessor(multiprocessing.Process):
         else:
             pass  # TODO exprs
     else:
-        self.incr_cell(out_reg.instrs, DwarfInstr.of_pyelf(reg_def.type))
+        incr_cell(out_reg.instrs, DwarfInstr.of_pyelf(reg_def.type))
         if reg_def.type == callframe.RegisterRule.REGISTER:
             out_reg.regs[reg_def.arg] += 1
         elif (reg_def.type == callframe.RegisterRule.EXPRESSION) \
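The move from a multiprocessing.Process subclass to a plain callable class is what makes the executor.map rewrite below workable: ProcessPoolExecutor pickles the callable it ships to worker processes, and an instance of a module-level class pickles cleanly while a closure does not. A minimal sketch of that constraint, with hypothetical names independent of this repo:

    import pickle

    class Wrapper:
        ''' Picklable: the class is importable, the state is a plain dict '''
        def __init__(self, fct):
            self._fct = fct

        def __call__(self, arg):
            return self._fct(arg)

    def double(x):  # module-level function: pickled by reference to its name
        return 2 * x

    pickle.dumps(Wrapper(double))  # works

    def make_closure(fct):
        def inner(arg):  # local function: pickle raises AttributeError
            return fct(arg)
        return inner

    # pickle.dumps(make_closure(double))  # would fail

One caveat with this pattern: pickling by reference requires the module-level name to still point at the wrapped function, so a decorator that rebinds the name (as @process_wrapper does here) can trip pickling when the instance is sent to a worker.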
@@ -106,17 +93,6 @@
 def gather_system_files(config, sample_size=None):
     stats_accu = StatsAccumulator()
-    processors = []
-
-    def signal_graceful_exit(sig, frame):
-        ''' Stop gracefully now '''
-        nonlocal processors
-        print("Stopping after this ELF…")
-        for processor in processors:
-            processor.stop_processing_now()
-
-    signal.signal(signal.SIGINT, signal_graceful_exit)

     elf_list = []
     for elf_path in system_elfs():
@@ -126,46 +102,46 @@ def gather_system_files(config, sample_size=None):
         elf_list_sampled = random.sample(elf_list, sample_size)
         elf_list = elf_list_sampled

-    elf_count = len(elf_list)
-    elf_per_process = elf_count // config.cores
-    elf_list_slices = []
-    for i in range(config.cores - 1):
-        elf_list_slices.append(
-            elf_list[i * elf_per_process : (i+1) * elf_per_process])
-    elf_list_slices.append(
-        elf_list[(config.cores - 1) * elf_per_process
-                 : config.cores * elf_per_process])
-
-    shared_queue = multiprocessing.Queue(elf_count)
-    for elf_range in elf_list_slices:
-        processors.append(FilesProcessor(elf_range, shared_queue))
-
     if config.cores > 1:
-        for processor in processors:
-            processor.start()
-
-        while True:
-            for processor in processors:
-                if processor.is_alive():
-                    print("== Waiting {} ({} {}) ==".format(
-                        processor.name, processor.exitcode,
-                        processor.is_alive()))
-                    processor.join(timeout=1)
-                    if processor.exitcode is None:
-                        break  # Loop around
-                    print("== Joined {} ==".format(processor.name))
-
-            terminated = True
-            for processor in processors:
-                if processor.exitcode is None:
-                    terminated = False
-            if terminated:
-                break
+        with concurrent.futures.ProcessPoolExecutor(max_workers=config.cores)\
+                as executor:
+            for fde in executor.map(process_elf, elf_list):
+                stats_accu.add_fde(fde)
     else:
-        processors[0].run()  # run(), not start(): in the same thread
-
-    while not shared_queue.empty():  # Reliable because everything is joined
-        stats_accu.add_fde(shared_queue.get_nowait())
+        for elf in elf_list:
+            stats_accu.add_fde(process_elf(elf))

     return stats_accu

+def map_system_files(mapper, sample_size=None, cores=None, include=None,
+                     elflist=None):
+    ''' `mapper` must take (path, elf_type, cfi) '''
+    if cores is None:
+        cores = 1
+    if include is None:
+        include = []
+
+    mapper = process_wrapper(mapper)
+
+    if elflist is None:
+        elf_list = []
+        for elf_path in system_elfs():
+            elf_list.append(elf_path)
+
+        if sample_size is not None:
+            elf_list_sampled = random.sample(elf_list, sample_size)
+            elf_list = elf_list_sampled
+
+        elf_list += list(map(lambda x: (x, None), include))
+    else:
+        elf_list = elflist
+
+    if cores > 1:
+        with concurrent.futures.ProcessPoolExecutor(max_workers=cores)\
+                as executor:
+            out = executor.map(mapper, elf_list)
+    else:
+        out = map(mapper, elf_list)
+
+    return out, elf_list
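map_system_files generalizes the gathering loop: any mapper taking (path, elf_type, cfi) can be fanned out over the system's ELFs. A hedged usage sketch (count_fdes and the sample size are the editor's invention, not from the commit):

    import gather_stats
    from elftools.dwarf import callframe

    def count_fdes(path, elf_type, cfi):
        return sum(1 for entry in cfi if isinstance(entry, callframe.FDE))

    # default cores=1 avoids the __main__ guard spawn-based platforms require
    results, elf_list = gather_stats.map_system_files(count_fdes,
                                                      sample_size=20)
    for (path, _), count in zip(elf_list, results):
        print(path, count)  # count is None where get_cfi failed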

stats/helpers.py (new file): 228 added lines

@@ -0,0 +1,228 @@
from elftools.dwarf import callframe
import gather_stats
import itertools
import functools
REGS_IDS = {
'RAX': 0,
'RDX': 1,
'RCX': 2,
'RBX': 3,
'RSI': 4,
'RDI': 5,
'RBP': 6,
'RSP': 7,
'R8': 8,
'R9': 9,
'R10': 10,
'R11': 11,
'R12': 12,
'R13': 13,
'R14': 14,
'R15': 15,
'RIP': 16
}
ID_TO_REG = [
'RAX',
'RDX',
'RCX',
'RBX',
'RSI',
'RDI',
'RBP',
'RSP',
'R8',
'R9',
'R10',
'R11',
'R12',
'R13',
'R14',
'R15',
'RIP',
]
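# (Editor's note, not part of the commit.) These tables follow the x86-64
# System V DWARF register numbering, in which number 16 is the return
# address column; REGS_IDS and ID_TO_REG must stay mutually inverse:
# assert all(REGS_IDS[name] == idx for idx, name in enumerate(ID_TO_REG))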
HANDLED_REGS = list(map(lambda x: REGS_IDS[x], [
'RIP',
'RSP',
'RBP',
'RBX',
]))
ONLY_HANDLED_REGS = True  # only analyze the columns of handled registers
PLT_EXPR = [119, 8, 128, 0, 63, 26, 59, 42, 51, 36, 34]  # Handled expression
def accumulate_regs(reg_list):
out = [0] * 17
for lst in reg_list:
for pos in range(len(lst)):
out[pos] += lst[pos]
return out
def filter_none(lst):
for x in lst:
if x:
yield x
def deco_filter_none(fct):
def wrap(lst):
return fct(filter_none(lst))
return wrap
class FdeProcessor:
def __init__(self, fct, reducer=None):
self._fct = fct
self._reducer = reducer
def __call__(self, path, elftype, cfi):
out = []
for entry in cfi:
if isinstance(entry, callframe.FDE):
decoded = entry.get_decoded()
out.append(self._fct(path, entry, decoded))
if self._reducer is not None and len(out) >= 2:
out = [self._reducer(out)]
return out
class FdeProcessorReduced:
def __init__(self, reducer):
self._reducer = reducer
def __call__(self, fct):
return FdeProcessor(fct, self._reducer)
def fde_processor(fct):
return FdeProcessor(fct)
def fde_processor_reduced(reducer):
return FdeProcessorReduced(reducer)
def is_handled_expr(expr):
if expr == PLT_EXPR:
return True
if len(expr) == 2 and 0x70 <= expr[0] <= 0x89:
if expr[0] - 0x70 in HANDLED_REGS:
return True
return False
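
# (Editor's sketch, not part of the commit.) The 0x70..0x89 window matches
# DW_OP_breg0..DW_OP_breg25: opcode 0x70 + n computes "register n plus a
# SLEB128 offset", so a two-byte expression is a plain register+offset rule.
# Decoding it, assuming a one-byte non-negative offset:
def _decode_breg(expr):
    op, offset = expr
    return ID_TO_REG[op - 0x70], offset

# _decode_breg([0x77, 8]) == ('RSP', 8), i.e. the CFA-style rule "RSP + 8"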
# @fde_processor
def find_non_cfa(path, fde, decoded):
regs_seen = 0
non_handled_regs = 0
non_handled_exp = 0
cfa_dat = [0, 0] # Seen, expr
rule_type = {
callframe.RegisterRule.UNDEFINED: 0,
callframe.RegisterRule.SAME_VALUE: 0,
callframe.RegisterRule.OFFSET: 0,
callframe.RegisterRule.VAL_OFFSET: 0,
callframe.RegisterRule.REGISTER: 0,
callframe.RegisterRule.EXPRESSION: 0,
callframe.RegisterRule.VAL_EXPRESSION: 0,
callframe.RegisterRule.ARCHITECTURAL: 0,
}
problematic_paths = set()
for row in decoded.table:
for entry in row:
reg_def = row[entry]
if entry == 'cfa':
cfa_dat[0] += 1
if reg_def.expr:
cfa_dat[1] += 1
if not is_handled_expr(reg_def.expr):
non_handled_exp += 1
problematic_paths.add(path)
elif reg_def:
if reg_def.reg not in HANDLED_REGS:
non_handled_regs += 1
problematic_paths.add(path)
if not isinstance(entry, int): # CFA or PC
continue
if ONLY_HANDLED_REGS and entry not in HANDLED_REGS:
continue
rule_type[reg_def.type] += 1
reg_rule = reg_def.type
if reg_rule in [callframe.RegisterRule.OFFSET,
callframe.RegisterRule.VAL_OFFSET]:
regs_seen += 1 # CFA
elif reg_rule == callframe.RegisterRule.REGISTER:
regs_seen += 1
if reg_def.arg not in HANDLED_REGS:
problematic_paths.add(path)
non_handled_regs += 1
elif reg_rule in [callframe.RegisterRule.EXPRESSION,
callframe.RegisterRule.VAL_EXPRESSION]:
expr = reg_def.arg
if not is_handled_expr(reg_def.arg):
problematic_paths.add(path)
with open('/tmp/exprs', 'a') as handle:
handle.write('[{} - {}] {}\n'.format(
path, fde.offset,
', '.join(map(lambda x: hex(x), expr))))
non_handled_exp += 1
return (regs_seen, non_handled_regs, non_handled_exp, rule_type, cfa_dat,
problematic_paths)
def reduce_non_cfa(lst):
def merge_dict(d1, d2):
for x in d1:
d1[x] += d2[x]
return d1
def merge_list(l1, l2):
out = []
for pos in range(len(l1)): # Implicit assumption len(l1) == len(l2)
out.append(l1[pos] + l2[pos])
return out
def merge_elts(accu, elt):
accu_regs, accu_nh, accu_exp, accu_rt, accu_cfa, accu_paths = accu
elt_regs, elt_nh, elt_exp, elt_rt, elt_cfa, elf_paths = elt
return (
accu_regs + elt_regs,
accu_nh + elt_nh,
accu_exp + elt_exp,
merge_dict(accu_rt, elt_rt),
merge_list(accu_cfa, elt_cfa),
accu_paths.union(elf_paths),
)
return functools.reduce(merge_elts, lst)
@deco_filter_none
def flatten_non_cfa(result):
flat = itertools.chain.from_iterable(result)
out = reduce_non_cfa(flat)
out_cfa = {
'seen': out[4][0],
'expr': out[4][1],
'offset': out[4][0] - out[4][1],
}
out = (out[0],
(out[1], out[0] + out_cfa['offset']),
(out[2], out[3]['EXPRESSION'] + out_cfa['expr']),
out[3],
out_cfa,
out[5])
return out
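helpers.py is meant to be driven through map_system_files. A plausible end-to-end pipeline, assuming this wiring (the commit does not show the actual call site):

    import gather_stats
    import helpers

    # Wrap find_non_cfa so each file's FDE results are merged worker-side,
    # then fold everything with flatten_non_cfa.
    mapper = helpers.fde_processor_reduced(helpers.reduce_non_cfa)(
        helpers.find_non_cfa)
    results, elfs = gather_stats.map_system_files(mapper, sample_size=50)
    summary = helpers.flatten_non_cfa(results)
    print(summary)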

Changed file (name not captured; the pyelftools_overlay module):

@@ -6,6 +6,11 @@ from stats_accu import ElfType
 import os

+ELF_BLACKLIST = [
+    '/usr/lib/libavcodec.so',
+]
+

 def get_cfi(path):
     ''' Get the CFI entries from the ELF at the provided path '''
@@ -14,6 +19,7 @@ def get_cfi(path):
         elf_file = ELFFile(file_handle)

         if not elf_file.has_dwarf_info():
+            print("No DWARF")
             return None

         dw_info = elf_file.get_dwarf_info()
@@ -22,12 +28,19 @@ def get_cfi(path):
         elif dw_info.has_EH_CFI():
             cfis = dw_info.EH_CFI_entries()
         else:
+            print("No CFI")
             return None
     except ELFError:
+        print("ELF Error")
         return None
     except DWARFError:
+        print("DWARF Error")
         return None
     except PermissionError:
+        print("Permission Error")
+        return None
+    except KeyError:
+        print("Key Error")
         return None

     return cfis
@@ -70,6 +83,9 @@ def system_elfs():
             continue

         canonical_name = readlink_rec(direntry.path)
+        for blacked in ELF_BLACKLIST:
+            if canonical_name.startswith(blacked):
+                continue
         if canonical_name in seen_elfs:
             continue
@@ -79,10 +95,16 @@ def system_elfs():
                 magic_bytes = handle.read(4)
                 if magic_bytes != b'\x7fELF':
                     valid_elf = False
+                elf_class = handle.read(1)
+                if elf_class != b'\x02':  # ELF64
+                    valid_elf = False
         except Exception:
             continue
         if not valid_elf:
             continue

+        if not os.path.isfile(canonical_name):
+            continue
+
         seen_elfs.add(canonical_name)
         yield (canonical_name, elftype)
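The added check reads one byte past the magic: offset 4 of an ELF header is EI_CLASS, and the value 2 (ELFCLASS64) marks a 64-bit binary. A standalone sketch of the same test:

    def is_elf64(path):
        ''' True when path starts with the ELF magic and is ELFCLASS64 '''
        with open(path, 'rb') as handle:
            return handle.read(4) == b'\x7fELF' and handle.read(1) == b'\x02'

    # is_elf64('/bin/ls') is True on a typical 64-bit system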

Changed file (name not captured; pip requirements):

@@ -1,2 +1 @@
 git+https://github.com/eliben/pyelftools
-git+https://github.com/uqfoundation/pathos

Changed file (name not captured; the stats_accu module):

@@ -239,6 +239,7 @@ class StatsAccumulator:
         self.fdes = []

     def add_fde(self, fde_data):
+        if fde_data:
             self.fdes.append(fde_data)

     def get_fdes(self):
@@ -250,7 +251,6 @@ class StatsAccumulator:
     def dump(self, path):
         dict_form = [fde.dump() for fde in self.fdes]
-        print(dict_form)
         with open(path, 'w') as handle:
             handle.write(json.dumps(dict_form))
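With the debugging print gone, dump writes only the JSON list, one entry per accumulated record. A minimal way to inspect a dump (the path is an example):

    import json

    with open('gathered/sample.json') as handle:  # hypothetical output path
        records = json.load(handle)
    print(len(records), "records")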