diff --git a/frame-machine.cc b/frame-machine.cc
new file mode 100644
index 0000000..5827522
--- /dev/null
+++ b/frame-machine.cc
@@ -0,0 +1,482 @@
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <ucontext.h>
+#include <dlfcn.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cstdint>
+#include <cassert>
+#include <functional>
+
+/* How can we "compile" the DWARF CFA instructions into C++ code?
+ * Perhaps the right thing to do is expand into "table" form first.
+ * Compiled code just needs to
+ * (1) use the PC to index into the table, and
+ * (2) evaluate the expression it finds there
+ * (either one of the fixed-form expressions in the usual CFA encodings,
+ * or a full DWARF expression).
+ *
+ * One thing to generate would simply be a (giant) switch statement,
+ * one label per address.
+ * Another, perhaps less efficient, would be a cascading if--else;
+ * this can be generated from templates, unlike the switch.
+ * How do we encode a table into a bunch of templates
+ * so that it compiles to fast code? We can either work from CFA operations like these:
+
+	DW_CFA_advance_loc
+	DW_CFA_advance_loc1
+	DW_CFA_advance_loc2
+	DW_CFA_def_cfa
+	DW_CFA_def_cfa_expression (DW_OP_breg7 (rsp) ...)
+	DW_CFA_def_cfa_offset
+	DW_CFA_def_cfa_register
+	DW_CFA_nop
+	DW_CFA_offset
+	DW_CFA_remember_state
+	DW_CFA_restore
+	DW_CFA_restore_state
+
+   ... or we can work from a decoded table.
+
+00003b80 0000000000000024 00003b84 FDE cie=00000000 pc=000000000002d8e6..000000000002dc87
+       LOC          CFA      rbx   rbp   ra
+000000000002d8e6 rsp+8    u     u     c-8
+000000000002d8e7 rsp+16   u     c-16  c-8
+000000000002d8ea rbp+16   u     c-16  c-8
+000000000002d8f2 rbp+16   c-24  c-16  c-8
+000000000002dc86 rsp+8    c-24  c-16  c-8
+
+   What's the object that we want to get out? We're doing this:
+
+	static int walk_stack(Cursor& c, const Action& a = Action())
+	{
+		a.act(c);
+		if (!ValidityCheck::check(c)) abort();
+		int err = StepRoutine::step(a, c);
+		if (err) return 1;
+		void *lower, *upper;
+		int (*next_walker)(Cursor&, const Action &) = Dispatcher::lookup(a, c, &lower, &upper);
+		return next_walker(c, a);
+	}
+
+   ... so we want an implementation of some or all of
+
+	int walk_stack(Cursor& c, const Action& a = Action())
+	StepRoutine::step(a, c);
+	Dispatcher::lookup(a, c, &lower, &upper);
+
+   ... where our blind frame-pointer walker does
+	the above walk_stack
+	a step that is basically
+		c.rsp() = (uint64_t) (((void**)c.rbp()) + 2);
+		c.rip() = (uint64_t) *(((void**)c.rbp()) + 1);
+		c.rbp() = (uint64_t) *((void**)c.rbp());
+	a lookup that just returns the same walker.
+
+   So we're allowed to generate
+	... a walk_stack that is optimised for particular frames
+	... a stepper that encodes the CFA table (as a function from context to context)
+	... or just a row in the CFA table?
+	... or just a *cell* in the CFA table? no -- must be at least a row
+	... a dispatcher that is optimised for particular frames and walkers (and may do caching).
+
+   Remember the inline caching idea: each function has its own stepper,
+   and may also have its own dispatcher whose code is an inline-cached branch
+   to the next walker / stepper. The branch is guarded by tests on the next
+   rip -- that it is within the range expected. So the branch is specialised to
+   a particular caller. Could any other flavour of guard be efficient?
+   Maybe whether the caller is in the same object? i.e. instead of a "blind"
+   walker which assumes the same walker/dispatcher for all parent frames,
+   a "DSO-homogeneous" walker which quickly checks that the caller is in the same DSO,
+   and avoids the indirect call if so? This is only useful if we *can* avoid the
+   indirect call, so for generic steppers rather than per-frame-compiled steppers.
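+
+   To make the "giant switch" flavour concrete, here is a hand-written sketch
+   (not generator output) of a stepper for the example FDE above, using GNU
+   case ranges over rows rather than one label per address:
+
+	// hypothetical generated stepper for pc=2d8e6..2dc87
+	switch ((uintptr_t) c.rip())
+	{
+		case 0x2d8e6:			// row 1: CFA=rsp+8; ra at c-8, i.e. at rsp
+			c.rip() = *(uint64_t*) c.rsp();
+			c.rsp() += 8;		// new rsp = CFA
+			break;
+		case 0x2d8f2 ... 0x2dc85: {	// row 4: CFA=rbp+16; rbx c-24, rbp c-16, ra c-8
+			uint64_t cfa = c.rbp() + 16;
+			c.rip() = *(uint64_t*)(cfa - 8);
+			c.rbp() = *(uint64_t*)(cfa - 16);	// (rbx from cfa-24, if we track it)
+			c.rsp() = cfa;
+			break;
+		}
+		// ... and so on for the other rows
+	}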
+
+   Another avenue is to think of each row in the CFA table as a stepper.
+   The frame-pointer stepper is simply a fixed row -- the caller's rsp is
+   rbp+16, ra is at 8(%rbp), and the saved rbp at 0(%rbp).
+   So maybe
+	dispatchers are per-DSO or at least coarser-grained things
+	steppers are simply rows in the table and look like the c.rsp() = c....() + ...; code above
+	an eh_frame unwinder could use a memtable-like lookup
+	... maybe coarse-grained by looking at bit prefixes?
+	    i.e. a longest-prefix-match scheme? YES but how to memtablify this?
+	    one memtable per prefix length?
+ */
+
+/* What would an efficient dispatcher look like?
+ * Space-wise, a typical eh_frame section is about 10% the size of a .text section.
+ * Logically, what we want is a big jump table keyed on the program counter.
+ * Each entry in the table points to an unwind routine,
+ * i.e. code which twiddles registers.
+ * Hypothesis: the routines themselves can be deduplicated aggressively,
+ * because they're simple things like "*(rbp + 8)" or "rsp - 16" or whatever.
+ * Even when backwalking many registers, they should dedup okay.
+ *
+ * How can we build the table itself efficiently?
+ * Say we scale the instruction pointer down by 16 bytes,
+ * and have two bytes in the table for each such window. That's a 12.5% space overhead.
+ * Each two bytes is an offset (perhaps re-up-scaled) into the unwind routine table.
+ * Two bytes should be enough if we really did aggressively deduplicate the routines...
+ * If the 16-byte window has ambiguity, we need to point into a more complex,
+ * ambiguity-resolving routine... but hope that we don't need many of those.
+ * To unwind "live" regs, we callq table+((offset_table[$rip])*scale)
+ * ... though do we want a separate mcontext_t-based version?
+ * NOTE that getcontext/setcontext *should* be non-trapping, user-level-only operations.
+ */
+
+/* What would a fast "unwind compiler" for stack frames look like?
+ *
+ * It has to encode the fact that different registers may be known.
+ *
+ * So, for example, a frame-pointer unwinder
+ * requires an initial RBP, RSP and RIP,
+ * and yields a next-frame RBP, RSP and RIP.
+ *
+ * We might want to use a bitmap of "knownness", but then
+ * we can't have the C++ template "more specific" rule
+ * do the right thing.
+ *
+ * Instead we can use something like the following.
+ */
+
+/* At run time, we want our stack walker to look like the following.
+ *
+ * - Snapshot initial machine state (say rsp, rip).
+ * - Invoke stepper, yielding higher-frame state.
+ * - Use rip to look up new stepper.
+ * - Invoke new stepper and repeat.
+ *
+ * - Can we have each stepper cache one or more "next steppers"?
+ *   YES. This could be an inline cache.
+ *   We want the cache-miss case to go out-of-line.
+ *   Then a frame-pointer unwinder does no lookup; it just says "use fp stepper!".
+ *   This should go as fast as our existing fake-libunwind stepper in liballocs.
+ *   We need to pass the lookup function as a template argument,
+ *   in order for this to get elaborated down to direct, inlinable calls.
+ */
+
+typedef int cb_t(void *sp, void *ip, void *arg);
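+
+/* A compilable sketch of the dispatcher described above. The tables and
+ * text_base are hypothetical and left unpopulated here, and an array of
+ * function pointers stands in for the "callq table+(offset*scale)" into one
+ * blob of concatenated, deduplicated routines. */
+typedef void unwind_routine(mcontext_t&);
+extern unwind_routine *routine_table[];  // deduplicated unwind routines
+extern uint16_t offset_table[];          // two bytes per 16-byte window of text
+extern char text_base[];                 // base of the mapped .text
+
+static inline unwind_routine *lookup_unwind_routine(void *rip)
+{
+	// scale the instruction pointer down by 16 to get the window index
+	uintptr_t window = ((uintptr_t) rip - (uintptr_t) text_base) >> 4;
+	return routine_table[offset_table[window]];
+}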
+
+/* This is a generic walker
+ * which implements a recursive walk with caching,
+ * and uses the template arguments to control
+ * cache size, step logic, step validity guard, and next-stepper dispatch.
+ *
+ * Are we caching in the right place?
+ * Our stack_walker fixes a StepRoutine
+ * but can delegate to a different stack walker, built from a different StepRoutine.
+ * So if using DWARF, we generate one stack_walker per function;
+ * each one has a validity check,
+ * a cache of zero or more "likely next walkers",
+ * and a dispatcher for deciding the actual next walker.
+ *
+ * What's wrong with this:
+ * - we generate per-function *walk* code, not merely per-function step code
+ * - caching logically belongs with the dispatcher
+ * - the callback is dispatched at run time
+ * - the representations of machine state and validity are fixed at mcontext_t
+ *
+ * What we really want is
+ * - to generate per-function step-and-dispatch
+ * - i.e. part of the "context" that the stepper gives us back
+ *   is the *next* stepper
+ *   -- which it could have got from a (per-stepper, hence per-function) cache,
+ *   or just by "blind" selection.
+ */
+
+// template <typename Cursor, typename Stepper, typename Action>
+// struct stack_walker
+// {
+// 	int walk_stack(const Cursor &cursor = Cursor())
+// 	{
+// 		Action::act(cursor);
+// 		step_fn *next = Stepper::step(cursor); // this might yield a *new* stepper,
+// 		// but we want to allow statically fixing the stepper by inlining
+// 		walk_stack_from(cursor);
+// 	}
+// };
+
+struct mcursor
+{
+	mcontext_t state;
+	mcontext_t validity;
+	long long int& rip() { return state.gregs[REG_RIP]; }
+	long long int& rsp() { return state.gregs[REG_RSP]; }
+	long long int& rbp() { return state.gregs[REG_RBP]; }
+};
+
+template <unsigned CacheSize, typename Cursor, typename StepRoutine,
+	typename ValidityCheck, typename Dispatcher, typename Action>
+struct caching_stack_walker
+{
+	static int walk_stack(Cursor& c, const Action& a = Action())
+	{
+		int (*my_addr)(Cursor&, const Action&) = walk_stack;
+
+		/* Call the callback first. */
+		a.act(c);
+
+		/* Check we have the validity to run the step routine. */
+		if (!ValidityCheck::check(c)) abort();
+
+		int err = StepRoutine::step(a, c);
+		if (err) return 1;
+
+		/* Delegate: we check the cache first. */
+		static struct
+		{
+			//std::function<int(Cursor&, const Action&)> walker;
+			int (*walker)(Cursor&, const Action &);
+			void *lower;
+			void *upper;
+		} cache[CacheSize];
+		static unsigned cache_mru;
+
+		for (unsigned i = 0; i < CacheSize; ++i)
+		{
+			if ((void*) c.rip() >= cache[i].lower && (void*) c.rip() < cache[i].upper
+				&& cache[i].walker)
+			{
+				cache_mru = i;
+				return cache[i].walker(c, a);
+			}
+		}
+
+		/* If the cache fails, call the Dispatcher to indirect-call the next stepper.
+		 * How does the validity factor in?
+		 * It's just a check: whatever stepper we look up, it'll have some
+		 * validity requirements, and it'll check them. However, since
+		 * the Dispatcher call *might* be inlined, we can propagate the validity
+		 * information through template arguments and eliminate the check
+		 * in the frame-pointer case.
+		 */
+		void *lower;
+		void *upper;
+		//std::function<int(Cursor&, const Action&)> next_walker
+		int (*next_walker)(Cursor&, const Action &)
+			= Dispatcher::lookup(a, c, &lower, &upper);
+
+		/* Promise the compiler that the dispatcher always hands back this
+		 * same walker, so the tail call below can become direct. Only sound
+		 * for self-dispatching instantiations such as the blind
+		 * frame-pointer one below. */
+		if (next_walker != my_addr) __builtin_unreachable();
+
+		/* Fill the next cache slot round-robin. */
+		cache_mru = (cache_mru + 1) % CacheSize;
+		cache[cache_mru] = { next_walker, lower, upper };
+
+		return next_walker(c, a);
+	}
+};
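+
+/* An aside, to make "steppers are simply rows" concrete: a hypothetical
+ * stepper template parameterised on frame-pointer-relative offsets. Nothing
+ * below instantiates it; blind_frame_pointer_stepper (below) is morally
+ * fixed_row_stepper<16, 8, 0> plus sanity checks. */
+template <int SpOff, int RaOff, int BpOff>
+struct fixed_row_stepper
+{
+	template <typename Action>
+	static int step(const Action& a, mcursor& c)
+	{
+		(void) a;
+		char *bp = (char*) c.rbp();                 // read before we clobber rbp
+		c.rsp() = (uint64_t) (bp + SpOff);          // caller's stack pointer
+		c.rip() = (uint64_t) *(void**)(bp + RaOff); // return address
+		c.rbp() = (uint64_t) *(void**)(bp + BpOff); // caller's frame pointer
+		return 0;
+	}
+};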
+
+template <typename Cursor, typename StepRoutine, typename ValidityCheck,
+	typename Dispatcher, typename Action>
+struct stack_walker
+{
+	static int walk_stack(Cursor& c, const Action& a = Action())
+	{
+		//int (*my_addr)(Cursor&, const Action&) = walk_stack;
+
+		/* Call the callback first. */
+		a.act(c);
+
+		/* Check we have the validity to run the step routine. */
+		if (!ValidityCheck::check(c)) abort();
+
+		int err = StepRoutine::step(a, c);
+		if (err) return 1;
+
+		/* Call the Dispatcher to indirect-call the next stepper. */
+		void *lower;
+		void *upper;
+		//std::function<int(Cursor&, const Action&)> next_walker
+		int (*next_walker)(Cursor&, const Action &)
+			= Dispatcher::lookup(a, c, &lower, &upper);
+
+		//if (next_walker != my_addr) __builtin_unreachable();
+
+		return next_walker(c, a);
+	}
+};
+
+/* Now we instantiate this with the stupidest-but-fastest stack walker
+ * around: the blind frame-pointer stepper. */
+
+struct blind_frame_pointer_validity_checker
+{
+	static bool check(mcursor& c) { return true; }
+};
+
+struct blind_frame_pointer_dispatcher; // see below
+
+struct blind_frame_pointer_stepper
+{
+	template <typename Action>
+	static int step(const Action& a, mcursor& c)
+	{
+		// does our bp look sane?
+		if ((uintptr_t) c.rbp() < (uintptr_t) c.rsp()
+			|| (uintptr_t) c.rbp() - (uintptr_t) c.rsp() > 0x10000000ul)
+		{
+			// looks dodgy
+			return -1;
+		}
+		// FIXME: if bp == sp, perhaps that means we should stop? or mark bp invalid?
+
+		// the next-higher ip is the return addr of the frame, i.e. 8(%rbp) on x86-64
+		void *return_addr = *(((void**)c.rbp()) + 1);
+		void *sp = (((void**)c.rbp()) + 2);
+		void *bp = *((void**)c.rbp());
+
+		// error detection/handling -- communicate using return value
+		if ((uintptr_t) sp <= (uintptr_t) c.rsp()
+			|| (uintptr_t) sp - (uintptr_t) c.rsp() > 0x10000000ul)
+		{
+			// looks dodgy
+			return -1;
+		}
+
+		// don't check bp here -- if it's bad, rsp and rip might still be good
+		c.rsp() = (uint64_t) sp;
+		c.rbp() = (uint64_t) bp;
+		c.rip() = (uint64_t) return_addr;
+		return 0;
+	}
+
+#define SANE_BP_OR_NULL(bp, sp) \
+	(((char*) (bp) > (char*) (sp) && ((char*) (bp) - (char*) (sp)) < 0x10000) \
+		? (void*) (bp) \
+		: 0)
+
+#pragma GCC push_options
+#pragma GCC optimize ("no-omit-frame-pointer")
+	__attribute__((noinline,visibility("protected")))
+	static int initcontext(mcursor& c)
+	{
+		/* The initial state of the cursor should be such that
+		 * if the caller does
+		 *
+		 *    getcontext(...)
+		 *
+		 * and then reads state->rsp,
+		 * they get their own stack pointer. */
+		void *caller_sp_minus_two_words = __builtin_frame_address(0);
+		void *caller_bp;
+		void *caller_sp;
+		void *current_return_addr;
+
+		assert(caller_sp_minus_two_words != NULL);
+		current_return_addr = __builtin_return_address(0);
+		/* We get the old base pointer by dereferencing the addr found at 0(%rbp). */
+		caller_bp = *((void**)caller_sp_minus_two_words);
+		assert(caller_bp != 0);
+		/* We get the caller's stack pointer by taking our frame address and
+		 * skipping past the saved rbp and the return addr (two words). */
+		caller_sp = (((void**)caller_sp_minus_two_words) + 2);
+
+		c.rsp() = (uint64_t) caller_sp;
+		c.validity.gregs[REG_RSP] = 1;
+		c.rbp() = (uint64_t) SANE_BP_OR_NULL(caller_bp, caller_sp);
+		c.validity.gregs[REG_RBP] = (c.rbp()) ? 1 : 0;
+		c.rip() = (uint64_t) current_return_addr;
+		c.validity.gregs[REG_RIP] = 1;
+
+		return 0;
+	}
+#pragma GCC pop_options
+
+#undef SANE_BP_OR_NULL
+};
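+
+/* A sketch of a less blind validity check, consulting the "knownness" bits
+ * that initcontext (above) maintains in mcursor::validity. Hypothetical;
+ * nothing below instantiates it. */
+struct fp_validity_checker
+{
+	static bool check(mcursor& c)
+	{
+		return c.validity.gregs[REG_RSP]
+			&& c.validity.gregs[REG_RBP]
+			&& c.validity.gregs[REG_RIP];
+	}
+};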
+
+struct blind_frame_pointer_dispatcher
+{
+	// typedef int (*walker_fn)(Cursor&, const Action&);
+	template <typename Cursor, typename Action>
+	static
+	//std::function<int(Cursor&, const Action&)>
+	int
+	(*lookup (const Action& a, Cursor &c, void **out_lower, void **out_upper))
+	(Cursor&, const Action&)
+	{
+		/* An empty [lower, upper) range never matches in the cache, but the
+		 * blind walker doesn't care: every rip gets the same walker. */
+		*out_lower = nullptr;
+		*out_upper = nullptr;
+		return &caching_stack_walker<
+			1u /* CacheSize */,
+			Cursor,
+			blind_frame_pointer_stepper /* StepRoutine */,
+			blind_frame_pointer_validity_checker /* ValidityCheck */,
+			blind_frame_pointer_dispatcher /* Dispatcher */,
+			Action
+			>::walk_stack;
+	}
+};
+
+struct print_frame_action
+{
+	int act(mcursor& c) const
+	{
+		Dl_info i;
+		int success = dladdr((void *) c.rip(), &i);
+		printf("sp=%p %s (%p)\n", (void*) c.rsp(),
+			(success && i.dli_sname) ? i.dli_sname : "(no symbol)",
+			(void*) c.rip());
+		return 0;
+	}
+};
+struct do_nothing_action
+{
+	int act(mcursor& c) const
+	{
+		return 0;
+	}
+};
+
+int g(int x)
+{
+	//mcursor c;
+	//int ret = getcontext(&context); /* FIXME: want a stepper-specific "initial state" */
+	int ret;// = blind_frame_pointer_stepper::initcontext(c);
+	//if (ret) abort();
+	//caching_stack_walker<
+	//	1u /* CacheSize */,
+	//	mcursor,
+	//	blind_frame_pointer_stepper /* StepRoutine */,
+	//	blind_frame_pointer_validity_checker /* ValidityCheck */,
+	//	blind_frame_pointer_dispatcher /* Dispatcher */,
+	//	print_frame_action /* Action */
+	//>::walk_stack(c);
+
+	mcursor c_2;
+	ret = x;
+	ret = blind_frame_pointer_stepper::initcontext(c_2);
+	if (ret) abort();
+	return
+		stack_walker<
+			mcursor,
+			blind_frame_pointer_stepper /* StepRoutine */,
+			blind_frame_pointer_validity_checker /* ValidityCheck */,
+			blind_frame_pointer_dispatcher /* Dispatcher */,
+			print_frame_action /* Action */
+		>::walk_stack(c_2);
+}
+int do_useless_stuff(int rec_depth) {
+	if (rec_depth <= 0) {
+		g(rec_depth);
+		return 1;
+	}
+	int result = rec_depth * do_useless_stuff(rec_depth - 1);
+	int values[] = {0, 0, 0, result};
+	return values[3];
+}
+void f()
+{
+	if (do_useless_stuff(10) == 42) {
+		puts("This never happens, but it keeps gcc from optimising the calls away :>");
+	}
+}
+int main()
+{
+	/* Unwind some shizzle. */
+	f();
+
+	return 0;
+}
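+
+/* Build note (an assumption -- this patch carries no build script): something
+ * like
+ *
+ *   g++ -O2 -fno-omit-frame-pointer frame-machine.cc -o frame-machine -ldl
+ *
+ * keeping frame pointers so the blind stepper has frames to walk, and
+ * linking libdl for dladdr. */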