Add original work by Stephen Kell
This commit is contained in:
parent 69aea630a0
commit 77bd72cbf8
1 changed file with 482 additions and 0 deletions

frame-machine.cc  +482  (new file)
@@ -0,0 +1,482 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <ucontext.h>
#include <cstdlib>
#include <cstdint>
#include <cstdio>
#include <dlfcn.h>
#include <cassert>
#include <functional>

/* How can we "compile" the DWARF CFA instructions into C++ code?
 * Perhaps the right thing to do is expand into "table" form first.
 * Compiled code just needs to
 * (1) use the PC to index into the table, and
 * (2) evaluate the expression it finds there
 * (either one of the fixed-form expressions in the usual CFA encodings,
 * or a full DWARF expression).
 *
 * One thing to generate would simply be a (giant) switch statement,
 * one label per address.
 * Another, perhaps less efficient, would be a cascading if--else;
 * this can be generated from templates, unlike the switch.
 * How do we encode a table into a bunch of templates
 * so that it compiles to fast code? We can either work from CFA operations like these:

DW_CFA_advance_loc
DW_CFA_advance_loc1
DW_CFA_advance_loc2
DW_CFA_def_cfa
DW_CFA_def_cfa_expression (DW_OP_breg7 (rsp)
DW_CFA_def_cfa_offset
DW_CFA_def_cfa_register
DW_CFA_nop
DW_CFA_offset
DW_CFA_remember_state
DW_CFA_restore
DW_CFA_restore_state

... or we can work from a decoded table.

00003b80 0000000000000024 00003b84 FDE cie=00000000 pc=000000000002d8e6..000000000002dc87
   LOC              CFA      rbx   rbp   ra
000000000002d8e6    rsp+8    u     u     c-8
000000000002d8e7    rsp+16   u     c-16  c-8
000000000002d8ea    rbp+16   u     c-16  c-8
000000000002d8f2    rbp+16   c-24  c-16  c-8
000000000002dc86    rsp+8    c-24  c-16  c-8

What's the object that we want to get out? We're doing this:

static int walk_stack(Cursor& c, const Action& a = Action())
{
	a.act(c);
	if (!ValidityCheck::check(c)) abort();
	int err = StepRoutine::step(a, c);
	if (err) return 1;
	void *lower, *upper;
	int (*next_walker)(Cursor&, const Action &) = Dispatcher::lookup(a, c, &lower, &upper);
	return next_walker(c, a);
}

... so we want an implementation of some or all of

	int walk_stack(Cursor& c, const Action& a = Action())
	StepRoutine::step(a, c);
	Dispatcher::lookup(a, c, &lower, &upper);

... where our blind frame-pointer walker does
	the above walk_stack,
	a step that is basically
		c.rsp() = (uint64_t) (((void**)c.rbp()) + 2);
		c.rip() = (uint64_t) *(((void**)c.rbp()) + 1);
		c.rbp() = (uint64_t) *((void**)c.rbp());
	and a lookup that just returns the same.

So we're allowed to generate
... a walk_stack that is optimised for particular frames
... a stepper that encodes the CFA table (as a function from context to context)
	... or just a row in the CFA table?
	... or just a *cell* in the CFA table? no, must
... a dispatcher that is optimised for particular frames and walkers (and may do caching).

Remember the inline caching idea: each function has its own stepper,
and may also have its own dispatcher whose code is an inline-cached branch
to the next walker / stepper. The branch is guarded by tests on the next
rip -- that it is within the range expected. So the branch is specialised to
a particular caller. Could any other flavour of guard be efficient?
Maybe whether the caller is in the same object? i.e. instead of a "blind"
walker which assumes the same walker/dispatcher for all parent frames,
a "DSO-homogeneous" walker which quickly checks that the caller is in the same DSO,
and avoids the indirect call if so? This is only useful if we *can* avoid the
indirect call, so for generic steppers rather than per-frame-compiled steppers.

Another avenue is to think of each row in the CFA table as a stepper.
The frame-pointer stepper is simply a fixed row -- relative to the old rbp,
the new rsp is rbp+16, the return address is at rbp+8, and the saved rbp is at rbp+0.
So maybe
	dispatchers are per-DSO or at least coarser-grained things;
	steppers are simply rows in the table and look like the c.rsp() = c....() + ...; code above;
	an eh_frame unwinder could use a memtable-like lookup
		... maybe coarse-grained by looking at bit prefixes?
		i.e. a longest-prefix-match scheme? YES, but how to memtablify this?
		one memtable per prefix length?

*/
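
/* A minimal sketch (mine, not used elsewhere in this file) of "a row in the
 * CFA table as a stepper": the hypothetical struct below hard-codes the row
 *     CFA = rbp+16,  rbp at c-16,  ra at c-8
 * from the decoded table above, purely to show the shape that per-row
 * generated code might take. */
struct example_cfa_row_stepper /* hypothetical */
{
	static int step(uint64_t& rip, uint64_t& rsp, uint64_t& rbp)
	{
		uint64_t cfa = rbp + 16;          /* CFA is rbp+16 in this row */
		rip = *(uint64_t*)(cfa - 8);      /* ra saved at CFA-8 */
		rbp = *(uint64_t*)(cfa - 16);     /* caller's rbp saved at CFA-16 */
		rsp = cfa;                        /* the CFA is the caller's rsp */
		return 0;
	}
};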

/* What would an efficient dispatcher look like?
 * Space-wise, a typical eh_frame section is about 10% the size of a .text section.
 * Logically,
 * what we want is a big jump table keyed on the program counter.
 * Each entry in the table points to an unwind routine,
 * i.e. code which twiddles registers.
 * Hypothesis: the routines themselves can be deduplicated aggressively,
 * because they're simple things like "*(rbp + 8)" or "rsp - 16" or whatever.
 * Even when backwalking many registers, they should dedup okay.
 *
 * How can we build the table itself efficiently?
 * Say we scale the instruction pointer down by 16 bytes,
 * and have two bytes in the table for each such window. That's a 12.5% space overhead.
 * Each two bytes is an offset (perhaps re-up-scaled) into the unwind routine table.
 * Two bytes should be enough if we really did aggressively deduplicate the routines...
 * If the 16-byte window has ambiguity, we need to point into a more complex,
 * ambiguity-resolving routine... but hope that we don't need many of those.
 * To unwind "live" regs, we callq table+((offset_table[$rip])*scale)
 * ... though do we want a separate mcontext_t-based version?
 * NOTE that getcontext/setcontext *should* be non-trapping, user-level-only operations.
 */
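
/* A rough sketch of the table scheme described above -- illustrative only;
 * example_text_base, example_offset_table and example_routine_table are
 * assumed to be emitted by some offline tool, and nothing below uses them. */
typedef void example_unwind_routine_t(mcontext_t *);
extern "C" char example_text_base[];                       /* hypothetical: start of .text */
extern example_unwind_routine_t *example_routine_table[];  /* hypothetical: deduplicated routines */
extern uint16_t example_offset_table[];                    /* hypothetical: one entry per 16-byte window */
static inline example_unwind_routine_t *example_lookup_unwind(uintptr_t rip)
{
	uintptr_t window = (rip - (uintptr_t) example_text_base) >> 4; /* scale the pc down by 16 */
	return example_routine_table[example_offset_table[window]];   /* re-up-scale via the routine table */
}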

/* What would a fast "unwind compiler" for stack frames look like?
 *
 * It has to encode the fact that different registers may be known.
 *
 * So, for example, a frame pointer unwinder
 * requires an initial RBP, RSP and RIP,
 * and yields a next-frame RBP, RSP and RIP.
 *
 * We might want to use a bitmap of "knownness", but then
 * we can't have the C++ template "more specific" rule
 * do the right thing.
 *
 * Instead we can use something like the following.
 */
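
/* One way to get that (a sketch of my own; none of these names are used
 * below): encode knownness in the type of each register slot, so that a
 * stepper partially specialised on all-known inputs is "more specialised"
 * than the generic fallback and wins by ordinary template partial ordering. */
template <typename T> struct reg_known   { T value; };  /* hypothetical */
template <typename T> struct reg_unknown { };           /* hypothetical */

template <typename Rbp, typename Rsp, typename Rip>
struct example_fp_step_requirement       /* generic case: can't frame-pointer-step */
{
	static const bool can_step = false;
};
template <typename B, typename S, typename I>
struct example_fp_step_requirement<reg_known<B>, reg_known<S>, reg_known<I> >
{                                        /* all three inputs known: stepping is possible */
	static const bool can_step = true;
};
static_assert(example_fp_step_requirement<
		reg_known<uint64_t>, reg_known<uint64_t>, reg_known<uint64_t>
	>::can_step, "a frame-pointer step needs rbp, rsp and rip");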

/* At run time, we want our stack walker to look like the following.
 *
 * - Snapshot initial machine state (say rsp, rip).
 * - Invoke stepper, yielding higher-frame state.
 * - Use rip to look up new stepper.
 * - Invoke new stepper and repeat.
 *
 * - Can we have each stepper cache one or more "next steppers"?
 *   YES. This could be an inline cache.
 *   We want the cache-miss case to go out-of-line.
 *   Then a frame-pointer unwinder does no lookup; it just says "use fp stepper!".
 *   This should go as fast as our existing fake-libunwind stepper in liballocs.
 *   We need to pass the lookup function as a template argument,
 *   in order for this to get elaborated down
 */

typedef int cb_t(void *sp, void *ip, void *arg);

/* This is a generic walker
 * which implements a recursive walk with caching,
 * and uses the template arguments
 * to control
 * cache size, step logic, step validity guard, and next-stepper dispatch.
 *
 * Are we caching in the right place?
 * Our stack_walker fixes a StepRoutine
 * but can delegate to a different stack walker, built from a different StepRoutine.
 * So if using DWARF, we generate one stack_walker per function;
 * each one has a validity check,
 * a cache of zero or more "likely next walkers",
 * and a dispatcher for deciding the actual next walker.
 *
 * What's wrong with this:
 * - we generate per-function *walk* code, not merely per-function step code
 * - caching logically belongs with the dispatcher
 * - the callback is dispatched at run time
 * - the representations of machine state and validity are fixed at mcontext_t
 *
 * What we really want is
 * - to generate per-function step-and-dispatch
 *   - i.e. part of the "context" that the stepper gives us back
 *     is the *next* stepper (sketched just below)
 *     -- which it could have got from a (per-stepper, hence per-function) cache
 *        or just by "blind" selection, or
 */
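
/* A sketch (mine; the hypothetical names below are not wired into anything
 * else in this file) of "the stepper gives us back the next stepper":
 * step_and_dispatch() returns both an error code and the walker to use for
 * the parent frame, which a per-function stepper could pull from its own
 * cache or pick "blindly". */
struct mcursor; /* defined below */
struct example_step_result
{
	int err;
	int (*next_walker)(mcursor&);  /* walker specialised for the parent frame */
};
template <typename StepAndDispatch>
int example_walk_from(mcursor& c)
{
	example_step_result r = StepAndDispatch::step_and_dispatch(c);
	if (r.err) return r.err;
	return r.next_walker(c);       /* ideally a tail call into the parent's walker */
}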

// template <typename Action, typename Cursor, typename Stepper>
// struct stack_walker
// {
// 	int walk_stack(const Cursor &cursor = Cursor())
// 	{
// 		Action::act(cursor);
// 		step_fn *next = Stepper::step(cursor); // this might yield a *new* stepper,
// 		                                       // but we want to allow statically fixing the stepper by inlining
// 		walk_stack_from(cursor);
// 	}
// };

struct mcursor
{
	mcontext_t state;
	mcontext_t validity;
	long long int& rip() { return state.gregs[REG_RIP]; }
	long long int& rsp() { return state.gregs[REG_RSP]; }
	long long int& rbp() { return state.gregs[REG_RBP]; }
};

template <unsigned CacheSize, typename Cursor, typename StepRoutine, typename ValidityCheck, typename Dispatcher, typename Action>
struct caching_stack_walker
{
	static int walk_stack(Cursor& c, const Action& a = Action())
	{
		int (*my_addr)(Cursor& c, const Action&) = walk_stack;

		/* Call the callback first. */
		a.act(c);

		/* Check we have the validity to run the step routine. */
		if (!ValidityCheck::check(c)) abort();

		int err = StepRoutine::step(a, c);
		if (err) return 1;

		/* Delegate: we check the cache first. */
		static struct
		{
			//std::function<int(Cursor&, const Action &)> walker;
			int (*walker)(Cursor&, const Action &);
			void *lower;
			void *upper;
		} cache[CacheSize];
		static unsigned cache_mru;

		for (unsigned i = 0; i < CacheSize; ++i)
		{
			if ((void*) c.rip() >= cache[i].lower && (void*) c.rip() < cache[i].upper
				&& cache[i].walker)
			{
				cache_mru = i;
				return cache[i].walker(c, a);
			}
		}

		/* If the cache fails, call the Dispatcher to indirect-call the next stepper.
		 * How does the validity factor in?
		 * It's just a check: whatever stepper we look up, it'll have some
		 * validity requirements, and it'll check them. However, since
		 * the Dispatcher call *might* be inlined, we can propagate the validity
		 * information through template arguments and eliminate the check
		 * in the frame-pointer case.
		 */
		void *lower;
		void *upper;
		//std::function<int(Cursor&, const Action &)> next_walker
		int (*next_walker)(Cursor&, const Action &)
			= Dispatcher::lookup(a, c, &lower, &upper);

		if (next_walker != my_addr) __builtin_unreachable();

		cache[(cache_mru + 1) % CacheSize] = { next_walker, lower, upper };

		return next_walker(c, a);
	}
};
template <typename Cursor, typename StepRoutine, typename ValidityCheck, typename Dispatcher, typename Action>
struct stack_walker
{
	static int walk_stack(Cursor& c, const Action& a = Action())
	{
		//int (*my_addr)(Cursor& c, const Action&) = walk_stack;

		/* Call the callback first. */
		a.act(c);

		/* Check we have the validity to run the step routine. */
		if (!ValidityCheck::check(c)) abort();

		int err = StepRoutine::step(a, c);
		if (err) return 1;

		/* Call the Dispatcher to indirect-call the next stepper. */
		void *lower;
		void *upper;
		//std::function<int(Cursor&, const Action &)> next_walker
		int (*next_walker)(Cursor&, const Action &)
			= Dispatcher::lookup(a, c, &lower, &upper);

		//if (next_walker != my_addr) __builtin_unreachable();

		return next_walker(c, a);
	}
};

/* Now we instantiate this with the stupidest-but-fastest stack walker
 * around: the blind frame-pointer stepper. */

struct blind_frame_pointer_validity_checker
{
	static bool check(mcursor& c) { return true; }
};

struct blind_frame_pointer_dispatcher; // see below

struct blind_frame_pointer_stepper
{
	template <typename Action>
	static int step(const Action& a, mcursor& c)
	{
		// does our bp look sane?
		if ((uintptr_t) c.rbp() < (uintptr_t) c.rsp()
			|| (uintptr_t) c.rbp() - (uintptr_t) c.rsp() > 0x10000000ul)
		{
			// looks dodgy
			return -1;
		}
		// FIXME: if bp == sp, perhaps that means we should stop? or mark bp invalid?

		// the next-higher ip is the return addr of the frame, i.e. 8(%rbp) on x86-64
		void *return_addr = *(((void**)c.rbp()) + 1);
		void *sp = (((void**)c.rbp()) + 2);
		void *bp = *((void**)c.rbp());

		// error detection/handling -- communicate using the return value
		if ((uintptr_t) sp <= (uintptr_t) c.rsp()
			|| (uintptr_t) sp - (uintptr_t) c.rsp() > 0x10000000ul)
		{
			// looks dodgy
			return -1;
		}

		// don't check rbp here -- if it's bad, rsp and rip might still be good
		c.rsp() = (uint64_t) sp;
		c.rbp() = (uint64_t) bp;
		c.rip() = (uint64_t) return_addr;
		return 0;
	}

#define SANE_BP_OR_NULL(bp, sp) \
	(((char*) (bp) > (char*) (sp) && ((char*) (bp) - (char*) (sp)) < 0x10000) \
		? (void*) (bp) \
		: 0)

#pragma GCC push_options
#pragma GCC optimize ("no-omit-frame-pointer")
	static int initcontext(mcursor& c)
		__attribute__((noinline,visibility("protected")))
	{
		/* The initial state of the cursor should be such that
		 * if the caller does
		 *
		 *    getcontext(...)
		 *
		 * and then reads
		 *
		 *    state->rsp
		 *
		 * they get their own stack pointer. */
		void *caller_sp_minus_two_words = __builtin_frame_address(0);
		void *caller_bp;
		void *caller_sp;
		void *current_return_addr;

		assert(caller_sp_minus_two_words != NULL);
		current_return_addr = __builtin_return_address(0);
		/* We get the old base pointer by dereferencing the addr found at 0(%rbp). */
		caller_bp = *((void**)caller_sp_minus_two_words);
		assert(caller_bp != 0);
		/* We get the caller's stack pointer by taking that addr and adjusting for
		 * the return addr and saved rbp of this function (two words). */
		caller_sp = (((void**)caller_sp_minus_two_words) + 2);

		c.rsp() = (uint64_t) caller_sp;
		c.validity.gregs[REG_RSP] = 1;
		c.rbp() = (uint64_t) SANE_BP_OR_NULL(caller_bp, caller_sp);
		c.validity.gregs[REG_RBP] = (c.rbp()) ? 1 : 0;
		c.rip() = (uint64_t) current_return_addr;
		c.validity.gregs[REG_RIP] = 1;

		return 0;
	}
#pragma GCC pop_options

#undef SANE_BP_OR_NULL
};

struct blind_frame_pointer_dispatcher
{
	// typedef int (*walker_fn)(Cursor&, const Action&);
	template <typename Action, typename Cursor>
	static
	//std::function<int(Cursor&, const Action&)>
	int
	(*lookup(const Action& a, Cursor &c, void **out_lower, void **out_upper))
		(Cursor&, const Action&)
	{
		*out_lower = nullptr;
		*out_upper = nullptr;
		return &caching_stack_walker<
			1u /* CacheSize */,
			Cursor,
			blind_frame_pointer_stepper /* StepRoutine */,
			blind_frame_pointer_validity_checker /* ValidityCheck */,
			blind_frame_pointer_dispatcher /* Dispatcher */,
			Action
		>::walk_stack;
	}
};

struct print_frame_action
{
	int act(mcursor& c) const
	{
		Dl_info i;
		int success = dladdr((void *) c.rip(), &i);
		printf("sp=%p %s (%p)\n", (void*) c.rsp(),
			(success && i.dli_sname) ? i.dli_sname : "(no symbol)", (void*) c.rip());
		return 0;
	}
};
struct do_nothing_action
{
	int act(mcursor& c) const
	{
		return 0;
	}
};

int g(int x)
{
	//mcursor c;
	//int ret = getcontext(&context); /* FIXME: want a stepper-specific "initial state" */
	int ret;// = blind_frame_pointer_stepper::initcontext(c);
	//if (ret) abort();
	//caching_stack_walker<
	//	1u /* CacheSize */,
	//	mcursor,
	//	blind_frame_pointer_stepper /* StepRoutine */,
	//	blind_frame_pointer_validity_checker /* ValidityCheck */,
	//	blind_frame_pointer_dispatcher /* Dispatcher */,
	//	print_frame_action /* Action */
	//>::walk_stack(c);

	mcursor c_2;
	ret = x;
	ret = blind_frame_pointer_stepper::initcontext(c_2);
	if (ret) abort();
	return stack_walker<
		mcursor,
		blind_frame_pointer_stepper /* StepRoutine */,
		blind_frame_pointer_validity_checker /* ValidityCheck */,
		blind_frame_pointer_dispatcher /* Dispatcher */,
		print_frame_action /* Action */
	>::walk_stack(c_2);
}
int do_useless_stuff(int rec_depth)
{
	if (rec_depth <= 0)
	{
		g(rec_depth);
		return 1;
	}
	int result = rec_depth * do_useless_stuff(rec_depth - 1);
	int values[] = {0, 0, 0, result};
	return values[3];
}
void f()
{
	if (do_useless_stuff(10) == 42)
	{
		puts("This never happens, but it stops gcc optimising the calls away :>");
	}
}
int main()
{
	/* Unwind some shizzle. */
	f();

	return 0;
}