Add original work by Stephen Kell
This commit is contained in:
parent 69aea630a0
commit 77bd72cbf8
1 changed file with 482 additions and 0 deletions

frame-machine.cc  +482  (new file)
@@ -0,0 +1,482 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <ucontext.h>
#include <cstdlib>
#include <cstdint>
#include <cstdio>
#include <dlfcn.h>
#include <cassert>
#include <functional>

/* How can we "compile" the DWARF CFA instructions into C++ code?
 * Perhaps the right thing to do is expand into "table" form first.
 * Compiled code just needs to
 * (1) use the PC to index into the table, and
 * (2) evaluate the expression it finds there
 * (either one of the fixed-form expressions in the usual CFA encodings,
 * or a full DWARF expression).
 *
 * One thing to generate would simply be a (giant) switch statement,
 * one label per address.
 * Another, perhaps less efficient, would be a cascading if--else;
 * this can be generated from templates, unlike the switch.
 * How do we encode a table into a bunch of templates
 * so that it compiles to fast code? We can either work from CFA operations like these:

DW_CFA_advance_loc
DW_CFA_advance_loc1
DW_CFA_advance_loc2
DW_CFA_def_cfa
DW_CFA_def_cfa_expression (DW_OP_breg7 (rsp)
DW_CFA_def_cfa_offset
DW_CFA_def_cfa_register
DW_CFA_nop
DW_CFA_offset
DW_CFA_remember_state
DW_CFA_restore
DW_CFA_restore_state

... or we can work from a decoded table.

00003b80 0000000000000024 00003b84 FDE cie=00000000 pc=000000000002d8e6..000000000002dc87
   LOC              CFA      rbx   rbp   ra
000000000002d8e6    rsp+8    u     u     c-8
000000000002d8e7    rsp+16   u     c-16  c-8
000000000002d8ea    rbp+16   u     c-16  c-8
000000000002d8f2    rbp+16   c-24  c-16  c-8
000000000002dc86    rsp+8    c-24  c-16  c-8

What's the object that we want to get out? We're doing this:

static int walk_stack(Cursor& c, const Action& a = Action())
{
	a.act(c);
	if (!ValidityCheck::check(c)) abort();
	int err = StepRoutine::step(a, c);
	if (err) return 1;
	void *lower, *upper;
	int (*next_walker)(Cursor&, const Action &) = Dispatcher::lookup(a, c, &lower, &upper);
	return next_walker(c, a);
}

... so we want an implementation of some or all of

	int walk_stack(Cursor& c, const Action& a = Action())
	StepRoutine::step(a, c);
	Dispatcher::lookup(a, c, &lower, &upper);

... where our blind frame-pointer walker does
	the above walk_stack,
	a step that is basically
		c.rsp() = (uint64_t) (((void**)c.rbp()) + 2);
		c.rip() = (uint64_t) *(((void**)c.rbp()) + 1);
		c.rbp() = (uint64_t) *((void**)c.rbp());
	and a lookup that just returns the same.

So we're allowed to generate
... a walk_stack that is optimised for particular frames
... a stepper that encodes the CFA table (as a function from context to context)
	... or just a row in the CFA table?
	... or just a *cell* in the CFA table? no, must
... a dispatcher that is optimised for particular frames and walkers (and may do caching).

Remember the inline caching idea: each function has its own stepper,
and may also have its own dispatcher whose code is an inline-cached branch
to the next walker / stepper. The branch is guarded by tests on the next
rip -- that it is within the range expected. So the branch is specialised to
a particular caller. Could any other flavour of guard be efficient?
Maybe whether the caller is in the same object? i.e. instead of a "blind"
walker which assumes the same walker/dispatcher for all parent frames,
a "DSO-homogeneous" walker which quickly checks that the caller is in the same DSO,
and avoids the indirect call if so? This is only useful if we *can* avoid the
indirect call, so for generic steppers rather than per-frame-compiled steppers.

Another avenue is to think of each row in the CFA table as a stepper.
The frame-pointer stepper is simply a fixed row -- relative to the old rbp,
the new rsp is rbp+16, the return address is at rbp+8, and the saved rbp is at rbp+0.
So maybe
	dispatchers are per-DSO or at least coarser-grained things;
	steppers are simply rows in the table and look like the c.rsp() = c....() + ...; code above;
	an eh_frame unwinder could use a memtable-like lookup
		... maybe coarse-grained by looking at bit prefixes?
		i.e. a longest-prefix-match scheme? YES, but how to memtablify this?
		one memtable per prefix length?

*/
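
/* A minimal sketch (mine, not used elsewhere in this file) of "a row in the
 * CFA table as a stepper": the hypothetical struct below hard-codes the row
 *     CFA = rbp+16,  rbp at c-16,  ra at c-8
 * from the decoded table above, purely to show the shape that per-row
 * generated code might take. */
struct example_cfa_row_stepper /* hypothetical */
{
	static int step(uint64_t& rip, uint64_t& rsp, uint64_t& rbp)
	{
		uint64_t cfa = rbp + 16;          /* CFA is rbp+16 in this row */
		rip = *(uint64_t*)(cfa - 8);      /* ra saved at CFA-8 */
		rbp = *(uint64_t*)(cfa - 16);     /* caller's rbp saved at CFA-16 */
		rsp = cfa;                        /* the CFA is the caller's rsp */
		return 0;
	}
};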

/* What would an efficient dispatcher look like?
 * Space-wise, a typical eh_frame section is about 10% the size of a .text section.
 * Logically,
 * what we want is a big jump table keyed on the program counter.
 * Each entry in the table points to an unwind routine,
 * i.e. code which twiddles registers.
 * Hypothesis: the routines themselves can be deduplicated aggressively,
 * because they're simple things like "*(rbp + 8)" or "rsp - 16" or whatever.
 * Even when backwalking many registers, they should dedup okay.
 *
 * How can we build the table itself efficiently?
 * Say we scale the instruction pointer down by 16 bytes,
 * and have two bytes in the table for each such window. That's a 12.5% space overhead.
 * Each two bytes is an offset (perhaps re-up-scaled) into the unwind routine table.
 * Two bytes should be enough if we really did aggressively deduplicate the routines...
 * If the 16-byte window has ambiguity, we need to point into a more complex,
 * ambiguity-resolving routine... but hope that we don't need many of those.
 * To unwind "live" regs, we callq table+((offset_table[$rip])*scale)
 * ... though do we want a separate mcontext_t-based version?
 * NOTE that getcontext/setcontext *should* be non-trapping, user-level-only operations.
 */
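
/* A rough sketch of the table scheme described above -- illustrative only;
 * example_text_base, example_offset_table and example_routine_table are
 * assumed to be emitted by some offline tool, and nothing below uses them. */
typedef void example_unwind_routine_t(mcontext_t *);
extern "C" char example_text_base[];                       /* hypothetical: start of .text */
extern example_unwind_routine_t *example_routine_table[];  /* hypothetical: deduplicated routines */
extern uint16_t example_offset_table[];                    /* hypothetical: one entry per 16-byte window */
static inline example_unwind_routine_t *example_lookup_unwind(uintptr_t rip)
{
	uintptr_t window = (rip - (uintptr_t) example_text_base) >> 4; /* scale the pc down by 16 */
	return example_routine_table[example_offset_table[window]];   /* re-up-scale via the routine table */
}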

/* What would a fast "unwind compiler" for stack frames look like?
 *
 * It has to encode the fact that different registers may be known.
 *
 * So, for example, a frame pointer unwinder
 * requires an initial RBP, RSP and RIP,
 * and yields a next-frame RBP, RSP and RIP.
 *
 * We might want to use a bitmap of "knownness", but then
 * we can't have the C++ template "more specific" rule
 * do the right thing.
 *
 * Instead we can use something like the following.
 */
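
/* One way to get that (a sketch of my own; none of these names are used
 * below): encode knownness in the type of each register slot, so that a
 * stepper partially specialised on all-known inputs is "more specialised"
 * than the generic fallback and wins by ordinary template partial ordering. */
template <typename T> struct reg_known   { T value; };  /* hypothetical */
template <typename T> struct reg_unknown { };           /* hypothetical */

template <typename Rbp, typename Rsp, typename Rip>
struct example_fp_step_requirement       /* generic case: can't frame-pointer-step */
{
	static const bool can_step = false;
};
template <typename B, typename S, typename I>
struct example_fp_step_requirement<reg_known<B>, reg_known<S>, reg_known<I> >
{                                        /* all three inputs known: stepping is possible */
	static const bool can_step = true;
};
static_assert(example_fp_step_requirement<
		reg_known<uint64_t>, reg_known<uint64_t>, reg_known<uint64_t>
	>::can_step, "a frame-pointer step needs rbp, rsp and rip");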

/* At run time, we want our stack walker to look like the following.
 *
 * - Snapshot initial machine state (say rsp, rip).
 * - Invoke stepper, yielding higher-frame state.
 * - Use rip to look up new stepper.
 * - Invoke new stepper and repeat.
 *
 * - Can we have each stepper cache one or more "next steppers"?
 *   YES. This could be an inline cache.
 *   We want the cache-miss case to go out-of-line.
 *   Then a frame-pointer unwinder does no lookup; it just says "use fp stepper!".
 *   This should go as fast as our existing fake-libunwind stepper in liballocs.
 *   We need to pass the lookup function as a template argument,
 *   in order for this to get elaborated down
 */

typedef int cb_t(void *sp, void *ip, void *arg);

/* This is a generic walker
 * which implements a recursive walk with caching,
 * and uses the template arguments
 * to control
 * cache size, step logic, step validity guard, and next-stepper dispatch.
 *
 * Are we caching in the right place?
 * Our stack_walker fixes a StepRoutine
 * but can delegate to a different stack walker, built from a different StepRoutine.
 * So if using DWARF, we generate one stack_walker per function;
 * each one has a validity check,
 * a cache of zero or more "likely next walkers",
 * and a dispatcher for deciding the actual next walker.
 *
 * What's wrong with this:
 * - we generate per-function *walk* code, not merely per-function step code
 * - caching logically belongs with the dispatcher
 * - the callback is dispatched at run time
 * - the representations of machine state and validity are fixed at mcontext_t
 *
 * What we really want is
 * - to generate per-function step-and-dispatch
 *   - i.e. part of the "context" that the stepper gives us back
 *     is the *next* stepper (sketched just below)
 *     -- which it could have got from a (per-stepper, hence per-function) cache
 *        or just by "blind" selection, or
 */
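
/* A sketch (mine; the hypothetical names below are not wired into anything
 * else in this file) of "the stepper gives us back the next stepper":
 * step_and_dispatch() returns both an error code and the walker to use for
 * the parent frame, which a per-function stepper could pull from its own
 * cache or pick "blindly". */
struct mcursor; /* defined below */
struct example_step_result
{
	int err;
	int (*next_walker)(mcursor&);  /* walker specialised for the parent frame */
};
template <typename StepAndDispatch>
int example_walk_from(mcursor& c)
{
	example_step_result r = StepAndDispatch::step_and_dispatch(c);
	if (r.err) return r.err;
	return r.next_walker(c);       /* ideally a tail call into the parent's walker */
}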

// template <typename Action, typename Cursor, typename Stepper>
// struct stack_walker
// {
// 	int walk_stack(const Cursor &cursor = Cursor())
// 	{
// 		Action::act(cursor);
// 		step_fn *next = Stepper::step(cursor); // this might yield a *new* stepper,
// 		                                       // but we want to allow statically fixing the stepper by inlining
// 		walk_stack_from(cursor);
// 	}
// };

struct mcursor
{
	mcontext_t state;
	mcontext_t validity;
	long long int& rip() { return state.gregs[REG_RIP]; }
	long long int& rsp() { return state.gregs[REG_RSP]; }
	long long int& rbp() { return state.gregs[REG_RBP]; }
};

template <unsigned CacheSize, typename Cursor, typename StepRoutine, typename ValidityCheck, typename Dispatcher, typename Action>
struct caching_stack_walker
{
	static int walk_stack(Cursor& c, const Action& a = Action())
	{
		int (*my_addr)(Cursor& c, const Action&) = walk_stack;

		/* Call the callback first. */
		a.act(c);

		/* Check we have the validity to run the step routine. */
		if (!ValidityCheck::check(c)) abort();

		int err = StepRoutine::step(a, c);
		if (err) return 1;

		/* Delegate: we check the cache first. */
		static struct
		{
			//std::function<int(Cursor&, const Action &)> walker;
			int (*walker)(Cursor&, const Action &);
			void *lower;
			void *upper;
		} cache[CacheSize];
		static unsigned cache_mru;

		for (unsigned i = 0; i < CacheSize; ++i)
		{
			if ((void*) c.rip() >= cache[i].lower && (void*) c.rip() < cache[i].upper
				&& cache[i].walker)
			{
				cache_mru = i;
				return cache[i].walker(c, a);
			}
		}

		/* If the cache fails, call the Dispatcher to indirect-call the next stepper.
		 * How does the validity factor in?
		 * It's just a check: whatever stepper we look up, it'll have some
		 * validity requirements, and it'll check them. However, since
		 * the Dispatcher call *might* be inlined, we can propagate the validity
		 * information through template arguments and eliminate the check
		 * in the frame-pointer case.
		 */
		void *lower;
		void *upper;
		//std::function<int(Cursor&, const Action &)> next_walker
		int (*next_walker)(Cursor&, const Action &)
			= Dispatcher::lookup(a, c, &lower, &upper);

		if (next_walker != my_addr) __builtin_unreachable();

		cache[(cache_mru + 1) % CacheSize] = { next_walker, lower, upper };

		return next_walker(c, a);
	}
};
template <typename Cursor, typename StepRoutine, typename ValidityCheck, typename Dispatcher, typename Action>
struct stack_walker
{
	static int walk_stack(Cursor& c, const Action& a = Action())
	{
		//int (*my_addr)(Cursor& c, const Action&) = walk_stack;

		/* Call the callback first. */
		a.act(c);

		/* Check we have the validity to run the step routine. */
		if (!ValidityCheck::check(c)) abort();

		int err = StepRoutine::step(a, c);
		if (err) return 1;

		/* Call the Dispatcher to indirect-call the next stepper. */
		void *lower;
		void *upper;
		//std::function<int(Cursor&, const Action &)> next_walker
		int (*next_walker)(Cursor&, const Action &)
			= Dispatcher::lookup(a, c, &lower, &upper);

		//if (next_walker != my_addr) __builtin_unreachable();

		return next_walker(c, a);
	}
};

/* Now we instantiate this with the stupidest-but-fastest stack walker
 * around: the blind frame-pointer stepper. */

struct blind_frame_pointer_validity_checker
{
	static bool check(mcursor& c) { return true; }
};

struct blind_frame_pointer_dispatcher; // see below

struct blind_frame_pointer_stepper
{
	template <typename Action>
	static int step(const Action& a, mcursor& c)
	{
		// does our bp look sane?
		if ((uintptr_t) c.rbp() < (uintptr_t) c.rsp()
			|| (uintptr_t) c.rbp() - (uintptr_t) c.rsp() > 0x10000000ul)
		{
			// looks dodgy
			return -1;
		}
		// FIXME: if bp == sp, perhaps that means we should stop? or mark bp invalid?

		// the next-higher ip is the return addr of the frame, i.e. 8(%rbp) on x86-64
		void *return_addr = *(((void**)c.rbp()) + 1);
		void *sp = (((void**)c.rbp()) + 2);
		void *bp = *((void**)c.rbp());

		// error detection/handling -- communicate using the return value
		if ((uintptr_t) sp <= (uintptr_t) c.rsp()
			|| (uintptr_t) sp - (uintptr_t) c.rsp() > 0x10000000ul)
		{
			// looks dodgy
			return -1;
		}

		// don't check rbp here -- if it's bad, rsp and rip might still be good
		c.rsp() = (uint64_t) sp;
		c.rbp() = (uint64_t) bp;
		c.rip() = (uint64_t) return_addr;
		return 0;
	}

#define SANE_BP_OR_NULL(bp, sp) \
	(((char*) (bp) > (char*) (sp) && ((char*) (bp) - (char*) (sp)) < 0x10000) \
		? (void*) (bp) \
		: 0)

#pragma GCC push_options
#pragma GCC optimize ("no-omit-frame-pointer")
	static int initcontext(mcursor& c)
		__attribute__((noinline,visibility("protected")))
	{
		/* The initial state of the cursor should be such that
		 * if the caller does
		 *
		 *    getcontext(...)
		 *
		 * and then reads
		 *
		 *    state->rsp
		 *
		 * they get their own stack pointer. */
		void *caller_sp_minus_two_words = __builtin_frame_address(0);
		void *caller_bp;
		void *caller_sp;
		void *current_return_addr;

		assert(caller_sp_minus_two_words != NULL);
		current_return_addr = __builtin_return_address(0);
		/* We get the old base pointer by dereferencing the addr found at 0(%rbp). */
		caller_bp = *((void**)caller_sp_minus_two_words);
		assert(caller_bp != 0);
		/* We get the caller's stack pointer by taking that addr and adjusting for
		 * the return addr and saved rbp of this function (two words). */
		caller_sp = (((void**)caller_sp_minus_two_words) + 2);

		c.rsp() = (uint64_t) caller_sp;
		c.validity.gregs[REG_RSP] = 1;
		c.rbp() = (uint64_t) SANE_BP_OR_NULL(caller_bp, caller_sp);
		c.validity.gregs[REG_RBP] = (c.rbp()) ? 1 : 0;
		c.rip() = (uint64_t) current_return_addr;
		c.validity.gregs[REG_RIP] = 1;

		return 0;
	}
#pragma GCC pop_options

#undef SANE_BP_OR_NULL
};

struct blind_frame_pointer_dispatcher
{
	// typedef int (*walker_fn)(Cursor&, const Action&);
	template <typename Action, typename Cursor>
	static
	//std::function<int(Cursor&, const Action&)>
	int
	(*lookup(const Action& a, Cursor &c, void **out_lower, void **out_upper))
		(Cursor&, const Action&)
	{
		*out_lower = nullptr;
		*out_upper = nullptr;
		return &caching_stack_walker<
			1u /* CacheSize */,
			Cursor,
			blind_frame_pointer_stepper /* StepRoutine */,
			blind_frame_pointer_validity_checker /* ValidityCheck */,
			blind_frame_pointer_dispatcher /* Dispatcher */,
			Action
		>::walk_stack;
	}
};

struct print_frame_action
{
	int act(mcursor& c) const
	{
		Dl_info i;
		int success = dladdr((void *) c.rip(), &i);
		printf("sp=%p %s (%p)\n", (void*) c.rsp(),
			(success && i.dli_sname) ? i.dli_sname : "(no symbol)", (void*) c.rip());
		return 0;
	}
};
struct do_nothing_action
{
	int act(mcursor& c) const
	{
		return 0;
	}
};

int g(int x)
{
	//mcursor c;
	//int ret = getcontext(&context); /* FIXME: want a stepper-specific "initial state" */
	int ret;// = blind_frame_pointer_stepper::initcontext(c);
	//if (ret) abort();
	//caching_stack_walker<
	//	1u /* CacheSize */,
	//	mcursor,
	//	blind_frame_pointer_stepper /* StepRoutine */,
	//	blind_frame_pointer_validity_checker /* ValidityCheck */,
	//	blind_frame_pointer_dispatcher /* Dispatcher */,
	//	print_frame_action /* Action */
	//>::walk_stack(c);

	mcursor c_2;
	ret = x;
	ret = blind_frame_pointer_stepper::initcontext(c_2);
	if (ret) abort();
	return stack_walker<
		mcursor,
		blind_frame_pointer_stepper /* StepRoutine */,
		blind_frame_pointer_validity_checker /* ValidityCheck */,
		blind_frame_pointer_dispatcher /* Dispatcher */,
		print_frame_action /* Action */
	>::walk_stack(c_2);
}
int do_useless_stuff(int rec_depth)
{
	if (rec_depth <= 0)
	{
		g(rec_depth);
		return 1;
	}
	int result = rec_depth * do_useless_stuff(rec_depth - 1);
	int values[] = {0, 0, 0, result};
	return values[3];
}
void f()
{
	if (do_useless_stuff(10) == 42)
	{
		puts("This never happens, but it stops gcc optimising the calls away :>");
	}
}
int main()
{
	/* Unwind some shizzle. */
	f();

	return 0;
}