From ae5c1f2adf4da04235d87d024d4d942c01b2b447 Mon Sep 17 00:00:00 2001 From: Lassi Tuura Date: Sun, 17 Apr 2011 20:33:09 -0700 Subject: [PATCH] Performance optimisations for fast trace. Insert static branch prediction predicates in useful places and avoid unnecessary code in the hottest paths. Bypass unnecessary indirect calls, in particular to access_mem(), when known to be safe. --- include/libunwind_i.h | 8 +++++ include/tdep-x86_64/libunwind_i.h | 8 ++--- src/mi/backtrace.c | 6 ++-- src/x86_64/Ginit.c | 15 ++-------- src/x86_64/Ginit_local.c | 2 +- src/x86_64/Gos-freebsd.c | 1 + src/x86_64/Gos-linux.c | 1 + src/x86_64/Gtrace.c | 49 +++++++++++++++---------------- src/x86_64/init.h | 48 +++++++++++++++++++----------- src/x86_64/unwind_i.h | 12 ++++++++ 10 files changed, 88 insertions(+), 62 deletions(-) diff --git a/include/libunwind_i.h b/include/libunwind_i.h index 9b91a12f..d7353d18 100644 --- a/include/libunwind_i.h +++ b/include/libunwind_i.h @@ -301,6 +301,14 @@ struct elf_image size_t size; /* (file-) size of the image */ }; +/* Provide a placeholder that an architecture can override for fast + access to memory when the access is known to be local to the process + and not to need validation. A suitable override will improve + unw_tdep_trace() performance in particular.
*/ +#define ACCESS_MEM_FAST(ret,validate,cur,addr,to) \ + do { (ret) = dwarf_get ((cur), DWARF_MEM_LOC ((cur), (addr)), &(to)); } \ + while (0) + #include "tdep/libunwind_i.h" #ifndef tdep_get_func_addr diff --git a/include/tdep-x86_64/libunwind_i.h b/include/tdep-x86_64/libunwind_i.h index 22697b13..c4c69600 100644 --- a/include/tdep-x86_64/libunwind_i.h +++ b/include/tdep-x86_64/libunwind_i.h @@ -110,10 +110,10 @@ dwarf_get_uc(const struct dwarf_cursor *cursor) # define DWARF_LOC(r, t) ((dwarf_loc_t) { .val = (r) }) # define DWARF_IS_REG_LOC(l) 0 # define DWARF_REG_LOC(c,r) (DWARF_LOC((unw_word_t) \ - tdep_uc_addr(dwarf_get_uc(c), (r)), 0)) + x86_64_r_uc_addr(dwarf_get_uc(c), (r)), 0)) # define DWARF_MEM_LOC(c,m) DWARF_LOC ((m), 0) # define DWARF_FPREG_LOC(c,r) (DWARF_LOC((unw_word_t) \ - tdep_uc_addr(dwarf_get_uc(c), (r)), 0)) + x86_64_r_uc_addr(dwarf_get_uc(c), (r)), 0)) #else /* !UNW_LOCAL_ONLY */ # define DWARF_LOC_TYPE_FP (1 << 0) @@ -184,7 +184,6 @@ dwarf_put (struct dwarf_cursor *c, dwarf_loc_t loc, unw_word_t val) /* Platforms that support UNW_INFO_FORMAT_TABLE need to define tdep_search_unwind_table. 
*/ #define tdep_search_unwind_table dwarf_search_unwind_table -#define tdep_uc_addr UNW_ARCH_OBJ(uc_addr) #define tdep_get_elf_image UNW_ARCH_OBJ(get_elf_image) #define tdep_access_reg UNW_OBJ(access_reg) #define tdep_access_fpreg UNW_OBJ(access_fpreg) @@ -199,6 +198,7 @@ dwarf_put (struct dwarf_cursor *c, dwarf_loc_t loc, unw_word_t val) #endif #define tdep_stash_frame UNW_OBJ(stash_frame) #define tdep_trace UNW_OBJ(tdep_trace) +#define x86_64_r_uc_addr UNW_OBJ(r_uc_addr) #ifdef UNW_LOCAL_ONLY # define tdep_find_proc_info(c,ip,n) \ @@ -226,7 +226,7 @@ extern void tdep_init_mem_validate (void); extern int tdep_search_unwind_table (unw_addr_space_t as, unw_word_t ip, unw_dyn_info_t *di, unw_proc_info_t *pi, int need_unwind_info, void *arg); -extern void *tdep_uc_addr (ucontext_t *uc, int reg); +extern void *x86_64_r_uc_addr (ucontext_t *uc, int reg); extern int tdep_get_elf_image (struct elf_image *ei, pid_t pid, unw_word_t ip, unsigned long *segbase, unsigned long *mapoff, char *path, size_t pathlen); diff --git a/src/mi/backtrace.c b/src/mi/backtrace.c index 42e1f5d3..bd748aac 100644 --- a/src/mi/backtrace.c +++ b/src/mi/backtrace.c @@ -39,7 +39,7 @@ slow_backtrace (void **buffer, int size, unw_context_t *uc) unw_word_t ip; int n = 0; - if (unw_init_local (&cursor, uc) < 0) + if (unlikely (unw_init_local (&cursor, uc) < 0)) return 0; while (unw_step (&cursor) > 0) @@ -63,10 +63,10 @@ unw_backtrace (void **buffer, int size) tdep_getcontext_trace (&uc); - if (unw_init_local (&cursor, &uc) < 0) + if (unlikely (unw_init_local (&cursor, &uc) < 0)) return 0; - if (tdep_trace (&cursor, buffer, &n) < 0) + if (unlikely (tdep_trace (&cursor, buffer, &n) < 0)) { unw_getcontext (&uc); return slow_backtrace (buffer, size, &uc); diff --git a/src/x86_64/Ginit.c b/src/x86_64/Ginit.c index f49e4bad..ee62d02e 100644 --- a/src/x86_64/Ginit.c +++ b/src/x86_64/Ginit.c @@ -47,16 +47,6 @@ static struct unw_addr_space local_addr_space; PROTECTED unw_addr_space_t unw_local_addr_space = 
&local_addr_space; -# ifdef UNW_LOCAL_ONLY - -HIDDEN void * -tdep_uc_addr (ucontext_t *uc, int reg) -{ - return x86_64_r_uc_addr (uc, reg); -} - -# endif /* UNW_LOCAL_ONLY */ - HIDDEN unw_dyn_info_list_t _U_dyn_info_list; /* XXX fix me: there is currently no way to locate the dyn-info list @@ -168,7 +158,7 @@ static int access_mem (unw_addr_space_t as, unw_word_t addr, unw_word_t *val, int write, void *arg) { - if (write) + if (unlikely (write)) { Debug (16, "mem[%016lx] <- %lx\n", addr, *val); *(unw_word_t *) addr = *val; @@ -177,7 +167,8 @@ access_mem (unw_addr_space_t as, unw_word_t addr, unw_word_t *val, int write, { /* validate address */ const struct cursor *c = (const struct cursor *)arg; - if (c && c->validate && validate_mem(addr)) + if (likely (c != 0) && unlikely (c->validate) + && unlikely (validate_mem (addr))) return -1; *val = *(unw_word_t *) addr; Debug (16, "mem[%016lx] -> %lx\n", addr, *val); diff --git a/src/x86_64/Ginit_local.c b/src/x86_64/Ginit_local.c index 70bef3e1..54b4fcdb 100644 --- a/src/x86_64/Ginit_local.c +++ b/src/x86_64/Ginit_local.c @@ -43,7 +43,7 @@ unw_init_local (unw_cursor_t *cursor, ucontext_t *uc) { struct cursor *c = (struct cursor *) cursor; - if (tdep_needs_initialization) + if (unlikely (tdep_needs_initialization)) tdep_init (); Debug (1, "(cursor=%p)\n", c); diff --git a/src/x86_64/Gos-freebsd.c b/src/x86_64/Gos-freebsd.c index 50ee60bd..3ef99261 100644 --- a/src/x86_64/Gos-freebsd.c +++ b/src/x86_64/Gos-freebsd.c @@ -154,6 +154,7 @@ unw_handle_signal_frame (unw_cursor_t *cursor) HIDDEN void * x86_64_r_uc_addr (ucontext_t *uc, int reg) { + /* NOTE: common_init() in init.h inlines these for fast path access. 
*/ void *addr; switch (reg) diff --git a/src/x86_64/Gos-linux.c b/src/x86_64/Gos-linux.c index c0278881..a315ea1e 100644 --- a/src/x86_64/Gos-linux.c +++ b/src/x86_64/Gos-linux.c @@ -106,6 +106,7 @@ unw_handle_signal_frame (unw_cursor_t *cursor) HIDDEN void * x86_64_r_uc_addr (ucontext_t *uc, int reg) { + /* NOTE: common_init() in init.h inlines these for fast path access. */ void *addr; switch (reg) diff --git a/src/x86_64/Gtrace.c b/src/x86_64/Gtrace.c index 6935d00b..5b23f7c0 100644 --- a/src/x86_64/Gtrace.c +++ b/src/x86_64/Gtrace.c @@ -92,7 +92,7 @@ trace_cache_buckets (void) unw_tdep_frame_t *frames = mempool_alloc(&trace_frame_pool); size_t i; - if (likely (frames != 0)) + if (likely(frames != 0)) for (i = 0; i < (1u << HASH_LOW_BITS); ++i) frames[i] = empty_frame; @@ -142,7 +142,7 @@ trace_cache_expand (unw_trace_cache_t *cache) old_size = (1u << cache->log_frame_vecs); new_size = cache->log_frame_vecs + 2; for (i = old_size; i < (1u << new_size); ++i) - if (unlikely (! (cache->frames[i] = trace_cache_buckets()))) + if (unlikely(! (cache->frames[i] = trace_cache_buckets()))) { Debug(5, "failed to expand cache to 2^%lu hash bucket sets\n", new_size); for (j = old_size; j < i; ++j) @@ -237,10 +237,10 @@ trace_init_addr (unw_tdep_frame_t *f, d->loc[UNW_X86_64_RSP] = DWARF_REG_LOC (d, UNW_X86_64_RSP); c->frame_info = *f; - if (dwarf_put (d, d->loc[UNW_X86_64_RIP], rip) >= 0 - && dwarf_put (d, d->loc[UNW_X86_64_RBP], rbp) >= 0 - && dwarf_put (d, d->loc[UNW_X86_64_RSP], rsp) >= 0 - && (ret = unw_step (cursor)) >= 0) + if (likely(dwarf_put (d, d->loc[UNW_X86_64_RIP], rip) >= 0) + && likely(dwarf_put (d, d->loc[UNW_X86_64_RBP], rbp) >= 0) + && likely(dwarf_put (d, d->loc[UNW_X86_64_RSP], rsp) >= 0) + && likely((ret = unw_step (cursor)) >= 0)) *f = c->frame_info; /* If unw_step() stopped voluntarily, remember that, even if it @@ -290,14 +290,14 @@ trace_lookup (unw_cursor_t *cursor, addr = frame->virtual_address; /* Return if we found the address. 
*/ - if (addr == rip) + if (likely(addr == rip)) { Debug (4, "found address after %ld steps\n", i); return frame; } /* If slot is empty, reuse it. */ - if (! addr) + if (likely(! addr)) break; /* Linear probe to next slot candidate, step = 1. */ @@ -310,9 +310,9 @@ trace_lookup (unw_cursor_t *cursor, it's free or collides. Note that hash expansion drops previous contents; further lookups will refill the hash. */ Debug (4, "updating slot %lu after %ld steps, replacing 0x%lx\n", slot, i, addr); - if (unlikely (addr || cache->used >= cache_size / 2)) + if (unlikely(addr || cache->used >= cache_size / 2)) { - if (unlikely (trace_cache_expand (cache) < 0)) + if (unlikely(trace_cache_expand (cache) < 0)) return 0; cache_size = 1u << (HASH_LOW_BITS + cache->log_frame_vecs); @@ -404,7 +404,7 @@ tdep_trace (unw_cursor_t *cursor, void **buffer, int *size) int ret; /* Check input parametres. */ - if (! cursor || ! buffer || ! size || (maxdepth = *size) <= 0) + if (unlikely(! cursor || ! buffer || ! size || (maxdepth = *size) <= 0)) return -UNW_EINVAL; Debug (1, "begin ip 0x%lx cfa 0x%lx\n", d->ip, d->cfa); @@ -415,7 +415,7 @@ tdep_trace (unw_cursor_t *cursor, void **buffer, int *size) /* Determine initial register values. */ rip = d->ip; rsp = cfa = d->cfa; - if ((ret = dwarf_get (d, d->loc[UNW_X86_64_RBP], &rbp)) < 0) + if (unlikely((ret = dwarf_get (d, d->loc[UNW_X86_64_RBP], &rbp)) < 0)) { Debug (1, "returning %d, rbp value not found\n", ret); *size = 0; @@ -424,7 +424,7 @@ tdep_trace (unw_cursor_t *cursor, void **buffer, int *size) } /* Get frame cache. */ - if (! (cache = trace_cache_get())) + if (unlikely(! (cache = trace_cache_get()))) { Debug (1, "returning %d, cannot get trace cache\n", -UNW_ENOMEM); *size = 0; @@ -450,7 +450,7 @@ tdep_trace (unw_cursor_t *cursor, void **buffer, int *size) unw_tdep_frame_t *f = trace_lookup (cursor, cache, cfa, rip, rbp, rsp); /* If we don't have information for this frame, give up. */ - if (! f) + if (unlikely(! 
f)) { ret = -UNW_ENOINFO; break; @@ -481,9 +481,9 @@ tdep_trace (unw_cursor_t *cursor, void **buffer, int *size) case UNW_X86_64_FRAME_STANDARD: /* Advance standard traceable frame. */ cfa = (f->cfa_reg_rsp ? rsp : rbp) + f->cfa_reg_offset; - ret = dwarf_get (d, DWARF_MEM_LOC (d, cfa - 8), &rip); - if (ret >= 0 && f->rbp_cfa_offset != -1) - ret = dwarf_get (d, DWARF_MEM_LOC (d, cfa + f->rbp_cfa_offset), &rbp); + ACCESS_MEM_FAST(ret, c->validate, d, cfa - 8, rip); + if (likely(ret >= 0) && likely(f->rbp_cfa_offset != -1)) + ACCESS_MEM_FAST(ret, c->validate, d, cfa + f->rbp_cfa_offset, rbp); /* Don't bother reading RSP from DWARF, CFA becomes new RSP. */ rsp = cfa; @@ -497,13 +497,12 @@ tdep_trace (unw_cursor_t *cursor, void **buffer, int *size) registers (ucontext) among other things. We know the info is stored at some unknown constant offset off inner frame's CFA. We determine the actual offset from DWARF unwind info. */ - d->use_prev_instr = 0; cfa = cfa + f->cfa_reg_offset; - ret = dwarf_get (d, DWARF_MEM_LOC (d, cfa + f->rbp_cfa_offset + dRIP), &rip); - if (ret >= 0) - ret = dwarf_get (d, DWARF_MEM_LOC (d, cfa + f->rbp_cfa_offset), &rbp); - if (ret >= 0) - ret = dwarf_get (d, DWARF_MEM_LOC (d, cfa + f->rsp_cfa_offset), &rsp); + ACCESS_MEM_FAST(ret, c->validate, d, cfa + f->rbp_cfa_offset + dRIP, rip); + if (likely(ret >= 0)) + ACCESS_MEM_FAST(ret, c->validate, d, cfa + f->rbp_cfa_offset, rbp); + if (likely(ret >= 0)) + ACCESS_MEM_FAST(ret, c->validate, d, cfa + f->rsp_cfa_offset, rsp); /* Resume stack at signal restoration point. The stack is not necessarily continuous here, especially with sigaltstack(). */ @@ -524,8 +523,8 @@ tdep_trace (unw_cursor_t *cursor, void **buffer, int *size) Debug (4, "new cfa 0x%lx rip 0x%lx rsp 0x%lx rbp 0x%lx\n", cfa, rip, rsp, rbp); - /* If we failed on ended up somewhere bogus, stop. */ - if (ret < 0 || rip < 0x4000) + /* If we failed or ended up somewhere bogus, stop. 
*/ + if (unlikely(ret < 0 || rip < 0x4000)) break; /* Record this address in stack trace. We skipped the first address. */ diff --git a/src/x86_64/init.h b/src/x86_64/init.h index f04ecda3..e80e5533 100644 --- a/src/x86_64/init.h +++ b/src/x86_64/init.h @@ -27,28 +27,42 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "unwind_i.h" +/* Avoid a trip to x86_64_r_uc_addr() for purely local initialisation. */ +#if defined UNW_LOCAL_ONLY && defined __linux +# define REG_INIT_LOC(c, rlc, ruc) \ + DWARF_LOC ((unw_word_t) &c->uc->uc_mcontext.gregs[REG_ ## ruc], 0) + +#elif defined UNW_LOCAL_ONLY && defined __FreeBSD__ +# define REG_INIT_LOC(c, rlc, ruc) \ + DWARF_LOC ((unw_word_t) &c->uc->uc_mcontext.mc_ ## rlc, 0) + +#else +# define REG_INIT_LOC(c, rlc, ruc) \ + DWARF_REG_LOC (&c->dwarf, UNW_X86_64_ ## ruc) +#endif + static inline int common_init (struct cursor *c, unsigned use_prev_instr) { int ret; - c->dwarf.loc[RAX] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_RAX); - c->dwarf.loc[RDX] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_RDX); - c->dwarf.loc[RCX] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_RCX); - c->dwarf.loc[RBX] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_RBX); - c->dwarf.loc[RSI] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_RSI); - c->dwarf.loc[RDI] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_RDI); - c->dwarf.loc[RBP] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_RBP); - c->dwarf.loc[RSP] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_RSP); - c->dwarf.loc[R8] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_R8); - c->dwarf.loc[R9] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_R9); - c->dwarf.loc[R10] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_R10); - c->dwarf.loc[R11] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_R11); - c->dwarf.loc[R12] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_R12); - c->dwarf.loc[R13] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_R13); - c->dwarf.loc[R14] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_R14); - c->dwarf.loc[R15] = DWARF_REG_LOC (&c->dwarf, UNW_X86_64_R15); - c->dwarf.loc[RIP] = 
DWARF_REG_LOC (&c->dwarf, UNW_X86_64_RIP); + c->dwarf.loc[RAX] = REG_INIT_LOC(c, rax, RAX); + c->dwarf.loc[RDX] = REG_INIT_LOC(c, rdx, RDX); + c->dwarf.loc[RCX] = REG_INIT_LOC(c, rcx, RCX); + c->dwarf.loc[RBX] = REG_INIT_LOC(c, rbx, RBX); + c->dwarf.loc[RSI] = REG_INIT_LOC(c, rsi, RSI); + c->dwarf.loc[RDI] = REG_INIT_LOC(c, rdi, RDI); + c->dwarf.loc[RBP] = REG_INIT_LOC(c, rbp, RBP); + c->dwarf.loc[RSP] = REG_INIT_LOC(c, rsp, RSP); + c->dwarf.loc[R8] = REG_INIT_LOC(c, r8, R8); + c->dwarf.loc[R9] = REG_INIT_LOC(c, r9, R9); + c->dwarf.loc[R10] = REG_INIT_LOC(c, r10, R10); + c->dwarf.loc[R11] = REG_INIT_LOC(c, r11, R11); + c->dwarf.loc[R12] = REG_INIT_LOC(c, r12, R12); + c->dwarf.loc[R13] = REG_INIT_LOC(c, r13, R13); + c->dwarf.loc[R14] = REG_INIT_LOC(c, r14, R14); + c->dwarf.loc[R15] = REG_INIT_LOC(c, r15, R15); + c->dwarf.loc[RIP] = REG_INIT_LOC(c, rip, RIP); ret = dwarf_get (&c->dwarf, c->dwarf.loc[RIP], &c->dwarf.ip); if (ret < 0) diff --git a/src/x86_64/unwind_i.h b/src/x86_64/unwind_i.h index 699a6b38..1e55a766 100644 --- a/src/x86_64/unwind_i.h +++ b/src/x86_64/unwind_i.h @@ -65,6 +65,18 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #define x86_64_r_uc_addr UNW_OBJ(r_uc_addr) #define x86_64_sigreturn UNW_OBJ(sigreturn) +/* Bypass calls to access_mem() when known to be safe. */ +#ifdef UNW_LOCAL_ONLY +# undef ACCESS_MEM_FAST +# define ACCESS_MEM_FAST(ret,validate,cur,addr,to) \ + do { \ + if (unlikely(validate)) \ + (ret) = dwarf_get ((cur), DWARF_MEM_LOC ((cur), (addr)), &(to)); \ + else \ + (ret) = 0, (to) = *(unw_word_t *)(addr); \ + } while (0) +#endif + extern void x86_64_local_addr_space_init (void); extern int x86_64_local_resume (unw_addr_space_t as, unw_cursor_t *cursor, void *arg);