From f7fe1c9a7eb30b8be0da37d2ed4082a8e0601ba3 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 28 Nov 2016 15:50:58 -0800 Subject: [PATCH] x86_64: Add stack alignment prologue tdep_trace fastpath GCC versions 4.9~current will often generate stack alignment prologues like: lea 0x8(%rsp),%r10 and $0xfffffffffffffff0,%rsp ... push %rbp mov %rsp, %rbp push %r10 resulting in dwarf expressions: DW_CFA_def_cfa_expression (DW_OP_breg6: -8; DW_OP_deref) DW_CFA_expression: r6 (rbp) (DW_OP_breg6: 0) These prologues seem to be generated for SSE/AVX code, but sometimes in other cases as well. The tdep_trace fastpath currently falls back to the slow dwarf parsing path if it encounters any cfa_expressions. Unfortunately this is happening often enough in our codebase to cause perf issues. We could also fix the fallback path (make the rs cache bigger, lock-free instead of locking, etc.), but that seems like a separate issue, and it will never be as fast as the tracing code. Our binaries each have at least ~100 functions in them like this. This patch teaches tdep_trace about the two specific cfa_expressions, which really just result in a single extra memory dereference of the stack at a fixed offset from rbp. 
--- include/dwarf.h | 5 ++++ include/tdep-x86_64/libunwind_i.h | 5 ++-- src/dwarf/Gexpr.c | 48 +++++++++++++++++++++++++++++++ src/x86_64/Gstash_frame.c | 23 ++++++++++++++- src/x86_64/Gtrace.c | 18 ++++++++++++ 5 files changed, 96 insertions(+), 3 deletions(-) diff --git a/include/dwarf.h b/include/dwarf.h index 633868b8..f493de85 100644 --- a/include/dwarf.h +++ b/include/dwarf.h @@ -387,6 +387,7 @@ struct dwarf_callback_data #define dwarf_put_unwind_info UNW_OBJ (dwarf_put_unwind_info) #define dwarf_put_unwind_info UNW_OBJ (dwarf_put_unwind_info) #define dwarf_eval_expr UNW_OBJ (dwarf_eval_expr) +#define dwarf_stack_aligned UNW_OBJ (dwarf_stack_aligned) #define dwarf_extract_proc_info_from_fde \ UNW_OBJ (dwarf_extract_proc_info_from_fde) #define dwarf_find_save_locs UNW_OBJ (dwarf_find_save_locs) @@ -419,6 +420,10 @@ extern void dwarf_put_unwind_info (unw_addr_space_t as, extern int dwarf_eval_expr (struct dwarf_cursor *c, unw_word_t *addr, unw_word_t len, unw_word_t *valp, int *is_register); +extern int +dwarf_stack_aligned(struct dwarf_cursor *c, unw_word_t cfa_addr, + unw_word_t rbp_addr, unw_word_t *offset); + extern int dwarf_extract_proc_info_from_fde (unw_addr_space_t as, unw_accessors_t *a, unw_word_t *fde_addr, diff --git a/include/tdep-x86_64/libunwind_i.h b/include/tdep-x86_64/libunwind_i.h index d19c7054..e1271c1f 100644 --- a/include/tdep-x86_64/libunwind_i.h +++ b/include/tdep-x86_64/libunwind_i.h @@ -40,6 +40,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ typedef enum { + UNW_X86_64_FRAME_ALIGNED = -3, /* frame stack pointer aligned */ UNW_X86_64_FRAME_STANDARD = -2, /* regular rbp, rsp +/- offset */ UNW_X86_64_FRAME_SIGRETURN = -1, /* special sigreturn frame */ UNW_X86_64_FRAME_OTHER = 0, /* not cacheable (special or unrecognised) */ @@ -50,10 +51,10 @@ unw_tdep_frame_type_t; typedef struct { uint64_t virtual_address; - int64_t frame_type : 2; /* unw_tdep_frame_type_t classification */ + int64_t frame_type : 3; /* unw_tdep_frame_type_t classification */ int64_t last_frame : 1; /* non-zero if last frame in chain */ int64_t cfa_reg_rsp : 1; /* cfa dwarf base register is rsp vs. rbp */ - int64_t cfa_reg_offset : 30; /* cfa is at this offset from base register value */ + int64_t cfa_reg_offset : 29; /* cfa is at this offset from base register value */ int64_t rbp_cfa_offset : 15; /* rbp saved at this offset from cfa (-1 = not saved) */ int64_t rsp_cfa_offset : 15; /* rsp saved at this offset from cfa (-1 = not saved) */ } diff --git a/src/dwarf/Gexpr.c b/src/dwarf/Gexpr.c index b56bb317..1d4974e2 100644 --- a/src/dwarf/Gexpr.c +++ b/src/dwarf/Gexpr.c @@ -186,6 +186,54 @@ read_operand (unw_addr_space_t as, unw_accessors_t *a, return ret; } +HIDDEN int +dwarf_stack_aligned(struct dwarf_cursor *c, unw_word_t cfa_addr, + unw_word_t rbp_addr, unw_word_t *cfa_offset) { + unw_accessors_t *a; + int ret; + void *arg; + unw_word_t len; + uint8_t opcode; + unw_word_t operand1; + + a = unw_get_accessors (c->as); + arg = c->as_arg; + + ret = dwarf_read_uleb128(c->as, a, &rbp_addr, &len, arg); + if (len != 2 || ret < 0) + return 0; + + ret = dwarf_readu8(c->as, a, &rbp_addr, &opcode, arg); + if (ret < 0 || opcode != DW_OP_breg6) + return 0; + + ret = read_operand(c->as, a, &rbp_addr, + OPND1_TYPE(operands[opcode]), &operand1, arg); + + if (ret < 0 || operand1 != 0) + return 0; + + ret = dwarf_read_uleb128(c->as, a, &cfa_addr, &len, arg); + if (ret < 0 || len != 3) + return 0; + + ret = dwarf_readu8(c->as, a, &cfa_addr, &opcode, 
arg); + if (ret < 0 || opcode != DW_OP_breg6) + return 0; + + ret = read_operand(c->as, a, &cfa_addr, + OPND1_TYPE(operands[opcode]), &operand1, arg); + if (ret < 0) + return 0; + + ret = dwarf_readu8(c->as, a, &cfa_addr, &opcode, arg); + if (ret < 0 || opcode != DW_OP_deref) + return 0; + + *cfa_offset = operand1; + return 1; +} + HIDDEN int dwarf_eval_expr (struct dwarf_cursor *c, unw_word_t *addr, unw_word_t len, unw_word_t *valp, int *is_register) diff --git a/src/x86_64/Gstash_frame.c b/src/x86_64/Gstash_frame.c index dc6c7c87..451b9fe3 100644 --- a/src/x86_64/Gstash_frame.c +++ b/src/x86_64/Gstash_frame.c @@ -41,6 +41,23 @@ tdep_stash_frame (struct dwarf_cursor *d, struct dwarf_reg_state *rs) rs->reg[RBP].where, rs->reg[RBP].val, DWARF_GET_LOC(d->loc[RBP]), rs->reg[RSP].where, rs->reg[RSP].val, DWARF_GET_LOC(d->loc[RSP])); + if (rs->reg[DWARF_CFA_REG_COLUMN].where == DWARF_WHERE_EXPR && + rs->reg[RBP].where == DWARF_WHERE_EXPR) { + /* Check for GCC generated alignment frame for rsp. 
A simple + * def_cfa_expr that loads a constant offset from rbp, where the + * addres of the rip was pushed on the stack */ + unw_word_t cfa_addr = rs->reg[DWARF_CFA_REG_COLUMN].val; + unw_word_t rbp_addr = rs->reg[RBP].val; + unw_word_t cfa_offset; + + int ret = dwarf_stack_aligned(d, cfa_addr, rbp_addr, &cfa_offset); + if (ret) { + f->frame_type = UNW_X86_64_FRAME_ALIGNED; + f->cfa_reg_offset = cfa_offset; + f->cfa_reg_rsp = 0; + } + } + /* A standard frame is defined as: - CFA is register-relative offset off RBP or RSP; - Return address is saved at CFA-8; @@ -50,7 +67,7 @@ tdep_stash_frame (struct dwarf_cursor *d, struct dwarf_reg_state *rs) && (rs->reg[DWARF_CFA_REG_COLUMN].where == DWARF_WHERE_REG) && (rs->reg[DWARF_CFA_REG_COLUMN].val == RBP || rs->reg[DWARF_CFA_REG_COLUMN].val == RSP) - && labs((long) rs->reg[DWARF_CFA_OFF_COLUMN].val) < (1 << 29) + && labs((long) rs->reg[DWARF_CFA_OFF_COLUMN].val) < (1 << 28) && DWARF_GET_LOC(d->loc[d->ret_addr_column]) == d->cfa-8 && (rs->reg[RBP].where == DWARF_WHERE_UNDEF || rs->reg[RBP].where == DWARF_WHERE_SAME @@ -92,6 +109,10 @@ tdep_stash_frame (struct dwarf_cursor *d, struct dwarf_reg_state *rs) Debug (4, " sigreturn frame\n"); } + else if (f->frame_type == UNW_X86_64_FRAME_ALIGNED) { + Debug (4, " aligned frame, offset %li\n", f->cfa_reg_offset); + } + /* PLT and guessed RBP-walked frames are handled in unw_step(). */ else Debug (4, " unusual frame\n"); diff --git a/src/x86_64/Gtrace.c b/src/x86_64/Gtrace.c index 833d7a78..74122710 100644 --- a/src/x86_64/Gtrace.c +++ b/src/x86_64/Gtrace.c @@ -506,6 +506,24 @@ tdep_trace (unw_cursor_t *cursor, void **buffer, int *size) d->use_prev_instr = 0; break; + case UNW_X86_64_FRAME_ALIGNED: + /* Address of RIP was pushed on the stack via a simple + * def_cfa_expr - result stack offset stored in cfa_reg_offset */ + cfa = (f->cfa_reg_rsp ? 
rsp : rbp) + f->cfa_reg_offset; + ACCESS_MEM_FAST(ret, c->validate, d, cfa, cfa); + if (likely(ret >= 0)) + ACCESS_MEM_FAST(ret, c->validate, d, cfa - 8, rip); + if (likely(ret >= 0)) + ACCESS_MEM_FAST(ret, c->validate, d, rbp, rbp); + + /* Don't bother reading RSP from DWARF, CFA becomes new RSP. */ + rsp = cfa; + + /* Next frame needs to back up for unwind info lookup. */ + d->use_prev_instr = 1; + + break; + default: /* We cannot trace through this frame, give up and tell the caller we had to stop. Data collected so far may still be