From be47fefd981ca6afa0f8aefc08ba2f38db4a86e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?= Date: Thu, 2 Aug 2018 02:20:01 +0200 Subject: [PATCH] More DWARF details --- report/report.tex | 62 ++++++++++++++++++++++++++++++++++++++-- report/src/.gitignore | 1 + report/src/fib7/Makefile | 4 +++ report/src/fib7/fib7.c | 17 +++++++++++ report/src/fib7/fib7.fde | 5 ++++ shared/common.sty | 1 + shared/specific.sty | 3 ++ 7 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 report/src/.gitignore create mode 100644 report/src/fib7/Makefile create mode 100644 report/src/fib7/fib7.c create mode 100644 report/src/fib7/fib7.fde diff --git a/report/report.tex b/report/report.tex index 3d24b28..11c73d7 100644 --- a/report/report.tex +++ b/report/report.tex @@ -100,11 +100,69 @@ original programming language, correspondence of assembly instructions with a line in the original source file, \ldots The format also specifies a way to represent unwinding data, as described in the previous paragraph, in an ELF section originally called -\lstc{.debug_frame}, most often found as \lstc{.eh_frame}. +\lstc{.debug_frame}, most often found as \ehframe. + +For any binary, debugging information can easily get quite large if no +attention is paid to keeping it as compact as possible. In this matter, DWARF +does an excellent job, and everything is stored in a very compact way. This, +however, as we will see, makes it both difficult to parse correctly (with \eg{} +variable-length integers) and quite slow to interpret. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{DWARF unwinding data} -\todo{} + +The unwinding data, which we will call from now on the \ehframe, contains, for +each possible instruction pointer (that is, an instruction address within the +program), a set of ``registers'' that can be unwound, and a rule describing how +to do so. 
+ +The DWARF language is completely agnostic of the platform and ABI, and in +particular, is completely agnostic of a particular platform's registers. Thus, +when talking about DWARF, a register is merely a numerical identifier that is +often, but not necessarily, mapped to a real machine register by the ABI\@. + +In practice, this data takes the form of a collection of tables, one table per +Frame Description Entry (FDE), which most often corresponds to a function. Each +column of the table is a register (\eg{} \reg{rsp}), with two additional +special registers, CFA (Canonical Frame Address) and RA (Return Address), +containing respectively the base pointer of the current stack frame and the +return address of the current function (\ie{} for x86\_64, the unwound value of +\reg{rip}, the instruction pointer). Each row of the table is a particular +instruction pointer, within the instruction pointer range of the tabulated FDE +(assuming an FDE maps directly to a function, this range is simply the IP range +of the given function in the \lstc{.text} section of the binary), a row being +valid from its start IP to the start IP of the next row, or the end IP of the +FDE if it is the last row. + +\begin{minipage}{0.45\textwidth} + \lstinputlisting[language=C, firstline=3, lastline=12] + {src/fib7/fib7.c} +\end{minipage} \hfill \begin{minipage}{0.45\textwidth} + \lstinputlisting[language=C]{src/fib7/fib7.fde} +\end{minipage} + +For instance, the C source code above, when compiled with \lstbash{gcc -O0 +-fomit-frame-pointer}, gives the table at its right. During the function +prelude, \ie{} for $\mhex{675} \leq \reg{rip} < \mhex{679}$, the stack frame +only contains the return address, thus the CFA is 8 bytes above \reg{rsp} +(which was the value of \reg{rsp} before the call), and the return address is +precisely at \reg{rsp}. 
Then, 9 integers of 8 bytes each (8 for \lstc{fibo}, +one for \lstc{pos}) are allocated on the stack, which puts the CFA 80 bytes +above \reg{rsp}, and the return address still 8 bytes below the CFA\@. Then, by +the end of the function, the local variables are discarded and \reg{rsp} is +reset to its value from the first row. + +However, DWARF data isn't actually stored as a table in the binary files. The +first row has the location of the first IP in the FDE, and must define at least +its CFA\@. Then, when all relevant registers are defined, it is possible to +define a new row by providing a location offset (\eg{} here $4$), and the new +row is defined as a clone of the previous one, which can then be altered (\eg{} +here by setting \lstc{CFA} to $\reg{rsp} + 80$). This means that every line is +defined \wrt{} the previous one, and that the IPs of the successive rows cannot +be determined before evaluating every row before it. Thus, unwinding a frame from +an IP close to the end of the frame will require evaluating pretty much every +DWARF row in the table before reaching the relevant information, drastically +slowing down the unwinding process. 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{How big are FDEs?} diff --git a/report/src/.gitignore b/report/src/.gitignore new file mode 100644 index 0000000..a8a0dce --- /dev/null +++ b/report/src/.gitignore @@ -0,0 +1 @@ +*.bin diff --git a/report/src/fib7/Makefile b/report/src/fib7/Makefile new file mode 100644 index 0000000..da5a342 --- /dev/null +++ b/report/src/fib7/Makefile @@ -0,0 +1,4 @@ +all: fib7.bin + +fib7.bin: fib7.c + gcc -O1 $< -o $@ diff --git a/report/src/fib7/fib7.c b/report/src/fib7/fib7.c new file mode 100644 index 0000000..d01081d --- /dev/null +++ b/report/src/fib7/fib7.c @@ -0,0 +1,17 @@ +#include <stdio.h> + +int fib7() { + int fibo[8]; + fibo[0] = 1; + fibo[1] = 1; + for(int pos = 2; pos < 8; ++pos) + fibo[pos] = + fibo[pos - 1] + + fibo[pos - 2]; + return fibo[7]; +} + +int main(void) { + printf("%d\n", fib7()); + return 0; +} diff --git a/report/src/fib7/fib7.fde b/report/src/fib7/fib7.fde new file mode 100644 index 0000000..ddae410 --- /dev/null +++ b/report/src/fib7/fib7.fde @@ -0,0 +1,5 @@ +[...] FDE [...] pc=675..6f3 + LOC CFA ra +0000000000000675 rsp+8 c-8 +0000000000000679 rsp+80 c-8 +00000000000006f2 rsp+8 c-8 diff --git a/shared/common.sty b/shared/common.sty index a6b4c1c..0e5bd6d 100644 --- a/shared/common.sty +++ b/shared/common.sty @@ -2,6 +2,7 @@ \newcommand{\ie}{\textit{ie.}} \newcommand{\eg}{\textit{eg.}} +\newcommand{\wrt}{\textit{wrt.}} \newcommand{\set}[1]{\left\{ #1 \right\}} \newcommand{\card}[1]{\left\vert{} #1 \right\vert} diff --git a/shared/specific.sty b/shared/specific.sty index 5b1709d..380c85c 100644 --- a/shared/specific.sty +++ b/shared/specific.sty @@ -3,6 +3,9 @@ \newcommand{\prog}[1]{\texttt{#1}} \newcommand{\ehelf}{\texttt{eh\_elf}} \newcommand{\ehelfs}{\texttt{eh\_elfs}} +\newcommand{\ehframe}{\lstc{.eh_frame}} + +\newcommand{\mhex}[1]{0\texttt{x}#1} %% DWARF semantics \newcommand{\dwcfa}[1]{\texttt{DW\_CFA\_#1}}