From be47fefd981ca6afa0f8aefc08ba2f38db4a86e0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?= <contact@tobast.fr>
Date: Thu, 2 Aug 2018 02:20:01 +0200
Subject: [PATCH] More DWARF details

---
 report/report.tex        | 62 ++++++++++++++++++++++++++++++++++++++--
 report/src/.gitignore    |  1 +
 report/src/fib7/Makefile |  4 +++
 report/src/fib7/fib7.c   | 17 +++++++++++
 report/src/fib7/fib7.fde |  5 ++++
 shared/common.sty        |  1 +
 shared/specific.sty      |  3 ++
 7 files changed, 91 insertions(+), 2 deletions(-)
 create mode 100644 report/src/.gitignore
 create mode 100644 report/src/fib7/Makefile
 create mode 100644 report/src/fib7/fib7.c
 create mode 100644 report/src/fib7/fib7.fde

diff --git a/report/report.tex b/report/report.tex
index 3d24b28..11c73d7 100644
--- a/report/report.tex
+++ b/report/report.tex
@@ -100,11 +100,69 @@ original programming language, correspondence of assembly instructions with a
 line in the original source file, \ldots
 The format also specifies a way to represent unwinding data, as described in
 the previous paragraph, in an ELF section originally called
-\lstc{.debug_frame}, most often found as \lstc{.eh_frame}.
+\lstc{.debug_frame}, most often found as \ehframe.
+
+For any binary, debugging information can easily get quite large if no
+attention is paid to keeping it as compact as possible. In this matter, DWARF
+does an excellent job, and everything is stored in a very compact way. This,
+however, as we will see, makes it both difficult to parse correctly (with \eg{}
+variable-length integers) and quite slow to interpret.
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \subsection{DWARF unwinding data}
-\todo{}
+
+The unwinding data, which we will call from now on the \ehframe, contains, for
+each possible instruction pointer (that is, an instruction address within the
+program), a set of ``registers'' that can be unwound, and a rule describing how
+to do so.
+
+The DWARF language is completely agnostic of the platform and ABI, and in
+particular, is completely agnostic of a particular platform's registers. Thus,
+when talking about DWARF, a register is merely a numerical identifier that is
+often, but not necessarily, mapped to a real machine register by the ABI\@.
+
+In practice, this data takes the form of a collection of tables, one table per
+Frame Description Entry (FDE), which most often corresponds to a function. Each
+column of the table is a register (\eg{} \reg{rsp}), with two additional
+special registers, CFA (Canonical Frame Address) and RA (Return Address),
+containing respectively the base pointer of the current stack frame and the
+return address of the current function (\ie{} for x86\_64, the unwound value of
+\reg{rip}, the instruction pointer). Each row of the table is a particular
+instruction pointer, within the instruction pointer range of the tabulated FDE
+(assuming an FDE maps directly to a function, this range is simply the IP range
+of the given function in the \lstc{.text} section of the binary), a row being
+valid from its start IP to the start IP of the next row, or the end IP of the
+FDE if it is the last row.
+
+\begin{minipage}{0.45\textwidth}
+    \lstinputlisting[language=C, firstline=3, lastline=12]
+        {src/fib7/fib7.c}
+\end{minipage} \hfill \begin{minipage}{0.45\textwidth}
+    \lstinputlisting[language=C]{src/fib7/fib7.fde}
+\end{minipage}
+
+For instance, the C source code above, when compiled with \lstbash{gcc -O0
+-fomit-frame-pointer}, gives the table on its right. During the function
+prelude, \ie{} for $\mhex{675} \leq \reg{rip} < \mhex{679}$, the stack frame
+only contains the return address, thus the CFA is 8 bytes above \reg{rsp}
+(which was the value of \reg{rsp} before the call), and the return address is
+precisely at \reg{rsp}. Then, 9 integers of 8 bytes each (8 for \lstc{fibo},
+one for \lstc{pos}) are allocated on the stack, which puts the CFA 80 bytes
+above \reg{rsp}, and the return address still 8 bytes below the CFA\@. Then, by
+the end of the function, the local variables are discarded and \reg{rsp} is
+reset to its value from the first row.
+
+However, DWARF data isn't actually stored as a table in the binary files. The
+first row has the location of the first IP in the FDE, and must define at least
+its CFA\@. Then, when all relevant registers are defined, it is possible to
+define a new row by providing a location offset (\eg{} here $4$), and the new
+row is defined as a clone of the previous one, which can then be altered (\eg{}
+here by setting \lstc{CFA} to $\reg{rsp} + 80$). This means that every line is
+defined \wrt{} the previous one, and that the IPs of the successive rows cannot
+be determined without evaluating every preceding row. Thus, unwinding a frame from
+an IP close to the end of the frame will require evaluating pretty much every
+DWARF row in the table before reaching the relevant information, drastically
+slowing down the unwinding process.
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \subsection{How big are FDEs?}
diff --git a/report/src/.gitignore b/report/src/.gitignore
new file mode 100644
index 0000000..a8a0dce
--- /dev/null
+++ b/report/src/.gitignore
@@ -0,0 +1 @@
+*.bin
diff --git a/report/src/fib7/Makefile b/report/src/fib7/Makefile
new file mode 100644
index 0000000..da5a342
--- /dev/null
+++ b/report/src/fib7/Makefile
@@ -0,0 +1,4 @@
+all: fib7.bin
+
+fib7.bin: fib7.c
+	gcc -O1 $< -o $@
diff --git a/report/src/fib7/fib7.c b/report/src/fib7/fib7.c
new file mode 100644
index 0000000..d01081d
--- /dev/null
+++ b/report/src/fib7/fib7.c
@@ -0,0 +1,17 @@
+#include <stdio.h>
+
+int fib7() {
+    int fibo[8];
+    fibo[0] = 1;
+    fibo[1] = 1;
+    for(int pos = 2; pos < 8; ++pos)
+        fibo[pos] =
+            fibo[pos - 1]
+            + fibo[pos - 2];
+    return fibo[7];
+}
+
+int main(void) {
+    printf("%d\n", fib7());
+    return 0;
+}
diff --git a/report/src/fib7/fib7.fde b/report/src/fib7/fib7.fde
new file mode 100644
index 0000000..ddae410
--- /dev/null
+++ b/report/src/fib7/fib7.fde
@@ -0,0 +1,5 @@
+[...] FDE [...] pc=675..6f3
+   LOC           CFA      ra
+0000000000000675 rsp+8    c-8
+0000000000000679 rsp+80   c-8
+00000000000006f2 rsp+8    c-8
diff --git a/shared/common.sty b/shared/common.sty
index a6b4c1c..0e5bd6d 100644
--- a/shared/common.sty
+++ b/shared/common.sty
@@ -2,6 +2,7 @@
 
 \newcommand{\ie}{\textit{ie.}}
 \newcommand{\eg}{\textit{eg.}}
+\newcommand{\wrt}{\textit{wrt.}}
 
 \newcommand{\set}[1]{\left\{ #1 \right\}}
 \newcommand{\card}[1]{\left\vert{} #1 \right\vert}
diff --git a/shared/specific.sty b/shared/specific.sty
index 5b1709d..380c85c 100644
--- a/shared/specific.sty
+++ b/shared/specific.sty
@@ -3,6 +3,9 @@
 \newcommand{\prog}[1]{\texttt{#1}}
 \newcommand{\ehelf}{\texttt{eh\_elf}}
 \newcommand{\ehelfs}{\texttt{eh\_elfs}}
+\newcommand{\ehframe}{\lstc{.eh_frame}}
+
+\newcommand{\mhex}[1]{0\texttt{x}#1}
 
 %% DWARF semantics
 \newcommand{\dwcfa}[1]{\texttt{DW\_CFA\_#1}}