Enhance all the writing

2018-07-18 23:46:43 +02:00 · 2018-07-18 23:46:43 +02:00 · cd0e521210
commit cd0e521210
parent ad2ff8d387
4 changed files with 160 additions and 66 deletions
--- a/2
+++ b/2
@ -1,6 +1,6 @@
 all: semantics.pdf

-%.pdf: %.tex
+%.pdf: %.tex %.bib
 	latexmk -pdf $<

 upload: semantics.pdf
--- a/semantics.bib
+++ b/semantics.bib
@ -0,0 +1,13 @@
+@manual{dwarf5std,
+    title           = {DWARF Debugging Information Format version 5},
+    organization    = {DWARF Debugging Information Format Committee},
+    author          = {DWARF5},
+    url             = {http://dwarfstd.org},
+    year            = {2017},
+}
+
+@manual{c11std,
+    title           = {ISO/IEC 9899:2011},
+    organization    = {International Organization for Standardization},
+    author          = {C11},
+}
--- a/semantics.tex
+++ b/semantics.tex
@ -10,10 +10,13 @@
 \usepackage{stmaryrd}
 \usepackage{mathtools}
 \usepackage[utf8]{inputenc}
+\usepackage[backend=biber,style=alphabetic]{biblatex}

 \usepackage{my_listings}
 \usepackage{my_hyperref}

+\addbibresource{semantics.bib}
+
 \newcommand{\dwcfa}[1]{\texttt{DW\_CFA\_#1}}
 \newcommand{\reg}[1]{\%#1}

@ -46,84 +49,169 @@
 \begin{document}
 \maketitle

-\section{DWARF CFI instructions}
+We will now define semantics covering most of the operations used for
+CFI\footnote{To be defined elsewhere in the report} described in the DWARF
+standard~\cite{dwarf5std}, with the exception of DWARF expressions. These are
+not exhaustively treated because they are quite rich and would take a lot of
+time and space to formalize, and in the meantime are only seldom used (see the
+DWARF statistics regarding this).

-These instructions are up to variants (most instructions exist in multiple
-formats to handle various operands formatting, to optimize space).
+These semantics are defined with respect to the well-formalized C language, and
+are passing through an intermediary language. The DWARF language can read the
+whole memory, as well as registers, and is always executed for some instruction
+pointer. The C function representing it will thus take as parameters an array
+of the registers' values as well as an IP, and will return another array of
+registers values, which will represent the evaluated DWARF row.
+
+\section{Original language~: DWARF instructions}
+
+These are the DWARF instructions used for CFI description, that is, the
+instructions that contain the stack unwinding table informations. The following
+list is an exhaustive list of instructions from the DWARF5
+specification~\cite{dwarf5std} concerning CFI, with reworded descriptions for
+brevity and clarity. All these instructions are up to variants (most
+instructions exist in multiple formats to handle various operands formatting,
+to optimize space). Since we won't be talking about the underlying file format
+here, those variations between eg. \dwcfa{advance\_loc1} and
+\dwcfa{advance\_loc2} ---~which differ only on the number of bytes of their
+operand~--- are irrelevant and will be eluded.

 \begin{itemize}
    \item{} \dwcfa{set\_loc(loc)}~:
-        new DWARF row at address $loc$
+        start a new table row from address $loc$
    \item{} \dwcfa{advance\_loc(delta)}~:
-        new DWARF row at address $prev\_loc + delta$
+        start a new table row at address $prev\_loc + delta$
    \item{} \dwcfa{def\_cfa(reg, offset)}~:
-        sets CFA at $(\reg{reg} + offset)$
+        sets this row's CFA at $(\reg{reg} + \textit{offset})$
    \item{} \dwcfa{def\_cfa\_register(reg)}~:
-        sets CFA at $(\reg{reg} + prev\_offset)$
+        sets CFA at $(\reg{reg} + \textit{prev\_offset})$
    \item{} \dwcfa{def\_cfa\_offset(offset)}~:
-        sets CFA at $(\reg{prev\_reg} + offset)$
+        sets CFA at $(\reg{prev\_reg} + \textit{offset})$
    \item{} \dwcfa{def\_cfa\_expression(expr)}~:
-        defines CFA as the result of $expr$
+        sets CFA as the result of $expr$
    \item{} \dwcfa{undefined(reg)}~:
-        sets \reg{reg} undefined in the current (last) row
+        sets the register \reg{reg} as undefined in this row
    \item{} \dwcfa{same\_value(reg)}~:
-        sets \reg{reg} in the current (last) row as it was in the previous row
+        declares that the register \reg{reg} hasn't been touched, or was
+        restored to its previous value, in this row. An unwinding procedure can
+        leave it as-is.
    \item{} \dwcfa{offset(reg, offset)}~:
-        \reg{reg} is stored in memory at the address $CFA + offset$ in the
-        current (last) row
+        the value of the register \reg{reg} is stored in memory at the address
+        $CFA + \textit{offset}$.
    \item{} \dwcfa{val\_offset(reg, offset)}~:
-        \reg{reg} is the value $CFA + offset$ in the current (last) row
+        the value of the register \reg{reg} is the value $CFA + \textit{offset}$
    \item{} \dwcfa{register(reg, model)}~:
-        \reg{reg} has in the current (last) row the value that had
-        $\reg{model}$ in the previous row
+        the register \reg{reg} has, in this row, the value that $\reg{model}$
+        had in the previous row
    \item{} \dwcfa{expression(reg, expr)}~:
-        \reg{reg} is stored in memory at the address defined by $expr$
+        the value of \reg{reg} is stored in memory at the address defined by
+        $expr$
    \item{} \dwcfa{val\_expression(reg, expr)}~:
        \reg{reg} has the value of $expr$
    \item{} \dwcfa{restore(reg)}~:
-        \reg{reg} has the same value as in this FDE's preamble (CIE) in the
-        current (last) row. This is \emph{not implemented in this semantics}
-        for simplicity and brevity (we would have to introduce CIE (preamble)
-        and FDE (body) independently).
+        \reg{reg} has the same value as in this FDE's preamble (CIE) in this
+        row. This is \emph{not implemented in this semantics} for simplicity
+        and brevity (we would have to introduce CIE (preamble) and FDE (body)
+        independently). This is also not much used in actual ELF
+        files\footnote{TODO: refer to stats}.
    \item{} \dwcfa{remember\_state()}~:
-        push the state of all the registers of the current (last) row in an
-        implicit stack
+        push the state of all the registers of this row on an implicit stack
    \item{} \dwcfa{restore\_state()}~:
-        pop an entry of the implicit stack, and set all registers in the
-        current (last) row accordingly.
+        pop an entry of the implicit stack, and restore all registers in this
+        row to the value held in the stack record.
    \item{} \dwcfa{nop()}~:
        do nothing (padding)
 \end{itemize}

-\pagebreak
-
 \section{Intermediary language $\intermedlang$}

+A first pass will translate DWARF instructions into this intermediary language
+$\intermedlang$. It is designed to be more mathematical, representing the same
+thing, but abstracting all the data compression of the DWARF format away, so
+that we can better reason on it and transform it into C code.
+
+Its grammar is as follows:
+
 \begin{align*}
-    \FDE &:= {\left(\mathbb{Z} \times \dwrow \right)}^{\ast}
+    \FDE &::= {\left(\mathbb{Z} \times \dwrow \right)}^{\ast}
        & \text{FDE (set of rows)} \\
-    \dwrow &:= \values ^ \regs
-        & \text{A DWARF row} \\
-    \spexpr &:= \regs \times \mathbb{Z}
-        & \text{A ``simple'' expression $\reg{reg} + offset$} \\
-    \regs &:= \left\{0, 1, \ldots, \operatorname{NB\_REGS - 1} \right\}
+    \dwrow &::= \values ^ \regs
+        & \text{A single table row} \\
+    \regs &::= \left\{0, 1, \ldots, \operatorname{NB\_REGS - 1} \right\}
        & \text{Machine registers} \\
-    \values &:= \{\bot\} \cup \valaddr{\spexpr} \cup \valval{\spexpr}
-        & \text{Values: undefined, at address $x$, value $x$} \\
+    \values &::= \bot & \text{Values: undefined,}\\
+        &\quad\vert~\valaddr{\spexpr} & \text{at address $x$},\\
+        &\quad\vert~\valval{\spexpr} & \text{of value $x$} \\
+    \spexpr &::= \regs \times \mathbb{Z}
+        & \text{A ``simple'' expression $\reg{reg} + \textit{offset}$} \\
 \end{align*}

+The entry point of the grammar is a $\FDE$, which is a set of rows, each
+annotated with a machine address, the address from which it is valid. Note that
+the addresses are necessarily increasing within a FDE\@.
+
+Each row then represents, as a function mapping registers to values, a row of
+the unwinding table.
+
+We implicitly consider that $\reg{reg}$ maps to a number, and we use here
+\texttt{x86\_64} names for convenience, but actually in DWARF registers are
+only handled as register identifiers, so we can safely state that $\reg{reg}
+\in \regs$.
+
+A value can then be undefined, stored at memory address $x$ or be directly a
+value $x$, $x$ being here a simple expression consisting of $\reg{reg} +
+\textit{offset}$. The CFA is considered a simple register here. For instance, to
+define $\reg{rax}$ to the value contained in memory 16 bytes below the CFA, we
+would have $\reg{rax} \mapsto \valaddr{\reg{CFA}, -16}$ (for the stack grows
+downwards).
+
+\section{Target language~: a C function body}
+
+The target language of these semantics is a C function, to be interpreted with
+respect to the C11 standard~\cite{c11std}. The function is supposed to be run
+in the context of the program being unwound. In particular, it must be able to
+dereference some pointer derived from DWARF instructions that will point to the
+execution stack, or even the heap.
+
+This function takes as arguments an instruction pointer ---~supposedly
+extracted from $\reg{rip}$~--- and an array of register values; and returns a
+fresh array of register values after unwinding this call frame. The function is
+compositional\footnote{up to technicities: the IP obtained after unwinding the
+first frame might be handled in a different dynamically loaded object, and this
+would require inspecting the DWARF located in another file}: it can be called
+twice in a row to unwind two stack frames.
+
+The function is the following~:
+
+\lstinputlisting[language=C]{src/c_context.c}
+
+The translation of $\intermedlang$ as produced by the later-defined function
+are then to be inserted in this context, where the comment states so.

 \section{From DWARF to $\intermedlang$}

-We define $\semI{\bullet}{s} : \DWARF \times \FDE \to \FDE$, for $s$ a stack of
-$\dwrow$, that is,
+To define the interpretation of $\DWARF$ to $\intermedlang$, we will need to
+proceed forward, but, as the language inherently depends on the previous
+instructions to give a meaning to the following ones, we will depend on what
+was computed before. At a point of the interpretation $h \vert t$, where $t$ is
+what remains to be interpreted, $h$ what has been, and $H$ the result of the
+interpretation, it would thus look like $\llbracket t \rrbracket (H)$.

-\begin{align*}
-    s \in \rowstack &:= \dwrow^\ast
-\end{align*}
+But we also need to keep track of this implicit stack DWARF uses, which will be
+kept in subscript.
+
+\vspace{1em}
+
+Thus, we define $\semI{\bullet}{s}(\bullet) : \DWARF \times \FDE \to \FDE$, for
+$s$ a stack of $\dwrow$, that is,
+\[
+    s \in \rowstack := \dwrow^\ast
+\]

 Implicitly, $\semI{\bullet}{} := \semI{\bullet}{\varepsilon}$

+\vspace{1em}
+
 For convenience, we define $\insarrow{reg}$, the operator changing the value of
 a register for a given value in the last row, as

@ -140,7 +228,9 @@ a register for a given value in the last row, as
 The same way, we define $\extrarrow{reg}$ that \emph{extracts} the rule
 currently applied for $\reg{reg}$, eg. $F \extrarrow{CFA} \valval{\reg{reg} +
 \text{off}}$. If the rule currently applied in such a case is \emph{not} of the
-form $\reg{reg} + \text{off}$, then the program is considered erroneous.
+form $\reg{reg} + \text{off}$, then the program is considered erroneous. One
+can see this $\extrarrow{reg}$ somehow as a \lstc{match} statement in OCaml,
+but with only one case, allowing to retrieve packed data.

 More generally, we define ${\extrarrow{reg}}^{-k}$ as the same operation, but
 extracting in the $k$-older row, ie. ${\extrarrow{reg}}^{0}$ is the same as
@ -168,11 +258,11 @@ $F\left[0 \ldots |F|-2\right] \extrarrow{reg} \bullet$.
    \semI{\dwcfa{undefined(reg)} \cdot d}{s}(F) &:=
        \contsem{F \insarrow{reg} \bot} \\
    \semI{\dwcfa{same\_value(reg)} \cdot d}{s}(F) &:=
-        \valval{\reg{reg}} &\textbf{CHECK ME}\\
+        \valval{\reg{reg}} \\
    \semI{\dwcfa{offset(reg, offset)} \cdot d}{s}(F) &:=
-        \contsem{F \insarrow{reg} \valaddr{\reg{CFA} + \text{offset}}} \\
+        \contsem{F \insarrow{reg} \valaddr{\reg{CFA} + \textit{offset}}} \\
    \semI{\dwcfa{val\_offset(reg, offset)} \cdot d}{s}(F) &:=
-        \contsem{F \insarrow{reg} \valval{\reg{CFA} + \text{offset}}} \\
+        \contsem{F \insarrow{reg} \valval{\reg{CFA} + \textit{offset}}} \\
    \semI{\dwcfa{register(reg, model)} \cdot d}{s}(F) &:=
        \text{let } F {\extrarrow{model}}^{-1} r \text{ in }
        \contsem{F \insarrow{reg} r} \\
@ -189,29 +279,20 @@ $F\left[0 \ldots |F|-2\right] \extrarrow{reg} \bullet$.
    \semI{\dwcfa{nop()} \cdot d}{s}(F) &:= \contsem{F}\\
 \end{align*}

-(The stack is used for \texttt{remember\_state}, \texttt{restore\_state},
-\texttt{restore})
+(The stack is used for \texttt{remember\_state} and \texttt{restore\_state}. If
+we omit those two operations, we can plainly remove the stack).


 \section{From $\intermedlang$ to C}

 \textit{This only defines the semantics, with respect to standard C, of DWARF
-as interpreted by \ehelf\@. It is not implemented this way.}
+as interpreted by \ehelf\@. The actual DWARF to C compiler is not implemented
+this way.}

-We now define $\semC{\bullet} : \DWARF \to C$.
+\vspace{1em}

-These semantics are to be interpreted in the following context~:
-
-\lstinputlisting[language=C]{src/c_context.c}
-
-Then, the function \lstc{unwind_frame}, given an instruction pointer and the
-state of the registers in the previous call frame, returns the state of
-registers after the current call frame is unwinded. In other words, assuming
-the C language is capable of formal operations, a call to
-\lstc{unwind_frame(ip, {reg_0, reg_1, ..., reg_N-1})} should return something
-equivalent to the corresponding DWARF row.
-
-The translation from $\intermedlang$ to C is defined as follows:
+We now define $\semC{\bullet} : \DWARF \to C$, in the context presented
+earlier. The translation from $\intermedlang$ to C is defined as follows:

 \begin{itemize}
    \item $\semC{\varepsilon} =$ \\
@ -222,8 +303,8 @@ The translation from $\intermedlang$ to C is defined as follows:
            }
        \end{lstlisting}

-    \item $\semC{(\text{loc}, \text{row}) \cdot t} = C\_code + \semC{t}$, where
-        $C\_code$ is
+    \item $\semC{(\text{loc}, \text{row}) \cdot t} = C\_code \cdot \semC{t}$,
+        where $C\_code$ is
        \begin{lstlisting}[language=C, mathescape=true]
            if(ip >= $loc$) {
                for(int reg=0; reg < NB_REGS; ++reg)
@ -236,10 +317,11 @@ The translation from $\intermedlang$ to C is defined as follows:
 and $\semR{\bullet}$ is defined as
 \begin{align*}
    \semR{\bot} &= \text{\lstc{ERROR_VALUE}} \\
-    \semR{\valaddr{\text{reg}, \text{offset}}} &=
+    \semR{\valaddr{\text{reg}, \textit{offset}}} &=
        \text{\lstc{*(old_ctx[reg] + offset)}} \\
-    \semR{\valval{\text{reg}, \text{offset}}} &=
+    \semR{\valval{\text{reg}, \textit{offset}}} &=
        \text{\lstc{(old_ctx[reg] + offset)}} \\
 \end{align*}

+\printbibliography{}
 \end{document}
--- a/src/c_context.c
+++ b/src/c_context.c
@ -13,6 +13,5 @@ regs_t unwind_frame(uintptr_t ip, regs_t old_ctx) {
    // ===== INSERT GENERATED CODE HERE =====

 end_ifs:
-
    return new_ctx;
 }