diff --git a/Makefile b/Makefile index 06afc9e..09ded76 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ all: semantics.pdf -%.pdf: %.tex +%.pdf: %.tex %.bib latexmk -pdf $< upload: semantics.pdf diff --git a/semantics.bib b/semantics.bib new file mode 100644 index 0000000..694cfc7 --- /dev/null +++ b/semantics.bib @@ -0,0 +1,13 @@ +@manual{dwarf5std, + title = {DWARF Debugging Information Format version 5}, + organization = {DWARF Debugging Information Format Committee}, + author = {DWARF5}, + url = {http://dwarfstd.org}, + year = {2017}, +} + +@manual{c11std, + title = {ISO/IEC 9899:2011}, + organization = {International Organization for Standardization}, + author = {C11}, +} diff --git a/semantics.tex b/semantics.tex index 5d6e93d..3dbe6c9 100644 --- a/semantics.tex +++ b/semantics.tex @@ -10,10 +10,13 @@ \usepackage{stmaryrd} \usepackage{mathtools} \usepackage[utf8]{inputenc} +\usepackage[backend=biber,style=alphabetic]{biblatex} \usepackage{my_listings} \usepackage{my_hyperref} +\addbibresource{semantics.bib} + \newcommand{\dwcfa}[1]{\texttt{DW\_CFA\_#1}} \newcommand{\reg}[1]{\%#1} @@ -46,84 +49,169 @@ \begin{document} \maketitle -\section{DWARF CFI instructions} +We will now define semantics covering most of the operations used for +CFI\footnote{To be defined elsewhere in the report} described in the DWARF +standard~\cite{dwarf5std}, with the exception of DWARF expressions. These are +not exhaustively treated because they are quite rich and would take a lot of +time and space to formalize, and in the meantime are only seldom used (see the +DWARF statistics regarding this). -These instructions are up to variants (most instructions exist in multiple -formats to handle various operands formatting, to optimize space). +These semantics are defined with respect to the well-formalized C language, and +are passing through an intermediary language. The DWARF language can read the +whole memory, as well as registers, and is always executed for some instruction +pointer. The C function representing it will thus take as parameters an array +of the registers' values as well as an IP, and will return another array of +registers values, which will represent the evaluated DWARF row. + +\section{Original language~: DWARF instructions} + +These are the DWARF instructions used for CFI description, that is, the +instructions that contain the stack unwinding table informations. The following +list is an exhaustive list of instructions from the DWARF5 +specification~\cite{dwarf5std} concerning CFI, with reworded descriptions for +brevity and clarity. All these instructions are up to variants (most +instructions exist in multiple formats to handle various operands formatting, +to optimize space). Since we won't be talking about the underlying file format +here, those variations between eg. \dwcfa{advance\_loc1} and +\dwcfa{advance\_loc2} ---~which differ only on the number of bytes of their +operand~--- are irrelevant and will be eluded. \begin{itemize} \item{} \dwcfa{set\_loc(loc)}~: - new DWARF row at address $loc$ + start a new table row from address $loc$ \item{} \dwcfa{advance\_loc(delta)}~: - new DWARF row at address $prev\_loc + delta$ + start a new table row at address $prev\_loc + delta$ \item{} \dwcfa{def\_cfa(reg, offset)}~: - sets CFA at $(\reg{reg} + offset)$ + sets this row's CFA at $(\reg{reg} + \textit{offset})$ \item{} \dwcfa{def\_cfa\_register(reg)}~: - sets CFA at $(\reg{reg} + prev\_offset)$ + sets CFA at $(\reg{reg} + \textit{prev\_offset})$ \item{} \dwcfa{def\_cfa\_offset(offset)}~: - sets CFA at $(\reg{prev\_reg} + offset)$ + sets CFA at $(\reg{prev\_reg} + \textit{offset})$ \item{} \dwcfa{def\_cfa\_expression(expr)}~: - defines CFA as the result of $expr$ + sets CFA as the result of $expr$ \item{} \dwcfa{undefined(reg)}~: - sets \reg{reg} undefined in the current (last) row + sets the register \reg{reg} as undefined in this row \item{} \dwcfa{same\_value(reg)}~: - sets \reg{reg} in the current (last) row as it was in the previous row + declares that the register \reg{reg} hasn't been touched, or was + restored to its previous value, in this row. An unwinding procedure can + leave it as-is. \item{} \dwcfa{offset(reg, offset)}~: - \reg{reg} is stored in memory at the address $CFA + offset$ in the - current (last) row + the value of the register \reg{reg} is stored in memory at the address + $CFA + \textit{offset}$. \item{} \dwcfa{val\_offset(reg, offset)}~: - \reg{reg} is the value $CFA + offset$ in the current (last) row + the value of the register \reg{reg} is the value $CFA + \textit{offset}$ \item{} \dwcfa{register(reg, model)}~: - \reg{reg} has in the current (last) row the value that had - $\reg{model}$ in the previous row + the register \reg{reg} has, in this row, the value that $\reg{model}$ + had in the previous row \item{} \dwcfa{expression(reg, expr)}~: - \reg{reg} is stored in memory at the address defined by $expr$ + the value of \reg{reg} is stored in memory at the address defined by + $expr$ \item{} \dwcfa{val\_expression(reg, expr)}~: \reg{reg} has the value of $expr$ \item{} \dwcfa{restore(reg)}~: - \reg{reg} has the same value as in this FDE's preamble (CIE) in the - current (last) row. This is \emph{not implemented in this semantics} - for simplicity and brevity (we would have to introduce CIE (preamble) - and FDE (body) independently). + \reg{reg} has the same value as in this FDE's preamble (CIE) in this + row. This is \emph{not implemented in this semantics} for simplicity + and brevity (we would have to introduce CIE (preamble) and FDE (body) + independently). This is also not much used in actual ELF + files\footnote{TODO: refer to stats}. \item{} \dwcfa{remember\_state()}~: - push the state of all the registers of the current (last) row in an - implicit stack + push the state of all the registers of this row on an implicit stack \item{} \dwcfa{restore\_state()}~: - pop an entry of the implicit stack, and set all registers in the - current (last) row accordingly. + pop an entry of the implicit stack, and restore all registers in this + row to the value held in the stack record. \item{} \dwcfa{nop()}~: do nothing (padding) \end{itemize} -\pagebreak - \section{Intermediary language $\intermedlang$} +A first pass will translate DWARF instructions into this intermediary language +$\intermedlang$. It is designed to be more mathematical, representing the same +thing, but abstracting all the data compression of the DWARF format away, so +that we can better reason on it and transform it into C code. + +Its grammar is as follows: + \begin{align*} - \FDE &:= {\left(\mathbb{Z} \times \dwrow \right)}^{\ast} + \FDE &::= {\left(\mathbb{Z} \times \dwrow \right)}^{\ast} & \text{FDE (set of rows)} \\ - \dwrow &:= \values ^ \regs - & \text{A DWARF row} \\ - \spexpr &:= \regs \times \mathbb{Z} - & \text{A ``simple'' expression $\reg{reg} + offset$} \\ - \regs &:= \left\{0, 1, \ldots, \operatorname{NB\_REGS - 1} \right\} + \dwrow &::= \values ^ \regs + & \text{A single table row} \\ + \regs &::= \left\{0, 1, \ldots, \operatorname{NB\_REGS - 1} \right\} & \text{Machine registers} \\ - \values &:= \{\bot\} \cup \valaddr{\spexpr} \cup \valval{\spexpr} - & \text{Values: undefined, at address $x$, value $x$} \\ + \values &::= \bot & \text{Values: undefined,}\\ + &\quad\vert~\valaddr{\spexpr} & \text{at address $x$},\\ + &\quad\vert~\valval{\spexpr} & \text{of value $x$} \\ + \spexpr &::= \regs \times \mathbb{Z} + & \text{A ``simple'' expression $\reg{reg} + \textit{offset}$} \\ \end{align*} +The entry point of the grammar is a $\FDE$, which is a set of rows, each +annotated with a machine address, the address from which it is valid. Note that +the addresses are necessarily increasing within a FDE\@. + +Each row then represents, as a function mapping registers to values, a row of +the unwinding table. + +We implicitly consider that $\reg{reg}$ maps to a number, and we use here +\texttt{x86\_64} names for convenience, but actually in DWARF registers are +only handled as register identifiers, so we can safely state that $\reg{reg} +\in \regs$. + +A value can then be undefined, stored at memory address $x$ or be directly a +value $x$, $x$ being here a simple expression consisting of $\reg{reg} + +\textit{offset}$. The CFA is considered a simple register here. For instance, to +define $\reg{rax}$ to the value contained in memory 16 bytes below the CFA, we +would have $\reg{rax} \mapsto \valaddr{\reg{CFA}, -16}$ (for the stack grows +downwards). + +\section{Target language~: a C function body} + +The target language of these semantics is a C function, to be interpreted with +respect to the C11 standard~\cite{c11std}. The function is supposed to be run +in the context of the program being unwound. In particular, it must be able to +dereference some pointer derived from DWARF instructions that will point to the +execution stack, or even the heap. + +This function takes as arguments an instruction pointer ---~supposedly +extracted from $\reg{rip}$~--- and an array of register values; and returns a +fresh array of register values after unwinding this call frame. The function is +compositional\footnote{up to technicities: the IP obtained after unwinding the +first frame might be handled in a different dynamically loaded object, and this +would require inspecting the DWARF located in another file}: it can be called +twice in a row to unwind two stack frames. + +The function is the following~: + +\lstinputlisting[language=C]{src/c_context.c} + +The translation of $\intermedlang$ as produced by the later-defined function +are then to be inserted in this context, where the comment states so. \section{From DWARF to $\intermedlang$} -We define $\semI{\bullet}{s} : \DWARF \times \FDE \to \FDE$, for $s$ a stack of -$\dwrow$, that is, +To define the interpretation of $\DWARF$ to $\intermedlang$, we will need to +proceed forward, but, as the language inherently depends on the previous +instructions to give a meaning to the following ones, we will depend on what +was computed before. At a point of the interpretation $h \vert t$, where $t$ is +what remains to be interpreted, $h$ what has been, and $H$ the result of the +interpretation, it would thus look like $\llbracket t \rrbracket (H)$. -\begin{align*} - s \in \rowstack &:= \dwrow^\ast -\end{align*} +But we also need to keep track of this implicit stack DWARF uses, which will be +kept in subscript. + +\vspace{1em} + +Thus, we define $\semI{\bullet}{s}(\bullet) : \DWARF \times \FDE \to \FDE$, for +$s$ a stack of $\dwrow$, that is, +\[ + s \in \rowstack := \dwrow^\ast +\] Implicitly, $\semI{\bullet}{} := \semI{\bullet}{\varepsilon}$ +\vspace{1em} + For convenience, we define $\insarrow{reg}$, the operator changing the value of a register for a given value in the last row, as @@ -140,7 +228,9 @@ a register for a given value in the last row, as The same way, we define $\extrarrow{reg}$ that \emph{extracts} the rule currently applied for $\reg{reg}$, eg. $F \extrarrow{CFA} \valval{\reg{reg} + \text{off}}$. If the rule currently applied in such a case is \emph{not} of the -form $\reg{reg} + \text{off}$, then the program is considered erroneous. +form $\reg{reg} + \text{off}$, then the program is considered erroneous. One +can see this $\extrarrow{reg}$ somehow as a \lstc{match} statement in OCaml, +but with only one case, allowing to retrieve packed data. More generally, we define ${\extrarrow{reg}}^{-k}$ as the same operation, but extracting in the $k$-older row, ie. ${\extrarrow{reg}}^{0}$ is the same as @@ -168,11 +258,11 @@ $F\left[0 \ldots |F|-2\right] \extrarrow{reg} \bullet$. \semI{\dwcfa{undefined(reg)} \cdot d}{s}(F) &:= \contsem{F \insarrow{reg} \bot} \\ \semI{\dwcfa{same\_value(reg)} \cdot d}{s}(F) &:= - \valval{\reg{reg}} &\textbf{CHECK ME}\\ + \valval{\reg{reg}} \\ \semI{\dwcfa{offset(reg, offset)} \cdot d}{s}(F) &:= - \contsem{F \insarrow{reg} \valaddr{\reg{CFA} + \text{offset}}} \\ + \contsem{F \insarrow{reg} \valaddr{\reg{CFA} + \textit{offset}}} \\ \semI{\dwcfa{val\_offset(reg, offset)} \cdot d}{s}(F) &:= - \contsem{F \insarrow{reg} \valval{\reg{CFA} + \text{offset}}} \\ + \contsem{F \insarrow{reg} \valval{\reg{CFA} + \textit{offset}}} \\ \semI{\dwcfa{register(reg, model)} \cdot d}{s}(F) &:= \text{let } F {\extrarrow{model}}^{-1} r \text{ in } \contsem{F \insarrow{reg} r} \\ @@ -189,29 +279,20 @@ $F\left[0 \ldots |F|-2\right] \extrarrow{reg} \bullet$. \semI{\dwcfa{nop()} \cdot d}{s}(F) &:= \contsem{F}\\ \end{align*} -(The stack is used for \texttt{remember\_state}, \texttt{restore\_state}, -\texttt{restore}) +(The stack is used for \texttt{remember\_state} and \texttt{restore\_state}. If +we omit those two operations, we can plainly remove the stack). \section{From $\intermedlang$ to C} \textit{This only defines the semantics, with respect to standard C, of DWARF -as interpreted by \ehelf\@. It is not implemented this way.} +as interpreted by \ehelf\@. The actual DWARF to C compiler is not implemented +this way.} -We now define $\semC{\bullet} : \DWARF \to C$. +\vspace{1em} -These semantics are to be interpreted in the following context~: - -\lstinputlisting[language=C]{src/c_context.c} - -Then, the function \lstc{unwind_frame}, given an instruction pointer and the -state of the registers in the previous call frame, returns the state of -registers after the current call frame is unwinded. In other words, assuming -the C language is capable of formal operations, a call to -\lstc{unwind_frame(ip, {reg_0, reg_1, ..., reg_N-1})} should return something -equivalent to the corresponding DWARF row. - -The translation from $\intermedlang$ to C is defined as follows: +We now define $\semC{\bullet} : \DWARF \to C$, in the context presented +earlier. The translation from $\intermedlang$ to C is defined as follows: \begin{itemize} \item $\semC{\varepsilon} =$ \\ @@ -222,8 +303,8 @@ The translation from $\intermedlang$ to C is defined as follows: } \end{lstlisting} - \item $\semC{(\text{loc}, \text{row}) \cdot t} = C\_code + \semC{t}$, where - $C\_code$ is + \item $\semC{(\text{loc}, \text{row}) \cdot t} = C\_code \cdot \semC{t}$, + where $C\_code$ is \begin{lstlisting}[language=C, mathescape=true] if(ip >= $loc$) { for(int reg=0; reg < NB_REGS; ++reg) @@ -236,10 +317,11 @@ The translation from $\intermedlang$ to C is defined as follows: and $\semR{\bullet}$ is defined as \begin{align*} \semR{\bot} &= \text{\lstc{ERROR_VALUE}} \\ - \semR{\valaddr{\text{reg}, \text{offset}}} &= + \semR{\valaddr{\text{reg}, \textit{offset}}} &= \text{\lstc{*(old_ctx[reg] + offset)}} \\ - \semR{\valval{\text{reg}, \text{offset}}} &= + \semR{\valval{\text{reg}, \textit{offset}}} &= \text{\lstc{(old_ctx[reg] + offset)}} \\ \end{align*} +\printbibliography{} \end{document} diff --git a/src/c_context.c b/src/c_context.c index 6cbad00..77e0d7b 100644 --- a/src/c_context.c +++ b/src/c_context.c @@ -13,6 +13,5 @@ regs_t unwind_frame(uintptr_t ip, regs_t old_ctx) { // ===== INSERT GENERATED CODE HERE ===== end_ifs: - return new_ctx; }