Input previously written dwarf semantics
This commit is contained in:
parent
2d9735b70f
commit
122c69b0ca
2 changed files with 295 additions and 0 deletions
|
@ -225,6 +225,284 @@ the relevant FDE from its start, until it finds the row it was seeking.
|
|||
\todo{}
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\section{DWARF semantics}
|
||||
|
||||
We will now define semantics covering most of the operations used for
|
||||
CFI\footnote{To be defined elsewhere in the report} described in the DWARF
|
||||
standard~\cite{dwarf5std}, with the exception of DWARF expressions. These are
|
||||
not exhaustively treated because they are quite rich and would take a lot of
|
||||
time and space to formalize, and in the meantime are only seldom used (see the
|
||||
DWARF statistics regarding this).
|
||||
|
||||
These semantics are defined with respect to the well-formalized C language, and
|
||||
are passing through an intermediary language. The DWARF language can read the
|
||||
whole memory, as well as registers, and is always executed for some instruction
|
||||
pointer. The C function representing it will thus take as parameters an array
|
||||
of the registers' values as well as an IP, and will return another array of
|
||||
registers values, which will represent the evaluated DWARF row.
|
||||
|
||||
\subsection{Original language~: DWARF instructions}
|
||||
|
||||
These are the DWARF instructions used for CFI description, that is, the
|
||||
instructions that contain the stack unwinding table informations. The following
|
||||
list is an exhaustive list of instructions from the DWARF5
|
||||
specification~\cite{dwarf5std} concerning CFI, with reworded descriptions for
|
||||
brevity and clarity. All these instructions are up to variants (most
|
||||
instructions exist in multiple formats to handle various operands formatting,
|
||||
to optimize space). Since we won't be talking about the underlying file format
|
||||
here, those variations between eg. \dwcfa{advance\_loc1} and
|
||||
\dwcfa{advance\_loc2} ---~which differ only on the number of bytes of their
|
||||
operand~--- are irrelevant and will be eluded.
|
||||
|
||||
\begin{itemize}
|
||||
\item{} \dwcfa{set\_loc(loc)}~:
|
||||
start a new table row from address $loc$
|
||||
\item{} \dwcfa{advance\_loc(delta)}~:
|
||||
start a new table row at address $prev\_loc + delta$
|
||||
\item{} \dwcfa{def\_cfa(reg, offset)}~:
|
||||
sets this row's CFA at $(\reg{reg} + \textit{offset})$
|
||||
\item{} \dwcfa{def\_cfa\_register(reg)}~:
|
||||
sets CFA at $(\reg{reg} + \textit{prev\_offset})$
|
||||
\item{} \dwcfa{def\_cfa\_offset(offset)}~:
|
||||
sets CFA at $(\reg{prev\_reg} + \textit{offset})$
|
||||
\item{} \dwcfa{def\_cfa\_expression(expr)}~:
|
||||
sets CFA as the result of $expr$
|
||||
\item{} \dwcfa{undefined(reg)}~:
|
||||
sets the register \reg{reg} as undefined in this row
|
||||
\item{} \dwcfa{same\_value(reg)}~:
|
||||
declares that the register \reg{reg} hasn't been touched, or was
|
||||
restored to its previous value, in this row. An unwinding procedure can
|
||||
leave it as-is.
|
||||
\item{} \dwcfa{offset(reg, offset)}~:
|
||||
the value of the register \reg{reg} is stored in memory at the address
|
||||
$CFA + \textit{offset}$.
|
||||
\item{} \dwcfa{val\_offset(reg, offset)}~:
|
||||
the value of the register \reg{reg} is the value $CFA + \textit{offset}$
|
||||
\item{} \dwcfa{register(reg, model)}~:
|
||||
the register \reg{reg} has, in this row, the value that $\reg{model}$
|
||||
had in the previous row
|
||||
\item{} \dwcfa{expression(reg, expr)}~:
|
||||
the value of \reg{reg} is stored in memory at the address defined by
|
||||
$expr$
|
||||
\item{} \dwcfa{val\_expression(reg, expr)}~:
|
||||
\reg{reg} has the value of $expr$
|
||||
\item{} \dwcfa{restore(reg)}~:
|
||||
\reg{reg} has the same value as in this FDE's preamble (CIE) in this
|
||||
row. This is \emph{not implemented in this semantics} for simplicity
|
||||
and brevity (we would have to introduce CIE (preamble) and FDE (body)
|
||||
independently). This is also not much used in actual ELF
|
||||
files\footnote{TODO: refer to stats}.
|
||||
\item{} \dwcfa{remember\_state()}~:
|
||||
push the state of all the registers of this row on an implicit stack
|
||||
\item{} \dwcfa{restore\_state()}~:
|
||||
pop an entry of the implicit stack, and restore all registers in this
|
||||
row to the value held in the stack record.
|
||||
\item{} \dwcfa{nop()}~:
|
||||
do nothing (padding)
|
||||
\end{itemize}
|
||||
|
||||
\subsection{Intermediary language $\intermedlang$}
|
||||
|
||||
A first pass will translate DWARF instructions into this intermediary language
|
||||
$\intermedlang$. It is designed to be more mathematical, representing the same
|
||||
thing, but abstracting all the data compression of the DWARF format away, so
|
||||
that we can better reason on it and transform it into C code.
|
||||
|
||||
Its grammar is as follows:
|
||||
|
||||
\begin{align*}
|
||||
\FDE &::= {\left(\mathbb{Z} \times \dwrow \right)}^{\ast}
|
||||
& \text{FDE (set of rows)} \\
|
||||
\dwrow &::= \values ^ \regs
|
||||
& \text{A single table row} \\
|
||||
\regs &::= \left\{0, 1, \ldots, \operatorname{NB\_REGS - 1} \right\}
|
||||
& \text{Machine registers} \\
|
||||
\values &::= \bot & \text{Values: undefined,}\\
|
||||
&\quad\vert~\valaddr{\spexpr} & \text{at address $x$},\\
|
||||
&\quad\vert~\valval{\spexpr} & \text{of value $x$} \\
|
||||
\spexpr &::= \regs \times \mathbb{Z}
|
||||
& \text{A ``simple'' expression $\reg{reg} + \textit{offset}$} \\
|
||||
\end{align*}
|
||||
|
||||
The entry point of the grammar is a $\FDE$, which is a set of rows, each
|
||||
annotated with a machine address, the address from which it is valid. Note that
|
||||
the addresses are necessarily increasing within a FDE\@.
|
||||
|
||||
Each row then represents, as a function mapping registers to values, a row of
|
||||
the unwinding table.
|
||||
|
||||
We implicitly consider that $\reg{reg}$ maps to a number, and we use here
|
||||
\texttt{x86\_64} names for convenience, but actually in DWARF registers are
|
||||
only handled as register identifiers, so we can safely state that $\reg{reg}
|
||||
\in \regs$.
|
||||
|
||||
A value can then be undefined, stored at memory address $x$ or be directly a
|
||||
value $x$, $x$ being here a simple expression consisting of $\reg{reg} +
|
||||
\textit{offset}$. The CFA is considered a simple register here. For instance, to
|
||||
define $\reg{rax}$ to the value contained in memory 16 bytes below the CFA, we
|
||||
would have $\reg{rax} \mapsto \valaddr{\reg{CFA}, -16}$ (for the stack grows
|
||||
downwards).
|
||||
|
||||
\subsection{Target language~: a C function body}
|
||||
|
||||
The target language of these semantics is a C function, to be interpreted with
|
||||
respect to the C11 standard~\cite{c11std}. The function is supposed to be run
|
||||
in the context of the program being unwound. In particular, it must be able to
|
||||
dereference some pointer derived from DWARF instructions that will point to the
|
||||
execution stack, or even the heap.
|
||||
|
||||
This function takes as arguments an instruction pointer ---~supposedly
|
||||
extracted from $\reg{rip}$~--- and an array of register values; and returns a
|
||||
fresh array of register values after unwinding this call frame. The function is
|
||||
compositional\footnote{up to technicities: the IP obtained after unwinding the
|
||||
first frame might be handled in a different dynamically loaded object, and this
|
||||
would require inspecting the DWARF located in another file}: it can be called
|
||||
twice in a row to unwind two stack frames.
|
||||
|
||||
The function is the following~:
|
||||
|
||||
\lstinputlisting[language=C]{src/dw_semantics/c_context.c}
|
||||
|
||||
The translation of $\intermedlang$ as produced by the later-defined function
|
||||
are then to be inserted in this context, where the comment states so.
|
||||
|
||||
\subsection{From DWARF to $\intermedlang$}
|
||||
|
||||
To define the interpretation of $\DWARF$ to $\intermedlang$, we will need to
|
||||
proceed forward, but, as the language inherently depends on the previous
|
||||
instructions to give a meaning to the following ones, we will depend on what
|
||||
was computed before. At a point of the interpretation $h \vert t$, where $t$ is
|
||||
what remains to be interpreted, $h$ what has been, and $H$ the result of the
|
||||
interpretation, it would thus look like $\llbracket t \rrbracket (H)$.
|
||||
|
||||
But we also need to keep track of this implicit stack DWARF uses, which will be
|
||||
kept in subscript.
|
||||
|
||||
\medskip
|
||||
|
||||
Thus, we define $\semI{\bullet}{s}(\bullet) : \DWARF \times \FDE \to \FDE$, for
|
||||
$s$ a stack of $\dwrow$, that is,
|
||||
\[
|
||||
s \in \rowstack := \dwrow^\ast
|
||||
\]
|
||||
|
||||
Implicitly, $\semI{\bullet}{} := \semI{\bullet}{\varepsilon}$
|
||||
|
||||
\medskip
|
||||
|
||||
For convenience, we define $\insarrow{reg}$, the operator changing the value of
|
||||
a register for a given value in the last row, as
|
||||
|
||||
\[
|
||||
\left(f \in \FDE\right) \insarrow{$r \in \regs$} (v \in values)
|
||||
\quad := \quad
|
||||
\left( f\left[0 \ldots |f| - 2\right] \right) \cdot \left\{
|
||||
\begin{array}{r l}
|
||||
r' \neq r &\mapsto \left(f[-1]\right)(r') \\
|
||||
r &\mapsto v \\
|
||||
\end{array} \right.
|
||||
\]
|
||||
|
||||
The same way, we define $\extrarrow{reg}$ that \emph{extracts} the rule
|
||||
currently applied for $\reg{reg}$, eg. $F \extrarrow{CFA} \valval{\reg{reg} +
|
||||
\text{off}}$. If the rule currently applied in such a case is \emph{not} of the
|
||||
form $\reg{reg} + \text{off}$, then the program is considered erroneous. One
|
||||
can see this $\extrarrow{reg}$ somehow as a \lstc{match} statement in OCaml,
|
||||
but with only one case, allowing to retrieve packed data.
|
||||
|
||||
More generally, we define ${\extrarrow{reg}}^{-k}$ as the same operation, but
|
||||
extracting in the $k$-older row, ie. ${\extrarrow{reg}}^{0}$ is the same as
|
||||
$\extrarrow{reg}$, and $F {\extrarrow{reg}}^{-1} \bullet$ is the same as
|
||||
$F\left[0 \ldots |F|-2\right] \extrarrow{reg} \bullet$.
|
||||
|
||||
\begin{align*}
|
||||
\semI{\varepsilon}{s}(F) &:= F \\
|
||||
\semI{\dwcfa{set\_loc(loc)} \cdot d}{s}(F) &:=
|
||||
\contsem{F \cdot \left(loc, F[-1].row \right)} \\
|
||||
\semI{\dwcfa{adv\_loc(delta)} \cdot d}{s}(F) &:=
|
||||
\contsem{F \cdot \left(F[-1].addr + delta, F[-1].row \right)} \\
|
||||
\semI{\dwcfa{def\_cfa(reg, offset)} \cdot d}{s}(F) &:=
|
||||
\contsem{F \insarrow{CFA} \valval{\reg{reg} + offset}} \\
|
||||
\semI{\dwcfa{def\_cfa\_register(reg)} \cdot d}{s}(F) &:=
|
||||
\text{let F }\extrarrow{CFA} \valval{\reg{oldreg} + \text{oldoffset}}
|
||||
\text{ in} \\
|
||||
&\quad \contsem{F \insarrow{CFA} \valval{\reg{reg} + oldoffset}} \\
|
||||
\semI{\dwcfa{def\_cfa\_offset(offset)} \cdot d}{s}(F) &:=
|
||||
\text{let F }\extrarrow{CFA} \valval{\reg{oldreg} + \text{oldoffset}}
|
||||
\text{ in} \\
|
||||
&\quad \contsem{F \insarrow{CFA} \valval{\reg{oldreg} + offset}} \\
|
||||
\semI{\dwcfa{def\_cfa\_expression(expr)} \cdot d}{s}(F) &:=
|
||||
\text{TO BE DEFINED} &\qtodo{CHECK ME?} \\
|
||||
\semI{\dwcfa{undefined(reg)} \cdot d}{s}(F) &:=
|
||||
\contsem{F \insarrow{reg} \bot} \\
|
||||
\semI{\dwcfa{same\_value(reg)} \cdot d}{s}(F) &:=
|
||||
\valval{\reg{reg}} \\
|
||||
\semI{\dwcfa{offset(reg, offset)} \cdot d}{s}(F) &:=
|
||||
\contsem{F \insarrow{reg} \valaddr{\reg{CFA} + \textit{offset}}} \\
|
||||
\semI{\dwcfa{val\_offset(reg, offset)} \cdot d}{s}(F) &:=
|
||||
\contsem{F \insarrow{reg} \valval{\reg{CFA} + \textit{offset}}} \\
|
||||
\semI{\dwcfa{register(reg, model)} \cdot d}{s}(F) &:=
|
||||
\text{let } F {\extrarrow{model}}^{-1} r \text{ in }
|
||||
\contsem{F \insarrow{reg} r} \\
|
||||
\semI{\dwcfa{expression(reg, expr)} \cdot d}{s}(F) &:=
|
||||
\text{TO BE DEFINED} &\qtodo{CHECK ME?}\\
|
||||
\semI{\dwcfa{val\_expression(reg, expr)} \cdot d}{s}(F) &:=
|
||||
\text{TO BE DEFINED} &\qtodo{CHECK ME?}\\
|
||||
% \semI{\dwcfa{restore(reg)} \cdot d}{s}(F) &:= \\ %% NOT IMPLEMENTED
|
||||
\semI{\dwcfa{remember\_state()} \cdot d}{s}(F) &:=
|
||||
\semI{d}{s \cdot F[-1].row}\left(F\right) \\
|
||||
\semI{\dwcfa{restore\_state()} \cdot d}{s \cdot t}(F) &:=
|
||||
\semI{d}{s}\left(F\left[0 \ldots |F|-2\right] \cdot
|
||||
\left(F[-1].addr, t\right) \right) \\
|
||||
\semI{\dwcfa{nop()} \cdot d}{s}(F) &:= \contsem{F}\\
|
||||
\end{align*}
|
||||
|
||||
(The stack is used for \texttt{remember\_state} and \texttt{restore\_state}. If
|
||||
we omit those two operations, we can plainly remove the stack).
|
||||
|
||||
|
||||
\subsection{From $\intermedlang$ to C}
|
||||
|
||||
\textit{This only defines the semantics, with respect to standard C, of DWARF
|
||||
as interpreted by \ehelf\@. The actual DWARF to C compiler is not implemented
|
||||
this way.}
|
||||
|
||||
\medskip
|
||||
|
||||
We now define $\semC{\bullet} : \DWARF \to C$, in the context presented
|
||||
earlier. The translation from $\intermedlang$ to C is defined as follows:
|
||||
|
||||
\begin{itemize}
|
||||
\item $\semC{\varepsilon} =$ \\
|
||||
\begin{lstlisting}[language=C, mathescape=true]
|
||||
else {
|
||||
for(int reg=0; reg < NB_REGS; ++reg)
|
||||
new_ctx[reg] = $\semR{\bot}$;
|
||||
}
|
||||
\end{lstlisting}
|
||||
|
||||
\item $\semC{(\text{loc}, \text{row}) \cdot t} = C\_code \cdot \semC{t}$,
|
||||
where $C\_code$ is
|
||||
\begin{lstlisting}[language=C, mathescape=true]
|
||||
if(ip >= $loc$) {
|
||||
for(int reg=0; reg < NB_REGS; ++reg)
|
||||
new_ctx[reg] = $\semR{row[reg]}$;
|
||||
goto end_ifs; // Avoid if/else if problems
|
||||
}
|
||||
\end{lstlisting}
|
||||
\end{itemize}
|
||||
|
||||
and $\semR{\bullet}$ is defined as
|
||||
\begin{align*}
|
||||
\semR{\bot} &= \text{\lstc{ERROR_VALUE}} \\
|
||||
\semR{\valaddr{\text{reg}, \textit{offset}}} &=
|
||||
\text{\lstc{*(old_ctx[reg] + offset)}} \\
|
||||
\semR{\valval{\text{reg}, \textit{offset}}} &=
|
||||
\text{\lstc{(old_ctx[reg] + offset)}} \\
|
||||
\end{align*}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\section{Stack unwinding data compilation}
|
||||
|
|
17
report/src/dw_semantics/c_context.c
Normal file
17
report/src/dw_semantics/c_context.c
Normal file
|
@ -0,0 +1,17 @@
|
|||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
|
||||
#define NB_REGS 32 /* put the number of registers of your platform here */
|
||||
|
||||
typedef uintptr_t* regs_t; // Array of size at least NB_REGS
|
||||
|
||||
regs_t unwind_frame(uintptr_t ip, regs_t old_ctx) {
|
||||
regs_t new_ctx = (regs_t) malloc(sizeof(uintptr_t) * NB_REGS);
|
||||
assert(new_ctx != NULL);
|
||||
|
||||
// ===== INSERT GENERATED CODE HERE =====
|
||||
|
||||
end_ifs:
|
||||
return new_ctx;
|
||||
}
|
Loading…
Reference in a new issue