Compare commits
13 commits
review-gui
...
master
Author | SHA1 | Date | |
---|---|---|---|
f1392fcd88 | |||
a901b04298 | |||
9961ced06f | |||
0adff1d484 | |||
c97ab68a0b | |||
4c0a6b272e | |||
225903591b | |||
46be751a37 | |||
3a924a3dea | |||
2950a42bf4 | |||
e174c79369 | |||
d4087865e6 | |||
df7252238e |
26 changed files with 758 additions and 162 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -137,3 +137,4 @@ sympy-plots-for-*.tex/
|
|||
# WinEdt
|
||||
*.bak
|
||||
*.sav
|
||||
*.xdv
|
||||
|
|
8
Makefile
8
Makefile
|
@ -1,6 +1,12 @@
|
|||
all: report
|
||||
all: report slides
|
||||
|
||||
.PHONY: report
|
||||
report:
|
||||
$(MAKE) -C report
|
||||
ln -sf report/report.pdf .
|
||||
|
||||
.PHONY: slides
|
||||
slides:
|
||||
$(MAKE) -C slides
|
||||
ln -sf slides/slides.pdf .
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
all: report.pdf
|
||||
|
||||
report.pdf: report.tex fiche_synthese.tex ../shared/report.bib
|
||||
latexmk -pdf $<
|
||||
latexmk -xelatex -pdf $<
|
||||
|
||||
clean:
|
||||
rm -f *aux *bbl *bcf *blg *_latexmk *fls *log *out *.run.xml
|
||||
|
|
|
@ -8,12 +8,13 @@
|
|||
|
||||
\subsection*{The general context}
|
||||
|
||||
The standard debugging data format, DWARF, contains tables that, for a given
|
||||
instruction pointer (IP), permit to understand how the assembly instruction
|
||||
relates to the source code, where variables are currently allocated in memory
|
||||
or if they are stored in a register, what are their type and how to unwind the
|
||||
current stack frame. This information is generated when passing \eg{} the
|
||||
switch \lstbash{-g} to \prog{gcc} or equivalents.
|
||||
The standard debugging data format, DWARF (Debugging With Attributed Record
|
||||
Formats), contains tables that make it possible, for a given instruction pointer (IP), to
|
||||
understand how instructions from the assembly code relate to the original
|
||||
source code, where variables are currently allocated in memory or if they are
|
||||
stored in a register, what their types are and how to unwind the current stack
|
||||
frame. This information is generated when passing \eg{} the switch \lstbash{-g}
|
||||
to \prog{gcc} or equivalents.
|
||||
|
||||
Even in stripped (non-debug) binaries, a small portion of DWARF data remains:
|
||||
the stack unwinding data. This information is necessary to unwind stack
|
||||
|
@ -28,7 +29,7 @@ Section~\ref{ssec:instr_cov}~\textendash, consisting in offsets from memory
|
|||
addresses stored in registers (such as \reg{rbp} or \reg{rsp}). Yet, the
|
||||
standard defines rules that take the form of a stack-machine expression that
|
||||
can access virtually all the process's memory and perform Turing-complete
|
||||
computation~\cite{oakley2011exploiting}.
|
||||
computations~\cite{oakley2011exploiting}.
|
||||
|
||||
\subsection*{The research problem}
|
||||
|
||||
|
@ -73,8 +74,8 @@ of compiled DWARF into existing projects have been made easy by implementing an
|
|||
alternative version of the \textit{de facto} standard library for this purpose,
|
||||
\prog{libunwind}.
|
||||
|
||||
Multiple approaches have been tried and evaluated to determine which
|
||||
compilation process leads to the best time/space trade-off.
|
||||
We explored and evaluated multiple approaches to determine which compilation
|
||||
process leads to the best time/space trade-off.
|
||||
|
||||
Unexpectedly, the part of the project that proved hardest was finding and
|
||||
implementing a benchmarking protocol that was both relevant and reliable.
|
||||
|
@ -83,8 +84,8 @@ few samples (around $10\,\mu s$ per frame) to avoid statistical errors. Having
|
|||
enough samples for this purpose --~at least a few thousands~-- is not easy,
|
||||
since one must avoid unwinding the same frame over and over again, which would
|
||||
only benchmark the caching mechanism. The other problem is to distribute
|
||||
evenly the unwinding measures across the various IPs, including directly into
|
||||
the loaded libraries (\eg{} the \prog{libc}).
|
||||
evenly the unwinding measures across the various IPs, among which those
|
||||
located directly in the loaded libraries (\eg{} the \prog{libc}).
|
||||
The solution eventually chosen was to modify \prog{perf}, the standard
|
||||
profiling program for Linux, in order to gather statistics and benchmarks of
|
||||
its unwindings. Modifying \prog{perf} was an additional challenge that turned
|
||||
|
@ -128,10 +129,10 @@ the reference implementation. Indeed, corner cases occur often, and on a 27000
|
|||
samples test, 885 failures were observed for \prog{libunwind}, against 1099 for
|
||||
the compiled DWARF version (see Section~\ref{ssec:timeperf}).
|
||||
|
||||
The implementation, however, is not production-ready: it only supports the
|
||||
The implementation, however, is not yet production-ready: it only supports the
|
||||
x86\_64 architecture, and relies to some extent on the Linux operating system.
|
||||
None of those are real problems in practice. Supporting other processor
|
||||
architectures and ABIs are only a matter of engineering,. The operating system
|
||||
None of these pose a fundamental problem. Supporting other processor
|
||||
architectures and ABIs is only a matter of engineering. The operating system
|
||||
dependency is only present in the libraries developed in order to interact with
|
||||
the compiled unwinding data, which can be developed for virtually any operating
|
||||
system.
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
\author{Th\'eophile Bastian\\
|
||||
Under supervision of Francesco Zappa Nardelli, March -- August 2018\\
|
||||
{\textsc{parkas}, \'Ecole Normale Sup\'erieure de Paris}}
|
||||
{\textsc{parkas}, \textsc{inria}}}
|
||||
|
||||
%\date{March -- August 2018\\August 20, 2018}
|
||||
\date{\vspace{-2em}}
|
||||
|
@ -108,7 +108,7 @@ the location of the return address. Then, the compiler might use \reg{rbp}
|
|||
the function, and allows for easy addressing of local variables. To some
|
||||
extent, it also allows for hot debugging, such as saving a useful core dump
|
||||
upon segfault. Yet, using \reg{rbp} to save \reg{rip} wastes a register, and
|
||||
the decision of using it is, on x86\_64 System V, up to the compiler.
|
||||
the decision of using it, on x86\_64 System V, is up to the compiler.
|
||||
|
||||
Usually, a function starts by subtracting some value from \reg{rsp}, allocating
|
||||
some space in the stack frame for its local variables. Then, it saves on the
|
||||
|
@ -150,7 +150,7 @@ compiler is free to do as it wishes. Even worse, it is not trivial to know
|
|||
callee-saved registers were at all, since if the function does not alter a
|
||||
register, it does not have to save it.
|
||||
|
||||
With this example, it seems pretty clear tha some additional data is necessary
|
||||
With this example, it seems pretty clear that some additional data is necessary
|
||||
to perform stack unwinding reliably, without resorting to mere guesswork. This
|
||||
data is stored along with the debugging information of a program, and one
|
||||
common format of debugging data is DWARF\@.
|
||||
|
@ -218,22 +218,23 @@ that is, $300\,\text{ms}$ per second of program run with default settings.
|
|||
|
||||
One of the causes that inspired this internship was also Stephen Kell's
|
||||
\prog{libcrunch}~\cite{kell2016libcrunch}, which makes a heavy use of stack
|
||||
unwinding through \prog{libunwind} and was forced to force \prog{gcc} to use a
|
||||
frame pointer (\reg{rbp}) everywhere through \lstbash{-fno-omit-frame-pointer}
|
||||
in order to mitigate the slowness.
|
||||
unwinding through \prog{libunwind} and had to force \prog{gcc} to use a frame
|
||||
pointer (\reg{rbp}) everywhere through \lstbash{-fno-omit-frame-pointer} in
|
||||
order to mitigate the slowness.
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{DWARF format}
|
||||
|
||||
The DWARF format was first standardized as the format for debugging information
|
||||
of the ELF executable binaries, which are standard on UNIX-like systems,
|
||||
including Linux and MacOS --~but not Windows. It is now commonly used across a
|
||||
wide variety of binary formats to store debugging information. As of now, the
|
||||
latest DWARF standard is DWARF 5~\cite{dwarf5std}, which is openly accessible.
|
||||
of the ELF executable binaries (Extensible Linking Format), which are standard
|
||||
on UNIX-like systems, including Linux and MacOS --~but not Windows. It is now
|
||||
commonly used across a wide variety of binary formats to store debugging
|
||||
information. As of now, the latest DWARF standard is DWARF 5~\cite{dwarf5std},
|
||||
which is openly accessible.
|
||||
|
||||
The DWARF data commonly includes type information about the variables in the
|
||||
original programming language, correspondence of assembly instructions with a
|
||||
line in the original source file, \ldots
|
||||
line in the original source file, \ldots{}
|
||||
The format also specifies a way to represent unwinding data, as described in
|
||||
Section~\ref{ssec:stack_unwinding} above, in an ELF section originally called
|
||||
\lstc{.debug_frame}, but most often found as \ehframe.
|
||||
|
@ -397,27 +398,21 @@ parse the relevant FDE from its start, until it finds the row it was seeking.
|
|||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\section{DWARF semantics}\label{sec:semantics}
|
||||
|
||||
We now define semantics covering the operations used for FDEs described in the
|
||||
DWARF standard~\cite{dwarf5std}, such as seen in Listing~\ref{lst:ex1_dwraw},
|
||||
with the exception of DWARF expressions. These are not treated here, because
|
||||
they form a rich language and would take a lot of time and space to formalize,
|
||||
while in the mean time being seldom used --~see Section~\ref{ssec:instr_cov}.
|
||||
The DWARF 5 standard~\cite{dwarf5std} is written in English prose, and our
|
||||
first task is to formalize it. Thus, in this section, we first recall the
|
||||
informal behaviour of DWARF instructions as provided by the standard; and then
|
||||
we formalize their semantics by mapping them to well-defined C code. We omit
|
||||
the translation of DWARF expressions, because they form a rich language and
|
||||
would take a lot of time and space to formalize, while at the same time being
|
||||
seldom used --~see Section~\ref{ssec:instr_cov}.
|
||||
|
||||
These semantics are defined \wrt{} the well-formalized C language, and
|
||||
are passing through an intermediary language. The DWARF language can read the
|
||||
pass through an intermediate language. The DWARF language can read the
|
||||
whole memory, as well as registers, and is always executed for some instruction
|
||||
pointer. The C function representing it thus takes as parameters an array
|
||||
of the registers' values as well as an IP, and returns another array of
|
||||
registers values, which represents the evaluated DWARF row.
|
||||
|
||||
\subsection{Concerning correctness}\label{ssec:sem_correctness}
|
||||
|
||||
The semantics described in this section are designed in a concern of
|
||||
\emph{formalization} of the original standard. This standard, sadly, only
|
||||
describes in plain English each instruction's action and result. This basis
|
||||
cannot be used to \emph{prove} anything correct without relying on informal
|
||||
interpretations.
|
||||
|
||||
\subsection{Original language: DWARF instructions}
|
||||
|
||||
These are the DWARF instructions used for CFI description, that is, the
|
||||
|
@ -486,7 +481,7 @@ a language.
|
|||
|
||||
\subsection{Intermediate language $\intermedlang$}
|
||||
|
||||
A first pass translates DWARF instructions into this intermediary language
|
||||
A first pass translates DWARF instructions into this intermediate language
|
||||
$\intermedlang$. It is designed to be more mathematical, representing the same
|
||||
thing, but abstracting all the data compression of the DWARF format away, so
|
||||
that we can better reason on it and transform it into C code.
|
||||
|
@ -503,7 +498,7 @@ Its grammar is as follows:
|
|||
\values &::= \bot & \text{Values: undefined,}\\
|
||||
&\quad\vert~\valaddr{\spexpr} & \text{at address $x$},\\
|
||||
&\quad\vert~\valval{\spexpr} & \text{of value $x$} \\
|
||||
&\quad\vert~\valexpr{??} & \text{of expression $x$, see in text} \\
|
||||
&\quad\vert~\valexpr{} & \text{of expression $x$, see in text} \\
|
||||
\spexpr &::= \regs \times \mathbb{Z}
|
||||
& \text{A ``simple'' expression $\reg{reg} + \textit{offset}$} \\
|
||||
\end{align*}
|
||||
|
@ -614,7 +609,7 @@ $f$. If we consider the following fictive row $R_0$,
|
|||
\end{array}\right.
|
||||
\]
|
||||
|
||||
then, we would have
|
||||
\noindent{}then, we would have
|
||||
|
||||
\[
|
||||
R \insarrow{\reg{rbx}} \left(\valaddr{\reg{rip - 24}}\right)
|
||||
|
@ -701,7 +696,7 @@ if(ip >= $loc$) {
|
|||
} \end{lstlisting}
|
||||
\end{itemize}
|
||||
|
||||
while $\semR{\bullet}$ is defined as
|
||||
\noindent{}while $\semR{\bullet}$ is defined as
|
||||
\begin{align*}
|
||||
\semR{\bot} &\eqspace{}
|
||||
\text{\lstc{ERROR_VALUE}} \\
|
||||
|
@ -711,6 +706,16 @@ while $\semR{\bullet}$ is defined as
|
|||
\text{\lstc{(old_ctx[reg] + offset)}} \\
|
||||
\end{align*}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{Concerning correctness}\label{ssec:sem_correctness}
|
||||
|
||||
The semantics described in this section are designed in a concern of
|
||||
\emph{formalization} of the original standard. This standard, sadly, only
|
||||
describes in plain English each instruction's action and result. This basis
|
||||
cannot be used to \emph{prove} anything correct without relying on informal
|
||||
interpretations.
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\section{Stack unwinding data compilation}
|
||||
|
@ -721,12 +726,12 @@ actual C implementation.
|
|||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{Code availability}\label{ssec:code_avail}
|
||||
|
||||
All the code produced during this internship is available on the various
|
||||
repositories from \url{https://git.tobast.fr/m2-internship/}. The repositories
|
||||
contain \texttt{README} files describing them; a summary and global description
|
||||
can be found in the \texttt{abstract} repository. This should be detailed
|
||||
enough to run the project. The source code is entirely under free software
|
||||
licenses.
|
||||
All the code produced during the course of this internship is available on the
|
||||
various repositories from \url{https://git.tobast.fr/m2-internship/}. The
|
||||
repositories contain \texttt{README} files describing them; a summary and
|
||||
global description can be found in the \texttt{abstract} repository. This
|
||||
should be detailed enough to run the project. The source code is entirely under
|
||||
free software licenses.
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{Compilation: \ehelfs}\label{ssec:ehelfs}
|
||||
|
@ -772,8 +777,12 @@ would do after a \lstbash{frame n} command. Yet, if one was to enhance the
|
|||
code to handle every register, it would not be much harder and would probably
|
||||
be only a few hours worth of code refactoring and rewriting.
|
||||
|
||||
\lstinputlisting[language=C, caption={Unwinding context}, label={lst:unw_ctx}]
|
||||
{src/dwarf_assembly_context/unwind_context.c}
|
||||
\begin{figure}[h]
|
||||
\centering{}
|
||||
\lstinputlisting[language=C, caption={Unwinding context},
|
||||
label={lst:unw_ctx}]
|
||||
{src/dwarf_assembly_context/unwind_context.c}
|
||||
\end{figure}
|
||||
|
||||
In the unwind context from Listing~\ref{lst:unw_ctx}, the values of type
|
||||
\lstc{uintptr_t} are the values of the corresponding registers, and
|
||||
|
@ -804,10 +813,11 @@ scattered among various \ehelf{} files, one for each shared object loaded
|
|||
unwinder must first acquire a \emph{memory map}, a table listing the various
|
||||
ELF files loaded and \emph{mapped} in memory, and on which memory segment. This
|
||||
memory map is provided by the operating system --~for instance, on Linux, it is
|
||||
available as a file in \texttt{/proc}. Once this map is acquired, when
|
||||
unwinding from a given IP, the unwinder must identify the memory segment from
|
||||
which it comes, deduce the source ELF file, and deduce the corresponding
|
||||
\ehelf.
|
||||
available as a file in \texttt{/proc}, a special part of the file system that
|
||||
the kernel uses to communicate with the userland processes. Once this map is
|
||||
acquired, when unwinding from a given IP, the unwinder must identify the memory
|
||||
segment from which it comes, deduce the source ELF file, and deduce the
|
||||
corresponding \ehelf.
|
||||
|
||||
\medskip
|
||||
|
||||
|
@ -830,7 +840,7 @@ well on the standard cases that are easily tested, and can be used to unwind
|
|||
the stack of simple programs.
|
||||
|
||||
The major drawback of this approach, without any particular care taken, is the
|
||||
space waste. The space taken by those tentative \ehelfs{} is analyzed in
|
||||
waste of space. The space taken by those tentative \ehelfs{} is analyzed in
|
||||
Table~\ref{table:basic_eh_elf_space} for \prog{hackbench}, a small program
|
||||
introduced later in Section~\ref{ssec:bench_perf}, and the libraries on which
|
||||
it depends.
|
||||
|
@ -873,21 +883,21 @@ the original program size ($65\,\%$).
|
|||
|
||||
A lot of small space optimizations, such as filtering out empty FDEs, merging
|
||||
together the rows that are equivalent on all the registers kept, etc.\ were
|
||||
made in order to shrink the \ehelfs.
|
||||
made in order to shrink the size of the \ehelfs.
|
||||
|
||||
\medskip
|
||||
|
||||
The major optimization that most reduced the output size was to use an if/else
|
||||
tree implementing a binary search on the instruction pointer relevant
|
||||
intervals, instead of a single monolithic switch. In the process, we also
|
||||
\emph{outline} code whenever possible, that is, find out identical ``switch
|
||||
cases'' bodies --~which are not switch cases anymore, but \texttt{if}
|
||||
bodies~--, move them outside of the if/else tree, identify them by a label, and
|
||||
jump to them using a \lstc{goto}, which de-duplicates a lot of code and
|
||||
contributes greatly to the shrinking. In the process, we noticed that the vast
|
||||
majority of FDE rows are actually taken among very few ``common'' FDE rows. For
|
||||
instance, in the \prog{libc}, out of a total of $20827$ rows, only $302$
|
||||
($1.5\,\%$) unique rows remain after the outlining.
|
||||
The optimization that most reduced the output size was to use an if/else tree
|
||||
implementing a binary search on the instruction pointer relevant intervals,
|
||||
instead of a single monolithic switch. In the process, we also \emph{outline}
|
||||
code whenever possible, that is, find out identical ``switch cases'' bodies
|
||||
--~which are not switch cases anymore, but \texttt{if} bodies~--, move them
|
||||
outside of the if/else tree, identify them by a label, and jump to them using a
|
||||
\lstc{goto}, which de-duplicates a lot of code and contributes greatly to the
|
||||
shrinking. In the process, we noticed that the vast majority of FDE rows are
|
||||
actually taken among very few ``common'' FDE rows. For instance, in the
|
||||
\prog{libc}, out of a total of $20827$ rows, only $302$ ($1.5\,\%$) unique rows
|
||||
remain after the outlining.
|
||||
|
||||
This makes this optimization really efficient, as seen later in
|
||||
Section~\ref{ssec:results_size}, but also makes it an interesting question
|
||||
|
@ -995,7 +1005,8 @@ The program that was chosen for \prog{perf}-benchmarking is
|
|||
\prog{hackbench}~\cite{hackbenchsrc}. This small program is designed to
|
||||
stress-test and benchmark the Linux scheduler by spawning processes or threads
|
||||
that communicate with each other. It has the interest of generating stack
|
||||
activity, be linked against \prog{libc} and \prog{pthread}, and be very light.
|
||||
activity, being linked against \prog{libc} and \prog{pthread}, and being very
|
||||
light.
|
||||
|
||||
\medskip
|
||||
|
||||
|
@ -1055,7 +1066,8 @@ CSmith code is notoriously hard to understand and edit.
|
|||
All the measures in this report were made on a computer with an Intel Xeon
|
||||
E3-1505M v6 CPU, with a clock frequency of $3.00$\,GHz and 8 cores. The
|
||||
computer has 32\,GB of RAM, and care was taken never to fill it and start
|
||||
swapping.
|
||||
swapping --~using the hard drive to store data instead of the RAM when it is
|
||||
full, which severely degrades performance.
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{Measured time performance}\label{ssec:timeperf}
|
||||
|
@ -1120,7 +1132,8 @@ The compilation time of \ehelfs{} is also reasonable. On the machine
|
|||
described in Section~\ref{ssec:bench_hw}, and without using multiple cores to
|
||||
compile, the various shared objects needed to run \prog{hackbench} --~that is,
|
||||
\prog{hackbench}, \prog{libc}, \prog{ld} and \prog{libpthread}~-- are compiled
|
||||
in an overall time of $25.28$ seconds.
|
||||
in an overall time of $25.28$ seconds, which a developer is probably prepared
|
||||
to wait for.
|
||||
|
||||
The unwinding errors observed are hard to investigate, but are most probably
|
||||
due to truncated stack records. Indeed, since \prog{perf} dumps the last $n$
|
||||
|
@ -1178,7 +1191,7 @@ registers represent most columns --~see Section~\ref{ssec:instr_cov}.
|
|||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{Instructions coverage}\label{ssec:instr_cov}
|
||||
|
||||
In order to determine which DWARF instructions are necessary to implement to
|
||||
In order to determine which DWARF instructions should be implemented to
|
||||
have meaningful results, as well as to assess the instruction coverage of our
|
||||
compiler and \ehelfs, we must look at real-world ELF files and inspect the
|
||||
instructions used.
|
||||
|
@ -1292,6 +1305,39 @@ It is also worth noting that among all of the 4000 analyzed files, all the
|
|||
unsupported expressions are clustered in only 12 of them, and only 24 contained
|
||||
unsupported instructions at all.
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\section*{Conclusion}
|
||||
|
||||
From this data, we can deduce that
|
||||
|
||||
\begin{itemize}[itemsep=3pt, parsep=0pt]
|
||||
|
||||
\item compilation of the DWARF unwinding data is effective to speed up
|
||||
unwinding procedures drastically: speedup of $\times 25.9$;
|
||||
|
||||
\item code outlining is effective at reducing the produced binary size: from
|
||||
$1\ \text{MiB}$ to $370\ \text{KiB}$, from a growth factor of $7$
|
||||
compared to DWARF unwinding data to a growth factor of $2.45$;
|
||||
|
||||
\item unwinding relies on a small subset of DWARF instructions and
|
||||
expressions, while most instructions are not used at all in DWARF code
|
||||
produced by compilers.
|
||||
|
||||
\end{itemize}
|
||||
|
||||
The overall size of the project is
|
||||
|
||||
\begin{itemize}[itemsep=3pt, parsep=0pt]
|
||||
\item compiler: 1628 lines,
|
||||
\item \prog{libunwind}: 810 lines,
|
||||
\item \prog{perf}: 222 lines
|
||||
\end{itemize}
|
||||
|
||||
\noindent{} for a total of 2660 lines of code on the main project. The various
|
||||
statistics, benchmarking, testing and analyzing code modules add up to around
|
||||
1500 more lines.
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%%%% End main text content %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
|
|
@ -1,86 +0,0 @@
|
|||
# fiche_synthese.tex:
|
||||
|
||||
l11 : Toujours donner le vrai nom quand tu donnes un acronyme pour la première fois.
|
||||
l12 : s/permit/permits non ?
|
||||
l12 : the assembly instruction, explique un peu. Là on sait pas si tu parles de l'assemblée nationale ou bien d'une instruction assembleur :P
|
||||
l13 : where are variables currently located
|
||||
l14 : explique rapidement ce que veut dire unwind ?
|
||||
l31 : s/computation/computations non ?
|
||||
|
||||
l76 : have been tested il me semble.
|
||||
l87-88 : Je comprends pas. C'est pas included plutôt que including ?
|
||||
Dans cette suite de paragraphes : Beaucoup de redondance, regarde les débuts
|
||||
des deux derniers : The implmentation is not yet release-ready et juste après
|
||||
The implementation is not production-ready. Pareil pour les infos des deux §
|
||||
précédents, j'ai l'impression de lire 2 fois de suite la même chose.
|
||||
|
||||
l134 : supprime la , ou le . qui sont adjacents.
|
||||
|
||||
# report.tex
|
||||
|
||||
l77 : s/whose/which/
|
||||
l88 : onto the stack in reversed order iirc.
|
||||
l111 : the decision of using it is up to the compiler on x86\_64 System V
|
||||
l113 : s/subtracting/substracting
|
||||
l153 : s/tha/that
|
||||
l195 : donner un lien vers la stack-unwinding library ?
|
||||
l215 pour le torbrowser, euh tu fais ça comment ? Parce c'est quand même
|
||||
majoritairement un gros bout de python autour d'un firefox un tout petit peu
|
||||
moddé. Il n'y a pas un autre exemple ?
|
||||
l221 : s/was forced to/ had to/ pour éviter la redondance non ?
|
||||
l229 : idem que pour DWARF, donner le vrai nom de l'acronyme.
|
||||
l236-237 : ,\ldots{} et non , \ldots. L'avantage de passer l'argument vide,
|
||||
c'est que ça te mets le bon espacement après :P
|
||||
|
||||
## subsection 1.6 : How big are FDEs?
|
||||
|
||||
C'est très (trop ?) court. Si tu peux mettre l'histogramme avec pour compléter
|
||||
ça serait pas mal, là on se demande à quel point il est utile de faire une
|
||||
subsection pour ça.
|
||||
|
||||
## subsection 1.7 : Unwinding state-of-the-art
|
||||
|
||||
Pareil trop court je pense. Tu peux pas reparler un peu de C++ et de leur
|
||||
version de l'unwinding ? Ou alors, vite fait parler du caching pour qu'on
|
||||
comprenne ce qu'il se passe.
|
||||
|
||||
l555 : conciseness plutôt que brevity non ? brevity est assez peu courant.
|
||||
l736 : justifie un peu pourquoi -O2 et pas -O3 éventuellement. On comprend pas
|
||||
vraiment pourquoi tu ne fais pas du -O3.
|
||||
|
||||
Dans cette subsection aussi, il y a un travail pour ordonner les case du switch
|
||||
selon les fréquences d'apparition des instructions DWARF ou bien c'est juste en
|
||||
ordre croissant ?
|
||||
|
||||
l775 ; le bout de code collé à gauche, c'est un peu tout moche non ?
|
||||
|
||||
l807 : un néophyte ne sait pas à quoi correspond /proc. Tu perds tou·te·s les
|
||||
catégoricien·ne·s là ^^
|
||||
l833 : the waste of space je pense. Là ça fait bizarre à lire.
|
||||
l876 : \ehelfs file ou alors to shrink the size of \eh_elfs
|
||||
l880 : the major optimization that reduced ou bien The optimization that most
|
||||
reduced non ?
|
||||
l922 : Si c'est zasy to prove, pourquoi ne pas le mettre, même en annexe ? À
|
||||
moins que tu laisses un hint pour une question facile ?
|
||||
l986 : natural enough setup, tu veux pas plutôt mettre legitimate ou un truc
|
||||
dans ce genre ? On trouve pas ce genre de setup dans une forêt ou au fond d'un
|
||||
lac ^^
|
||||
l997-998 : concordance : being linked et being very light. Tu te réfère
|
||||
toujours à l'intérêt du début de la phrase.
|
||||
|
||||
l1003 : and to implement.
|
||||
l1011 : vanilla version, c'est standard en anglais t'es sûr ?
|
||||
|
||||
l1058 : start swappping -> explique ce que c'est, encore une fois tu as des
|
||||
béotiens devant toi ^^
|
||||
|
||||
l119-1123 : Ok, joli nombre mais ça compare à quoi ? c'est combien sans, etc ?
|
||||
l1125 : are most probably, je comprends pas. faut réorganiser les mots "are most probably due to"
|
||||
ou alors en supprimer un mais là c'est weird.
|
||||
|
||||
l1181 : should be implemented plutôt que are necessary to implement je pense.
|
||||
|
||||
Ça manque d'une conclusion je pense.
|
||||
|
||||
Remarque générale : listings, j'aime pas, je trouve ça moche, et la coloration syntaxique est un peu nulle.
|
||||
Je préfère largement minted, ça utilise pygmentize en background, et c'est assez magique :3
|
1
shared/imgs/call_stack.png
Symbolic link
1
shared/imgs/call_stack.png
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../report/imgs/call_stack/call_stack.png
|
|
@ -24,7 +24,7 @@
|
|||
|
||||
\newcommand{\valaddr}[1]{\operatorname{Addr}\left(#1\right)}
|
||||
\newcommand{\valval}[1]{\operatorname{Val}\left(#1\right)}
|
||||
\newcommand{\valexpr}[1]{\operatorname{Expr}\left(#1\right)}
|
||||
\newcommand{\valexpr}{\operatorname{Expr}}
|
||||
|
||||
\newcommand{\intermedlang}{\mathcal{I}}
|
||||
|
||||
|
|
|
@ -8,4 +8,4 @@
|
|||
\newcommand{\qtodo}[1]{\colorbox{todobg}{\textcolor{todofg}{#1}}}
|
||||
\newcommand{\todo}[1]{\qtodo{\textbf{TODO:}\,#1}}
|
||||
\newcommand{\qnote}[1]{\colorbox{notebg}{\textcolor{notefg}{#1}}}
|
||||
\newcommand{\note}[1]{\qnote{\textbf{NOTE:}\,#1}}
|
||||
\newcommand{\tnote}[1]{\qnote{\textbf{NOTE:}\,#1}}
|
||||
|
|
5
slides/Makefile
Normal file
5
slides/Makefile
Normal file
|
@ -0,0 +1,5 @@
|
|||
all:
|
||||
latexmk -xelatex -pdf slides.tex
|
||||
|
||||
clean:
|
||||
rm -f *aux *bbl *bcf *blg *_latexmk *fls *log *out *.run.xml
|
BIN
slides/img/dw_spec.png
Normal file
BIN
slides/img/dw_spec.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 602 KiB |
BIN
slides/img/dwarf_logo.png
Normal file
BIN
slides/img/dwarf_logo.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 9.5 KiB |
BIN
slides/img/keep_breathing.jpg
Normal file
BIN
slides/img/keep_breathing.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 226 KiB |
BIN
slides/img/stack/call_stack.png
Normal file
BIN
slides/img/stack/call_stack.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 25 KiB |
BIN
slides/img/stack/call_stack.xcf
Normal file
BIN
slides/img/stack/call_stack.xcf
Normal file
Binary file not shown.
524
slides/slides.tex
Normal file
524
slides/slides.tex
Normal file
|
@ -0,0 +1,524 @@
|
|||
% vim: spell spelllang=en
|
||||
|
||||
\documentclass[11pt,xcolor={usenames,dvipsnames}]{beamer}
|
||||
\usetheme{Warsaw}
|
||||
\usepackage[utf8]{inputenc}
|
||||
\usepackage[english]{babel}
|
||||
\usepackage[T1]{fontenc}
|
||||
\usepackage{amsmath}
|
||||
\usepackage{amsfonts}
|
||||
\usepackage{amssymb}
|
||||
\usepackage{booktabs}
|
||||
\usepackage{makecell}
|
||||
\usepackage{ifthen}
|
||||
\usepackage{colortbl}
|
||||
|
||||
\usepackage{../shared/my_listings}
|
||||
%\usepackage{../shared/my_hyperref}
|
||||
\usepackage{../shared/specific}
|
||||
\usepackage{../shared/common}
|
||||
\usepackage{../shared/todo}
|
||||
|
||||
\usepackage{inconsolata}
|
||||
\lstset{basicstyle=\footnotesize\ttfamily}
|
||||
|
||||
\renewcommand\theadalign{c}
|
||||
\renewcommand\theadfont{\scriptsize\bfseries}
|
||||
|
||||
\setbeamertemplate{navigation symbols}{}
|
||||
\setbeamertemplate{headline}{}
|
||||
|
||||
\newcommand{\thenalert}[1]{\only<1>{#1}\only<2>{\alert{#1}}}
|
||||
\newcommand{\slidecountline}{
|
||||
\ifthenelse{\theframenumber = 0}
|
||||
{}
|
||||
{\insertframenumber/\inserttotalframenumber}}
|
||||
\newcommand{\sectionline}{
|
||||
\ifthenelse{\thesection = 0}
|
||||
{}
|
||||
{\Roman{section}~-- \insertsection}}
|
||||
|
||||
\AtBeginSection[]{
|
||||
\begin{frame}
|
||||
\vfill
|
||||
\centering
|
||||
\begin{beamercolorbox}[sep=8pt,center,shadow=true,rounded=true]{title}
|
||||
\usebeamerfont{title}\insertsectionhead\par%
|
||||
\end{beamercolorbox}
|
||||
\vfill
|
||||
\end{frame}
|
||||
}
|
||||
|
||||
\lstdefinelanguage{gdb}{
|
||||
morekeywords={gdb},
|
||||
sensitive=false,
|
||||
}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\author[\slidecountline]{Théophile \textsc{Bastian} \\
|
||||
\small{Under supervision of Francesco Zappa Nardelli}}
|
||||
\title[\sectionline]
|
||||
{Speeding up stack unwinding by compiling DWARF debug data}
|
||||
\date{March\ --\ August 2018}
|
||||
%\subject{}
|
||||
%\logo{}
|
||||
\institute{Team PARKAS, INRIA, Paris}
|
||||
|
||||
\begin{document}
|
||||
|
||||
\begin{frame}
|
||||
\addtocounter{framenumber}{-1}
|
||||
\titlepage{}
|
||||
|
||||
\vspace{-2em}
|
||||
\begin{center}
|
||||
\begin{align*}
|
||||
\text{Slides: } &\text{\url{https://tobast.fr/m2/slides.pdf}} \\
|
||||
\text{Report: } &\text{\url{https://tobast.fr/m2/report.pdf}}
|
||||
\end{align*}
|
||||
\end{center}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{~}
|
||||
\addtocounter{framenumber}{-1}
|
||||
\tableofcontents[hideallsubsections]
|
||||
\end{frame}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\section{Stack unwinding data}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{Introduction}
|
||||
|
||||
\begin{frame}[fragile]{We often use stack unwinding!}
|
||||
\begin{columns}[c]
|
||||
\begin{column}{0.70\textwidth}
|
||||
\begin{lstlisting}[language=gdb, numbers=none, escapechar=|]
|
||||
Program received signal SIGSEGV.
|
||||
0x54625 in fct_b at segfault.c:5
|
||||
5 printf("%d\n", *b);
|
||||
|
||||
|\pause| (gdb) backtrace
|
||||
#0 0x54625 in fct_b at segfault.c:5
|
||||
#1 0x54663 in fct_a at segfault.c:10
|
||||
#2 0x54674 in main at segfault.c:14
|
||||
|
||||
|\pause| (gdb) frame 1
|
||||
#1 0x54663 in fct_a at segfault.c:10
|
||||
10 fct_b((int*) a);
|
||||
|
||||
|\pause| (gdb) print a
|
||||
$1 = 84
|
||||
\end{lstlisting}
|
||||
\vspace{-1em}
|
||||
\pause{}
|
||||
\begin{center}
|
||||
\textbf{\Large How does it work?!}
|
||||
\end{center}
|
||||
\end{column}
|
||||
\begin{column}{0.35\textwidth}
|
||||
\pause{}
|
||||
\includegraphics[width=0.95\linewidth]{img/stack/call_stack}
|
||||
\end{column}
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{Stack frames and unwinding}
|
||||
|
||||
\begin{frame}{Call stack and registers}
|
||||
\begin{columns}[c]
|
||||
\begin{column}{0.55\textwidth}
|
||||
\begin{center}
|
||||
\large\bf
|
||||
How do we get the grandparent RA\@?
|
||||
|
||||
\medskip
|
||||
|
||||
Isn't it as trivial as \texttt{pop()}?
|
||||
|
||||
\vspace{2em}
|
||||
|
||||
\only<2>{We only have \reg{rsp} and \reg{rip}.}
|
||||
|
||||
\end{center}
|
||||
\end{column}
|
||||
\begin{column}{0.45\textwidth}
|
||||
\includegraphics[width=0.95\linewidth]{img/stack/call_stack}
|
||||
\end{column}
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{DWARF tables}
|
||||
|
||||
\newcolumntype{a}{>{\columncolor{RedOrange}}l}
|
||||
|
||||
\begin{frame}{DWARF unwinding data}
|
||||
\vspace{2em}
|
||||
\tt \footnotesize
|
||||
\begin{tabular}{
|
||||
>{\columncolor{YellowGreen}}l
|
||||
>{\columncolor{Thistle}}l
|
||||
l l l l l l
|
||||
>{\columncolor{Apricot}}l}
|
||||
~LOC & CFA & rbx & rbp & r12 & r13 & r14 & r15 & ra \\
|
||||
0084950 & rsp+8 & u & u & u & u & u & u & c-8 \\
|
||||
0084952 & rsp+16 & u & u & u & u & u & c-16 & c-8 \\
|
||||
0084954 & rsp+24 & u & u & u & u & c-24 & c-16 & c-8 \\
|
||||
0084956 & rsp+32 & u & u & u & c-32 & c-24 & c-16 & c-8 \\
|
||||
0084958 & rsp+40 & u & u & c-40 & c-32 & c-24 & c-16 & c-8 \\
|
||||
0084959 & rsp+48 & u & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
|
||||
\rowcolor{Aquamarine} 008495a & rsp+56 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
|
||||
0084962 & rsp+64 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
|
||||
0084a19 & rsp+56 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
|
||||
0084a1d & rsp+48 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
|
||||
0084a1e & rsp+40 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
|
||||
0084a20 & rsp+32 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
|
||||
0084a22 & rsp+24 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
|
||||
0084a24 & rsp+16 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
|
||||
0084a26 & rsp+8 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
|
||||
0084a30 & rsp+64 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
|
||||
\end{tabular}
|
||||
|
||||
\pause{}
|
||||
|
||||
\vspace{-3cm}
|
||||
\hfill\includegraphics[height=3cm, angle=45, origin=c]{img/dwarf_logo}
|
||||
\hspace{-1cm}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[t, fragile]{The real DWARF}
|
||||
\begin{lstlisting}[numbers=none, language=]
|
||||
00009b30 48 009b34 FDE cie=0000 pc=0084950..0084b37
|
||||
DW_CFA_advance_loc: 2 to 0000000000084952
|
||||
DW_CFA_def_cfa_offset: 16
|
||||
DW_CFA_offset: r15 (r15) at cfa-16
|
||||
DW_CFA_advance_loc: 2 to 0000000000084954
|
||||
DW_CFA_def_cfa_offset: 24
|
||||
DW_CFA_offset: r14 (r14) at cfa-24
|
||||
DW_CFA_advance_loc: 2 to 0000000000084956
|
||||
DW_CFA_def_cfa_offset: 32
|
||||
DW_CFA_offset: r13 (r13) at cfa-32
|
||||
DW_CFA_advance_loc: 2 to 0000000000084958
|
||||
DW_CFA_def_cfa_offset: 40
|
||||
DW_CFA_offset: r12 (r12) at cfa-40
|
||||
DW_CFA_advance_loc: 1 to 0000000000084959
|
||||
[...]
|
||||
\end{lstlisting}
|
||||
|
||||
\begin{itemize}
|
||||
\item[\textbf{$\longrightarrow$}] \textbf{\alert{constructed} on-demand
|
||||
by a \alert{Turing-complete bytecode}!}
|
||||
\end{itemize}
|
||||
|
||||
\pause{}
|
||||
|
||||
\vspace{-5.5cm}
|
||||
\begin{center}
|
||||
\bf \fontsize{8cm}{1cm}\colorbox{white}{\alert{Slow!}}
|
||||
\end{center}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Why does slow matter?}
|
||||
\begin{itemize}
|
||||
|
||||
\item{} After all, we're talking about \alert{debugging procedures} run
|
||||
by a \alert{human being} (slower than the machine).
|
||||
|
||||
\ldots{}or are we?
|
||||
\end{itemize}
|
||||
|
||||
\pause{}
|
||||
\begin{center}
|
||||
\textbf{\Large{}No!}
|
||||
\end{center}
|
||||
|
||||
\begin{itemize}
|
||||
\pause{}\item{} Pretty much any \alert{program analysis tool}
|
||||
\pause{}\item{} \alert{Profiling} with polling profilers
|
||||
|
||||
\pause{}\item{} \alert{Exception handling} in C++
|
||||
|
||||
\end{itemize}
|
||||
|
||||
\vspace{2em}
|
||||
|
||||
\begin{center}
|
||||
\textbf{\Large{}Debug data is not only for debugging}
|
||||
\end{center}
|
||||
\end{frame}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\section{Compiling stack unwinding data ahead-of-time}
|
||||
|
||||
\subsection*{}
|
||||
|
||||
\begin{frame}{Compilation overview}
|
||||
\begin{itemize}
|
||||
\item Compiled to \alert{C code}
|
||||
\item C code then \alert{compiled to native binary} (gcc)
|
||||
\begin{itemize}
|
||||
\item[$\leadsto$] gcc optimisations for free
|
||||
\end{itemize}
|
||||
\item Compiled as \alert{separate \texttt{.so} files}, called \ehelfs{}
|
||||
\bigskip{}
|
||||
\item Morally a \alert{monolithic switch} on IPs
|
||||
\item Each case contains assembly that computes a \alert{row of the
|
||||
table}
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{Example}
|
||||
|
||||
\begin{frame}{Compilation example: original C, DWARF}
|
||||
\lstinputlisting[language=C]{src/fib7/fib7.cfde}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[shrink]{Compilation example: generated C}
|
||||
\lstinputlisting[language=C]{src/fib7/fib7.eh_elf_basic.c}
|
||||
\end{frame}
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{Compilation Strategy}
|
||||
|
||||
\begin{frame}{Compilation choices}
|
||||
\textbf{In order to keep the compiler \alert{simple} and \alert{easily
|
||||
testable}, the whole DWARF5 instruction set is not supported.}
|
||||
|
||||
\begin{itemize}
|
||||
\item Focus on \alert{x86\_64}
|
||||
\item Focus on unwinding return address \\
|
||||
\vspace{0.3ex}
|
||||
$\leadsto$ \textit{Allows building a backtrace}
|
||||
\begin{itemize}
|
||||
\item \alert{suitable for perf, not for gdb}
|
||||
\item Only supports \alert{unwinding registers}: \reg{rip}, \reg{rsp},
|
||||
\reg{rbp}, \reg{rbx}
|
||||
\item Supports the \alert{vast majority} ($> 99.9\%$) of instructions
|
||||
used
|
||||
\item Among \alert{4000} randomly sampled files, only \alert{24}
|
||||
containing unsupported instructions
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Interface: libunwind}
|
||||
\begin{itemize}
|
||||
\item \alert{libunwind}: \textit{de facto} standard library for
|
||||
unwinding
|
||||
\item Relies on DWARF
|
||||
|
||||
\bigskip{}
|
||||
|
||||
\item \texttt{libunwind-eh\_elf}: alternative implementation using
|
||||
\ehelfs{}
|
||||
|
||||
\item[$\leadsto$] \alert{alternative implementation} of libunwind,
|
||||
almost plug-and-play for existing projects!
|
||||
\begin{itemize}
|
||||
\item[$\leadsto$] It is \alert{easy} to use \ehelfs{}: just
|
||||
link against the right library!
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{Outlining}
|
||||
|
||||
\begin{frame}{Size optimisation: outlining}
|
||||
\begin{itemize}
|
||||
\item This \alert{works}, but \alert{takes space}: about \alert{7 times
|
||||
larger in size} than regular DWARF\@.
|
||||
|
||||
\item DWARF optimisation strategy: \alert{alter previous row}. \\
|
||||
Causes slowness: we cannot do that.
|
||||
|
||||
\item Remark: a lot of lines appear often.
|
||||
\begin{itemize}
|
||||
\item[$\leadsto$] \textbf{\emph{outline} them!}
|
||||
\end{itemize}
|
||||
|
||||
\pause{}
|
||||
|
||||
\item On libc, $20\,827$ rows $\rightarrow$ $302$ outlined ($1.5\,\%$)
|
||||
\item Turn the big switch into a binary search \alert{if/else tree}
|
||||
\end{itemize}
|
||||
|
||||
\pause{}
|
||||
|
||||
\bigskip{}
|
||||
\begin{center}
|
||||
$\leadsto$ only \textbf{2.5 times bigger than DWARF}
|
||||
\end{center}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Example with outlining}
|
||||
\lstinputlisting[language=C]{src/fib7/fib7.eh_elf_outline.c}
|
||||
\end{frame}
|
||||
|
||||
\subsection{A word on formalization}
|
||||
|
||||
\begin{frame}[t]{A word on formalization}
|
||||
\begin{itemize}
|
||||
\item First task: \alert{writing semantics} for DWARF, written as
|
||||
a mapping to C code.
|
||||
\item DWARF5 specification: \alert{plain English}, no proper semantics
|
||||
\item Compiled code is in substance equivalent to semantics
|
||||
\item What remains to prove is mostly \alert{simple or classic
|
||||
optimisations}
|
||||
\end{itemize}
|
||||
|
||||
\pause{}
|
||||
\vspace{-3cm}
|
||||
\begin{center}
|
||||
\includegraphics[width=0.8\linewidth, angle=10]{img/dw_spec.png}
|
||||
\end{center}
|
||||
\end{frame}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\section{Benchmarking}
|
||||
|
||||
\begin{frame}{Benchmarking requirements}
|
||||
\begin{enumerate}
|
||||
\item Thousands of samples (single unwind: $10\,\mu{}s$)
|
||||
\item Interesting enough program to unwind: nested functions, complex
|
||||
FDEs
|
||||
\item Mitigate caching: don't always unwind from the \emph{same} point
|
||||
\item Yet be fair: don't always unwind from totally different places
|
||||
\item Distribute evenly: if possible, also from within libraries
|
||||
\end{enumerate}
|
||||
\end{frame}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\begin{frame}{perf instrumentation}
|
||||
\textbf{\alert{perf} is the state-of-the-art polling profiler for Linux.}
|
||||
\begin{itemize}
|
||||
\item{} used to get readings of the time spent in each function
|
||||
\item{} works by regularly stopping the program, unwinding its stack,
|
||||
then aggregating the gathered data
|
||||
\end{itemize}
|
||||
|
||||
\pause{}\bigskip{}
|
||||
\textbf{Instrumenting perf matches all the requirements!}
|
||||
|
||||
\begin{itemize}
|
||||
\item{} \alert{Plug \ehelfs{} into perf}: use \ehelfs{} instead of
|
||||
DWARF to unwind the stack
|
||||
\item{} Implement \alert{unwinding performance counters} inside perf
|
||||
\bigskip{}
|
||||
|
||||
\item{} Use perf on \alert{hackbench}, a kernel stress-test program
|
||||
\begin{itemize}
|
||||
\item Small program
|
||||
\item Lots of calls
|
||||
\item Relies on libc, libpthread
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\section{Results}
|
||||
|
||||
\begin{frame}{Time performance}
|
||||
\small
|
||||
\centering
|
||||
\begin{tabular}{l r r r r r}
|
||||
\toprule
|
||||
\thead{Unwinding method} & \thead{Frames \\ unwound}
|
||||
& \thead{Tot.\ time \\ ($\mu s$)}
|
||||
& \thead{Avg. \\ time / frame \\ ($ns$)}
|
||||
& \thead{Time \\ ratio} \\
|
||||
\midrule
|
||||
\alert{\ehelfs{}}
|
||||
& 23506 % Frames unwound
|
||||
& 14837 % Total time
|
||||
& 631 % Avg time
|
||||
& 1
|
||||
\\
|
||||
\prog{libunwind}, \alert{cached}
|
||||
& 27058 % Frames unwound
|
||||
& 441601 % Total time
|
||||
& 16320 % Avg time
|
||||
& \alert{25.9}
|
||||
\\
|
||||
\prog{libunwind}, \alert{uncached}
|
||||
& 27058 % Frames unwound
|
||||
& 671292 % Total time
|
||||
& 24809 % Avg time
|
||||
& \alert{39.3}
|
||||
\\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Space performance}
|
||||
\begin{center}
|
||||
\begin{tabular}{r r r r r r}
|
||||
\toprule
|
||||
\thead{Object}
|
||||
& \thead{\% of binary size}
|
||||
& \thead{Growth factor} \\
|
||||
\midrule
|
||||
libc
|
||||
& 21.88 & 2.41 \\
|
||||
libpthread
|
||||
& 43.71 & 2.19 \\
|
||||
ld
|
||||
& 22.09 & 2.97 \\
|
||||
hackbench
|
||||
& 93.87 & 4.99 \\
|
||||
\midrule
|
||||
Total
|
||||
& 22.81 & \alert{2.44} \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
\end{frame}
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\section*{}
|
||||
\setcounter{section}{0}
|
||||
|
||||
\begin{frame}{What next?}
|
||||
\begin{itemize}
|
||||
\item Implement a release-ready, packageable, easy-to-use version of
|
||||
perf with \ehelfs{} and submit it for inclusion
|
||||
|
||||
\item{} Measure \alert{C++ exceptions overhead} precisely in common
|
||||
software
|
||||
|
||||
\item{} Implement \alert{\ehelfs{}} support for \alert{C++ runtime}
|
||||
exception handling, and other systems where unwinding is a
|
||||
performance bottleneck
|
||||
|
||||
\medskip
|
||||
|
||||
\item \alert{Outlining} was effective for
|
||||
compactness\ldots{} Try outlining DWARF bytecode\@?
|
||||
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\begin{frame}
|
||||
\vspace{5mm}
|
||||
\includegraphics[width=\linewidth]{img/keep_breathing}
|
||||
\vspace{-1cm}
|
||||
|
||||
\begin{center}
|
||||
\large
|
||||
\begin{align*}
|
||||
\textbf{Slides: } &\text{\url{https://tobast.fr/m2/slides.pdf}} \\
|
||||
\textbf{Report: } &\text{\url{https://tobast.fr/m2/report.pdf}}
|
||||
\end{align*}
|
||||
\end{center}
|
||||
|
||||
\end{frame}
|
||||
|
||||
\end{document}
|
BIN
slides/src/fib7/fib7.bin
Executable file
BIN
slides/src/fib7/fib7.bin
Executable file
Binary file not shown.
17
slides/src/fib7/fib7.c
Normal file
17
slides/src/fib7/fib7.c
Normal file
|
@ -0,0 +1,17 @@
|
|||
#include <stdio.h>
|
||||
|
||||
void fib7() {
|
||||
int fibo[8];
|
||||
fibo[0] = 1;
|
||||
fibo[1] = 1;
|
||||
for(int pos = 2; pos < 8; ++pos)
|
||||
fibo[pos] =
|
||||
fibo[pos - 1]
|
||||
+ fibo[pos - 2];
|
||||
printf("%d\n", fibo[7]);
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
fib7();
|
||||
return 0;
|
||||
}
|
11
slides/src/fib7/fib7.cfde
Normal file
11
slides/src/fib7/fib7.cfde
Normal file
|
@ -0,0 +1,11 @@
|
|||
DWARF
|
||||
CFA ra
|
||||
void fib7() { 0x615 rsp+8 c-8
|
||||
int fibo[8]; 0x619 rsp+48 c-8
|
||||
fibo[0] = 1;
|
||||
fibo[1] = 1;
|
||||
for(...)
|
||||
...
|
||||
printf("%d\n", fibo[7]);
|
||||
0x659 rsp+8 c-8
|
||||
}
|
15
slides/src/fib7/fib7.eh_elf_basic.c
Normal file
15
slides/src/fib7/fib7.eh_elf_basic.c
Normal file
|
@ -0,0 +1,15 @@
|
|||
unwind_context_t _eh_elf(
|
||||
unwind_context_t ctx, uintptr_t pc)
|
||||
{
|
||||
unwind_context_t out_ctx;
|
||||
switch(pc) {
|
||||
...
|
||||
case 0x615 ... 0x618:
|
||||
out_ctx.rsp = ctx.rsp + 8;
|
||||
out_ctx.rip =
|
||||
*((uintptr_t*)(out_ctx.rsp - 8));
|
||||
out_ctx.flags = 3u;
|
||||
return out_ctx;
|
||||
...
|
||||
}
|
||||
}
|
21
slides/src/fib7/fib7.eh_elf_outline.c
Normal file
21
slides/src/fib7/fib7.eh_elf_outline.c
Normal file
|
@ -0,0 +1,21 @@
|
|||
unwind_context_t _eh_elf(
|
||||
unwind_context_t ctx, uintptr_t pc)
|
||||
{
|
||||
unwind_context_t out_ctx;
|
||||
if(pc < 0x619) { ... }
|
||||
else {
|
||||
if(pc < 0x659) { // IP=0x619 ... 0x658
|
||||
goto _factor_1;
|
||||
}
|
||||
...
|
||||
}
|
||||
|
||||
_factor_1:
|
||||
out_ctx.rsp = ctx.rsp + (48);
|
||||
out_ctx.rip = *((uintptr_t*)(out_ctx.rsp + (-8)));
|
||||
out_ctx.flags = 3u;
|
||||
|
||||
...
|
||||
|
||||
return out_ctx;
|
||||
}
|
5
slides/src/fib7/fib7.fde
Normal file
5
slides/src/fib7/fib7.fde
Normal file
|
@ -0,0 +1,5 @@
|
|||
[...] FDE [...] pc=615..65a
|
||||
LOC CFA ra
|
||||
0000000000000615 rsp+8 c-8
|
||||
0000000000000619 rsp+48 c-8
|
||||
0000000000000659 rsp+8 c-8
|
7
slides/src/fib7/fib7.raw_fde
Normal file
7
slides/src/fib7/fib7.raw_fde
Normal file
|
@ -0,0 +1,7 @@
|
|||
[...] FDE [...] pc=615..65a
|
||||
DW_CFA_def_cfa: r7 (rsp) ofs 8
|
||||
DW_CFA_offset: r16 (rip) at cfa-8
|
||||
DW_CFA_advance_loc: 4 to 0619
|
||||
DW_CFA_def_cfa_offset: 48
|
||||
DW_CFA_advance_loc1: 64 to 0659
|
||||
DW_CFA_def_cfa_offset: 8
|
18
slides/src/fib7/fib7.s
Normal file
18
slides/src/fib7/fib7.s
Normal file
|
@ -0,0 +1,18 @@
|
|||
0000000000000615 <fib7>:
|
||||
615: sub $0x28,%rsp ; Alloc stack
|
||||
619: movl $0x1,(%rsp) ; fibo[0]
|
||||
620: movl $0x1,0x4(%rsp) ; fibo[1]
|
||||
628: mov %rsp,%rax ; BEGIN FOR
|
||||
62b: lea 0x18(%rax),%rcx
|
||||
62f: mov (%rax),%edx
|
||||
631: add 0x4(%rax),%edx
|
||||
634: mov %edx,0x8(%rax)
|
||||
637: add $0x4,%rax
|
||||
63b: cmp %rcx,%rax
|
||||
63e: jne 62f <fib7+0x1a> ; END FOR
|
||||
640: mov 0x1c(%rsp),%esi
|
||||
644: lea 0xb9(%rip),%rdi
|
||||
64b: mov $0x0,%eax
|
||||
650: callq 520 <printf@plt>
|
||||
655: add $0x28,%rsp ; Restore rsp
|
||||
659: retq
|
BIN
slides/src/fib7/fib7.st.bin
Executable file
BIN
slides/src/fib7/fib7.st.bin
Executable file
Binary file not shown.
4
slides/src/unwind_context.c
Normal file
4
slides/src/unwind_context.c
Normal file
|
@ -0,0 +1,4 @@
|
|||
typedef struct {
|
||||
uint8_t flags; // State (registers filled, error)
|
||||
uintptr_t rip, rsp, rbp, rbx; // Registers' values
|
||||
} unwind_context_t;
|
Loading…
Reference in a new issue