report/slides/slides.tex

570 lines
18 KiB
TeX

% vim: spell spelllang=en
\documentclass[11pt,xcolor={usenames,dvipsnames}]{beamer}
\usetheme{Warsaw}
\usepackage[utf8]{inputenc}
\usepackage[english]{babel}
\usepackage[T1]{fontenc}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{booktabs}
\usepackage{makecell}
\usepackage{../shared/my_listings}
%\usepackage{../shared/my_hyperref}
\usepackage{../shared/specific}
\usepackage{../shared/common}
\usepackage{../shared/todo}
\renewcommand\theadalign{c}
\renewcommand\theadfont{\scriptsize\bfseries}
\setbeamertemplate{navigation symbols}{}
% \thenalert{X}: shows X unstyled on overlay 1 and wrapped in \alert on
% overlay 2; neither \only matches on any later overlay.
\newcommand{\thenalert}[1]{\only<1>{#1}\only<2>{\alert{#1}}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\author[Théophile Bastian]{Théophile \textsc{Bastian} \\
\small{Under supervision of Francesco Zappa Nardelli}}
\title{Internship defense, MPRI, M2}
\subtitle{Speeding up stack unwinding by compiling DWARF debugging data}
\date{March\ --\ August 2018}
%\subject{}
%\logo{}
\institute{Team PARKAS, INRIA, Paris}
\begin{document}
\begin{frame}
\addtocounter{framenumber}{-1}
\titlepage{}
\begin{center}
Slides: \url{https://tobast.fr/m2/slides.pdf} \\
Report: \url{https://tobast.fr/m2/report.pdf}
\end{center}
\end{frame}
\begin{frame}
\addtocounter{framenumber}{-1}
\tableofcontents
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Stack unwinding data}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Introduction}
\begin{frame}[fragile]{We often use stack unwinding!}
\begin{lstlisting}[language=, numbers=none, escapechar=|]
Program received signal SIGSEGV, Segmentation fault.
0x0000555555554625 in fct_b (m=0x5c) at segfault.c:5
5 printf("%l\n", *m);
|\pause|
(gdb) backtrace
#0 0x0000555555554625 in fct_b (m=0x5c) at segfault.c:5
#1 0x0000555555554663 in fct_a (n=42) at segfault.c:10
#2 0x0000555555554674 in main () at segfault.c:14
|\pause|
(gdb) frame 1
#1 0x0000555555554663 in fct_a (n=42) at segfault.c:10
10 fct_b((int*)(some_fct_a_var + 8));
|\pause|
(gdb) print some_fct_a_var
$1 = 84
\end{lstlisting}
\pause{}
\begin{center}
\textbf{\Large How does it work?!}
\end{center}
\vspace{1em}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Stack frames and unwinding}
\begin{frame}{Call stack and registers}
\begin{columns}[c]
\begin{column}{0.65\textwidth}
\begin{itemize}
\item Programs use a \alert{call stack}
\item Organized in \alert{stack frames}
\begin{itemize}
\item Local variables
\item Function parameters
\item Keep track of nesting, registers and ``return
point''
\end{itemize}
\end{itemize}
Common registers:
\begin{itemize}
\item \reg{rip}: program counter (PC)
\item \reg{rsp}: stack pointer
\item \reg{rbp}: base pointer
\begin{itemize}
\item Saves \reg{rsp}
\item Easy access
\item Wastes a register
\item Not often used (x86\_64)
\end{itemize}
\end{itemize}
\end{column}
\begin{column}{0.35\textwidth}
\includegraphics[width=0.95\linewidth]{../shared/imgs/call_stack}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Isn't it as trivial as \texttt{pop()}?}
\begin{itemize}
\item This is only a \alert{blob of binary data} without mandatory
structure
\item We ignore \alert{which registers were saved}
\item We ignore \alert{whether \reg{rbp} was used}
\item We ignore \alert{where the return address is stored}
\item We ignore \alert{where the previous frame begins}
\end{itemize}
\medskip
But\ldots{} if we know how to \alert{unwind one}, we can \alert{recurse}!
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{DWARF tables}
\begin{frame}[fragile]{DWARF unwinding data}
\begin{lstlisting}[numbers=none, language=]
00009b30 48 009b34 FDE cie=0000 pc=0084950..0084b37
LOC CFA rbx rbp r12 r13 r14 r15 ra
0084950 rsp+8 u u u u u u c-8
0084952 rsp+16 u u u u u c-16 c-8
0084954 rsp+24 u u u u c-24 c-16 c-8
0084956 rsp+32 u u u c-32 c-24 c-16 c-8
0084958 rsp+40 u u c-40 c-32 c-24 c-16 c-8
0084959 rsp+48 u c-48 c-40 c-32 c-24 c-16 c-8
008495a rsp+56 c-56 c-48 c-40 c-32 c-24 c-16 c-8
0084962 rsp+64 c-56 c-48 c-40 c-32 c-24 c-16 c-8
0084a19 rsp+56 c-56 c-48 c-40 c-32 c-24 c-16 c-8
0084a1d rsp+48 c-56 c-48 c-40 c-32 c-24 c-16 c-8
0084a1e rsp+40 c-56 c-48 c-40 c-32 c-24 c-16 c-8
0084a20 rsp+32 c-56 c-48 c-40 c-32 c-24 c-16 c-8
0084a22 rsp+24 c-56 c-48 c-40 c-32 c-24 c-16 c-8
0084a24 rsp+16 c-56 c-48 c-40 c-32 c-24 c-16 c-8
0084a26 rsp+8 c-56 c-48 c-40 c-32 c-24 c-16 c-8
0084a30 rsp+64 c-56 c-48 c-40 c-32 c-24 c-16 c-8
\end{lstlisting}
\pause{}
\vspace{-4cm}
\hfill\includegraphics[height=3cm, angle=45, origin=c]{img/dwarf_logo}
\hspace{-1cm}
\end{frame}
\begin{frame}[fragile]{The real DWARF}
\begin{lstlisting}[numbers=none, language=]
00009b30 48 009b34 FDE cie=0000 pc=0084950..0084b37
DW_CFA_advance_loc: 2 to 0000000000084952
DW_CFA_def_cfa_offset: 16
DW_CFA_offset: r15 (r15) at cfa-16
DW_CFA_advance_loc: 2 to 0000000000084954
DW_CFA_def_cfa_offset: 24
DW_CFA_offset: r14 (r14) at cfa-24
DW_CFA_advance_loc: 2 to 0000000000084956
DW_CFA_def_cfa_offset: 32
DW_CFA_offset: r13 (r13) at cfa-32
DW_CFA_advance_loc: 2 to 0000000000084958
DW_CFA_def_cfa_offset: 40
DW_CFA_offset: r12 (r12) at cfa-40
DW_CFA_advance_loc: 1 to 0000000000084959
DW_CFA_def_cfa_offset: 48
DW_CFA_offset: r6 (rbp) at cfa-48
DW_CFA_advance_loc: 1 to 000000000008495a
[...]
\end{lstlisting}
\end{frame}
\begin{frame}{Why does slow matter?}
\textbf{Do we really care about speed for unwinding?}
\begin{itemize}
\item{} After all, we're talking about \alert{debugging procedures} run
by a \alert{human being} (slower than the machine).
\ldots{}or are we?
\pause{}\item{} \alert{Profiling} with polling profilers
\pause{}\item{} \alert{Exception handling} in C++
\end{itemize}
\vspace{2em}
\begin{center}
\textbf{Debug data is not only for debugging}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Compiling DWARF}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Compilation Strategy}
\begin{frame}[fragile]{Types}
Generated data:
\lstinputlisting[language=C]{src/unwind_context.c}
\pause{}
\vspace{1em}
Function type:
\begin{lstlisting}[language=C]
unwind_context_t _eh_elf(
unwind_context_t, instruction_pointer_t); \end{lstlisting}
\end{frame}
\begin{frame}{Compilation overview}
\begin{itemize}
\item Compiled to \alert{C code}
\item C code then \alert{compiled to native binary} (gcc)
\begin{itemize}
\item[$\leadsto$] gcc optimisations for free
\end{itemize}
\item Compiled as \alert{separate \texttt{.so} files}, called \ehelfs{}
\bigskip{}
\item Morally a \alert{monolithic switch} on IPs
\item Each case fills the context structure
\end{itemize}
\end{frame}
\begin{frame}{Compilation choices}
\textbf{In order to keep the compiler \alert{simple} and \alert{easily
testable}, not all of the DWARF5 instruction set is supported.}
\begin{itemize}
\item Tailored for \alert{x86\_64} (while DWARF is
architecture-agnostic)
\item Only supports \alert{unwinding registers}: \reg{rip}, \reg{rsp},
\reg{rbp}, \reg{rbx}
\begin{itemize}
\item[$\leadsto$] suitable for perf, not for gdb
\end{itemize}
\item Supports the \alert{vast majority} ($> 99.9\%$) of instructions
used (see later)
\begin{itemize}
\item Only supports a few common expressions: already $\sim 90\,\%$
of expressions used
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[fragile]{A word on memory maps}
\textbf{Various functions come from various ELFs}\\
\pause{}
\hfill{}\textbf{\ldots{}and thus various \ehelfs{}}
\begin{lstlisting}[language=, numbers=none]
55c81b11e000-55c81b126000 000k /usr/bin/cat
55c81b325000-55c81b326000 007k /usr/bin/cat
55c81b326000-55c81b327000 008k /usr/bin/cat
55c81bcf3000-55c81bd14000 000k [heap]
7f8a5b4ed000-7f8a5b50f000 000k /usr/lib/libc-2.28.so
7f8a5b50f000-7f8a5b65a000 022k /usr/lib/libc-2.28.so
7f8a5b65a000-7f8a5b6a6000 16dk /usr/lib/libc-2.28.so
7f8a5b6ec000-7f8a5b6ee000 000k /usr/lib/ld-2.28.so
7f8a5b6ee000-7f8a5b70d000 002k /usr/lib/ld-2.28.so
7f8a5b70d000-7f8a5b715000 021k /usr/lib/ld-2.28.so
7ffc8a66b000-7ffc8a68c000 000k [stack]
7ffc8a74a000-7ffc8a74d000 000k [vvar]
7ffc8a74d000-7ffc8a74f000 000k [vdso]
\end{lstlisting}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Outlining}
\begin{frame}{Size optimisation: outlining}
\begin{itemize}
\item This \alert{works}, but \alert{takes space}: about \alert{7 times
heavier} than regular DWARF\@.
\item DWARF optimisation strategy: \alert{alter previous row}. \\
Causes slowness: we cannot do that.
\item Remark: a lot of lines appear often.
\begin{itemize}
\item[$\leadsto$] \emph{outline} them!
\end{itemize}
\end{itemize}
\pause{}
\textbf{Outlining:}
\begin{itemize}
\item Turn the big switch into a binary search \alert{if/else tree}
\item \alert{Extract} the conditional bodies, put them afterwards
\item Jump to them using a \alert{label/goto}
\end{itemize}
\pause{}
\bigskip{}
\begin{center}
$\leadsto$ only \textbf{2.5 times heavier than DWARF}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Benchmarking}
\begin{frame}{Benchmarking requirements}
\begin{enumerate}
\item Thousands of samples (single unwind: $10\,\mu$s)
\item Interesting enough program to unwind: nested functions, complex
FDEs
\item Mitigate caching: don't always unwind from the \emph{same} point
\item Yet be fair: don't always unwind from totally different places
\item Distribute evenly: if possible, also from within libraries
\end{enumerate}
\pause{}\vspace{1em}
\begin{itemize}
\item 2 $\implies$ exit hand-crafted program, exit CSmith
\item 5 $\implies$ cannot call the unwinding procedure by hand
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Unwinding using perf}
\begin{frame}{Presentation of perf}
\textbf{A profiler is used to\ldots}
\begin{itemize}
\item get readings of the \alert{time spent in each function}
\item detect ``hot paths'': functions you ought to optimize
\item \ldots{}and \alert{benchmark \ehelfs{}}!
\end{itemize}
\vspace{1em}\pause{}
\textbf{How does it work?}
\begin{itemize}
\item{} \alert{Polling profiler}: stops at regular intervals to perform
analyses
\item{} Upon polling, \alert{dumps the stack} to a file
\item{} In the analysis phase (after the program terminated),
\alert{unwinds all the stacks gathered} to get call paths
\end{itemize}
\end{frame}
\begin{frame}{perf instrumentation}
\textbf{Instrumenting perf matches all the requirements!}
\vspace{1em}\pause{}
\begin{itemize}
\item{} \alert{Plug \ehelfs{} into perf}: use \ehelfs{} instead of
DWARF to analyze stack dumps
\item{} Implement \alert{unwinding performance counters} inside perf
\bigskip{}
\item{} Use perf on \alert{hackbench}, a kernel stress-test program
\begin{itemize}
\item Small program
\item Lots of calls
\item Relies on libc, libpthread
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}{libunwind implementation}
\begin{itemize}
\item \alert{libunwind}: \textit{de facto} standard library for
unwinding
\item Uses DWARF in background
\item \alert{Used by perf} as a backend for unwinding
\pause{}\vspace{1em} \item{} Easiest way to use \ehelfs{} in perf:
\alert{implement an alternative libunwind}
\item{} Result: \alert{alternative implementation} of libunwind, nearly
plug-and-play!
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Results}
\begin{frame}{Time performance}
\small
\centering
\begin{tabular}{l r r r r r}
\toprule
\thead{Unwinding method} & \thead{Frames \\ unwound}
& \thead{Tot.\ time \\ ($\mu$s)}
& \thead{Avg. \\ time / frame \\ (ns)}
& \thead{Time \\ ratio} \\
\midrule
\alert{\ehelfs{}}
& 23506 % Frames unwound
& 14837 % Total time
& 631 % Avg time
& 1
\\
\prog{libunwind}, \alert{cached}
& 27058 % Frames unwound
& 441601 % Total time
& 16320 % Avg time
& \alert{25.9}
\\
\prog{libunwind}, \alert{uncached}
& 27058 % Frames unwound
& 671292 % Total time
& 24809 % Avg time
& \alert{39.3}
\\
\bottomrule
\end{tabular}
\end{frame}
\begin{frame}{Space performance}
\begin{center}
\begin{tabular}{r r r r r r}
\toprule
\thead{Object}
& \thead{Original \\ program size \\ (KiB)}
& \thead{Original \\ \lstc{.eh\_frame} \\ (KiB)}
& \thead{Generated \\ \ehelf{} \\ (KiB)}
& \thead{\% \\ original \\ size}
& \thead{Growth \\ factor} \\
\midrule
libc
& 1\,434 & 130.1 & 313.2 & 21.88 & 2.41 \\
libpthread
& 58.1 & 11.6 & 25.4 & 43.71 & 2.19 \\
ld
& 129.6 & 9.6 & 28.6 & 22.09 & 2.97 \\
hackbench
& 2.9 & 0.555 & 2.8 & 93.87 & 4.99 \\
Total
& 1\,638 & 151.8 & 370.0 & 22.81 & \alert{2.44} \\
\bottomrule
\end{tabular}
\end{center}
\end{frame}
% \ofsupp{X}: typesets X as text inside auto-sized square brackets (math
% mode); used below for the bracketed "supported" figures in the Expr column.
\newcommand{\ofsupp}[1]{$\left[\text{#1}\right]$}
\begin{frame}{Instructions coverage}
\scriptsize\centering
\begin{tabular}{r r r r r r}
\toprule
\thead{}
& \thead{Undef}
& \thead{Same \\ value}
& \thead{Offset}
& \thead{Val \\ offset}
& \thead{Reg}
\\
\midrule
\makecell{Only supp. \\ columns} \vspace{.4em}
& 1698 (0.006\,\%)
& 0
& 30\,M (99.9\,\%)
& 0
& 14 (0\,\%)
\\
All columns
& 1698 (0.003\,\%)
& 0
& 55\,M (99.9\,\%)
& 0
& 22 (0\,\%)
\\
\bottomrule
\toprule
\thead{}
& \thead{Expr \\ \ofsupp{supported}}
& \thead{Val \\ expr}
& \thead{Archi.}
& & \thead{Total}
\\
\midrule
\makecell{Only supp. \\ columns} \vspace{.4em}
& \makecell{4475 (0.015\,\%) \\ \ofsupp{81.4\,\%}}
& 0
& 0
& & 30044442
\\
All columns
& \makecell{12367 (0.02\,\%) \\ \ofsupp{91.7\,\%}}
& 0
& 0
& & 54680492
\\
\bottomrule
\end{tabular}
\vspace{2em}\pause{}
\textbf{Among \alert{4000} randomly sampled files, only \alert{24}
contain unsupported instructions}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{A glance at future work}
\begin{frame}{What next?}
\begin{itemize}
\item \alert{Outlining} was super efficient for
compactness\ldots{} Worth trying on standard DWARF\@?
\item Implement a release-ready, packageable, easy-to-use version of
perf with \ehelfs{} and submit it for inclusion
\item{} Measure \alert{C++ exceptions overhead} precisely in common
software
\item{} Implement \alert{\ehelfs{}} support for \alert{C++ runtime}
exception handling
\item{} \ldots{}and many more possibilities to explore!
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{}
\begin{frame}
\begin{columns}[c]
\begin{column}{0.35\textwidth}
\includegraphics[width=\linewidth]{img/keep_breathing}
\end{column}
\begin{column}{0.65\textwidth}
\begin{center}
\huge
And remember\ldots{}
\smallskip
DWARF is slow!
\end{center}
\end{column}
\end{columns}
\begin{center}
\Huge\bfseries
Thank you!
\end{center}
\end{frame}
\end{document}