talk-2019-10-OOPSLA19/slides.tex

524 lines
16 KiB
TeX

% vim: spell spelllang=en
\documentclass[11pt,xcolor={usenames,dvipsnames}]{beamer}
\usetheme{Warsaw}
\usepackage[utf8]{inputenc}
\usepackage[english]{babel}
\usepackage[T1]{fontenc}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{booktabs}
\usepackage{makecell}
\usepackage{ifthen}
\usepackage{colortbl}
\usepackage{texlib/my_listings}
%\usepackage{texlib/my_hyperref}
\usepackage{texlib/specific}
\usepackage{texlib/common}
\usepackage{texlib/todo}
\usepackage{inconsolata}
\lstset{basicstyle=\footnotesize\ttfamily}
\renewcommand\theadalign{c}
\renewcommand\theadfont{\scriptsize\bfseries}
\setbeamertemplate{navigation symbols}{}
\setbeamertemplate{headline}{}
\newcommand{\thenalert}[1]{\only<1>{#1}\only<2>{\alert{#1}}}
\newcommand{\slidecountline}{
\ifthenelse{\theframenumber = 0}
{}
{\insertframenumber/\inserttotalframenumber}}
\newcommand{\sectionline}{
\ifthenelse{\thesection = 0}
{}
{\Roman{section}~-- \insertsection}}
\AtBeginSection[]{
\begin{frame}
\vfill
\centering
\begin{beamercolorbox}[sep=8pt,center,shadow=true,rounded=true]{title}
\usebeamerfont{title}\insertsectionhead\par%
\end{beamercolorbox}
\vfill
\end{frame}
}
\lstdefinelanguage{gdb}{
morekeywords={gdb},
sensitive=false,
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\author[\slidecountline]{Théophile \textsc{Bastian} \\
\footnotesize{under the supervision of Francesco Zappa Nardelli}}
\title[\sectionline]
{Speeding up stack unwinding by compiling DWARF debug data}
\date{March\ --\ August 2018}
%\subject{}
%\logo{}
\institute{Team PARKAS, INRIA, Paris}
\begin{document}
\begin{frame}
\addtocounter{framenumber}{-1}
\titlepage{}
\vspace{-2em}
\begin{center}
\begin{align*}
\text{Slides: } &\text{\url{https://tobast.fr/m2/slides.pdf}} \\
\text{Report: } &\text{\url{https://tobast.fr/m2/report.pdf}}
\end{align*}
\end{center}
\end{frame}
\begin{frame}{~}
\addtocounter{framenumber}{-1}
\tableofcontents[hideallsubsections]
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Stack unwinding data}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Introduction}
\begin{frame}[fragile]{We often use stack unwinding!}
\begin{columns}[c]
\begin{column}{0.70\textwidth}
\begin{lstlisting}[language=gdb, numbers=none, escapechar=|]
Program received signal SIGSEGV.
0x54625 in fct_b at segfault.c:5
5 printf("%l\n", *b);
|\pause| (gdb) backtrace
#0 0x54625 in fct_b at segfault.c:5
#1 0x54663 in fct_a at segfault.c:10
#2 0x54674 in main at segfault.c:14
|\pause| (gdb) frame 1
#1 0x54663 in fct_a at segfault.c:10
10 fct_b((int*) a);
|\pause| (gdb) print a
$1 = 84
\end{lstlisting}
\vspace{-1em}
\pause{}
\begin{center}
\textbf{\Large How does it work?!}
\end{center}
\end{column}
\begin{column}{0.35\textwidth}
\pause{}
\includegraphics[width=0.95\linewidth]{img/call_stack}
\end{column}
\end{columns}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Stack frames and unwinding}
\begin{frame}{Call stack and registers}
\begin{columns}[c]
\begin{column}{0.55\textwidth}
\begin{center}
\large\bf
How do we get the grandparent RA\@?
\medskip
Isn't it as trivial as \texttt{pop()}?
\vspace{2em}
\only<2>{We only have \reg{rsp} and \reg{rip}.}
\end{center}
\end{column}
\begin{column}{0.45\textwidth}
\includegraphics[width=0.95\linewidth]{img/call_stack}
\end{column}
\end{columns}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{DWARF tables}
\newcolumntype{a}{>{\columncolor{RedOrange}}l}
\begin{frame}{DWARF unwinding data}
\vspace{2em}
\tt \footnotesize
\begin{tabular}{
>{\columncolor{YellowGreen}}l
>{\columncolor{Thistle}}l
l l l l l l
>{\columncolor{Apricot}}l}
~LOC & CFA & rbx & rbp & r12 & r13 & r14 & r15 & ra \\
0084950 & rsp+8 & u & u & u & u & u & u & c-8 \\
0084952 & rsp+16 & u & u & u & u & u & c-16 & c-8 \\
0084954 & rsp+24 & u & u & u & u & c-24 & c-16 & c-8 \\
0084956 & rsp+32 & u & u & u & c-32 & c-24 & c-16 & c-8 \\
0084958 & rsp+40 & u & u & c-40 & c-32 & c-24 & c-16 & c-8 \\
0084959 & rsp+48 & u & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
\rowcolor{Aquamarine} 008495a & rsp+56 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
0084962 & rsp+64 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
0084a19 & rsp+56 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
0084a1d & rsp+48 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
0084a1e & rsp+40 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
0084a20 & rsp+32 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
0084a22 & rsp+24 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
0084a24 & rsp+16 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
0084a26 & rsp+8 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
0084a30 & rsp+64 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
\end{tabular}
\pause{}
\vspace{-3cm}
\hfill\includegraphics[height=3cm, angle=45, origin=c]{img/dwarf_logo}
\hspace{-1cm}
\end{frame}
\begin{frame}[t, fragile]{The real DWARF}
\begin{lstlisting}[numbers=none, language=]
00009b30 48 009b34 FDE cie=0000 pc=0084950..0084b37
DW_CFA_advance_loc: 2 to 0000000000084952
DW_CFA_def_cfa_offset: 16
DW_CFA_offset: r15 (r15) at cfa-16
DW_CFA_advance_loc: 2 to 0000000000084954
DW_CFA_def_cfa_offset: 24
DW_CFA_offset: r14 (r14) at cfa-24
DW_CFA_advance_loc: 2 to 0000000000084956
DW_CFA_def_cfa_offset: 32
DW_CFA_offset: r13 (r13) at cfa-32
DW_CFA_advance_loc: 2 to 0000000000084958
DW_CFA_def_cfa_offset: 40
DW_CFA_offset: r12 (r12) at cfa-40
DW_CFA_advance_loc: 1 to 0000000000084959
[...]
\end{lstlisting}
\begin{itemize}
\item[\textbf{$\longrightarrow$}] \textbf{\alert{constructed} on-demand
by a \alert{Turing-complete bytecode}!}
\end{itemize}
\pause{}
\vspace{-5.5cm}
\begin{center}
\bf \fontsize{8cm}{1cm}\colorbox{white}{\alert{Slow!}}
\end{center}
\end{frame}
\begin{frame}{Why does slow matter?}
\begin{itemize}
\item{} After all, we're talking about \alert{debugging procedures} ran
by a \alert{human being} (slower than the machine).
\ldots{}or are we?
\end{itemize}
\pause{}
\begin{center}
\textbf{\Large{}No!}
\end{center}
\begin{itemize}
\pause{}\item{} Pretty much any \alert{program analysis tool}
\pause{}\item{} \alert{Profiling} with polling profilers
\pause{}\item{} \alert{Exception handling} in C++
\end{itemize}
\vspace{2em}
\begin{center}
\textbf{\Large{}Debug data is not only for debugging}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Compiling stack unwinding data ahead-of-time}
\subsection*{}
\begin{frame}{Compilation overview}
\begin{itemize}
\item Compiled to \alert{C code}
\item C code then \alert{compiled to native binary} (gcc)
\begin{itemize}
\item[$\leadsto$] gcc optimisations for free
\end{itemize}
\item Compiled as \alert{separate \texttt{.so} files}, called \ehelfs{}
\bigskip{}
\item Morally a \alert{monolithic switch} on IPs
\item Each case contains assembly that computes a \alert{row of the
table}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Example}
\begin{frame}{Compilation example: original C, DWARF}
\lstinputlisting[language=C]{src/fib7/fib7.cfde}
\end{frame}
\begin{frame}[shrink]{Compilation example: generated C}
\lstinputlisting[language=C]{src/fib7/fib7.eh_elf_basic.c}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Compilation Strategy}
\begin{frame}{Compilation choices}
\textbf{In order to keep the compiler \alert{simple} and \alert{easily
testable}, the whole DWARF5 instruction set is not supported.}
\begin{itemize}
\item Focus on \alert{x86\_64}
\item Focus on unwinding return address \\
\vspace{0.3ex}
$\leadsto$ \textit{Allows building a backtrace}
\begin{itemize}
\item \alert{suitable for perf, not for gdb}
\item Only supports \alert{unwinding registers}: \reg{rip}, \reg{rsp},
\reg{rbp}, \reg{rbx}
\item Supports the \alert{wide majority} ($> 99.9\%$) of instructions
used
\item Among \alert{4000} randomly sampled filed, only \alert{24}
containing unsupported instructions
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}{Interface: libunwind}
\begin{itemize}
\item \alert{libunwind}: \textit{de facto} standard library for
unwinding
\item Relies on DWARF
\bigskip{}
\item \texttt{libunwind-eh\_elf}: alternative implementation using
\ehelfs{}
\item[$\leadsto$] \alert{alternative implementation} of libunwind,
almost plug-and-play for existing projects!
\begin{itemize}
\item[$\leadsto$] It is \alert{easy} to use \ehelfs{}: just
link against the right library!
\end{itemize}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Outlining}
\begin{frame}{Size optimisation: outlining}
\begin{itemize}
\item This \alert{works}, but \alert{takes space}: about \alert{7 times
larger in size} than regular DWARF\@.
\item DWARF optimisation strategy: \alert{alter previous row}. \\
Causes slowness: we cannot do that.
\item Remark: a lot of lines appear often.
\begin{itemize}
\item[$\leadsto$] \textbf{\emph{outline} them!}
\end{itemize}
\pause{}
\item On libc, $20\,827$ rows $\rightarrow$ $302$ outlined ($1.5\,\%$)
\item Turn the big switch into a binary search \alert{if/else tree}
\end{itemize}
\pause{}
\bigskip{}
\begin{center}
$\leadsto$ only \textbf{2.5 times bigger than DWARF}
\end{center}
\end{frame}
\begin{frame}{Example with outlining}
\lstinputlisting[language=C]{src/fib7/fib7.eh_elf_outline.c}
\end{frame}
\subsection{A word on formalization}
\begin{frame}[t]{A word on formalization}
\begin{itemize}
\item First task: \alert{writing semantics} for DWARF, written as
mapping to C code.
\item DWARF5 specification: \alert{plain English}, no proper semantics
\item Compiled code is in substance equivalent to semantics
\item What remains to prove is mostly \alert{simple or classic
optimisations}
\end{itemize}
\pause{}
\vspace{-3cm}
\begin{center}
\includegraphics[width=0.8\linewidth, angle=10]{img/dw_spec.png}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Benchmarking}
\begin{frame}{Benchmarking requirements}
\begin{enumerate}
\item Thousands of samples (single unwind: $10\,\mu{}s$)
\item Interesting enough program to unwind: nested functions, complex
FDEs
\item Mitigate caching: don't always unwind from the \emph{same} point
\item Yet be fair: don't always unwind from totally different places
\item Distribute evenly: if possible, also from within libraries
\end{enumerate}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{perf instrumentation}
\textbf{\alert{perf} is the state-of-the-art polling profiler for Linux.}
\begin{itemize}
\item{} used to get readings of the time spent in each function
\item{} works by regularly stopping the program, unwinding its stack,
then aggregating the gathered data
\end{itemize}
\pause{}\bigskip{}
\textbf{Instrumenting perf matches all the requirements!}
\begin{itemize}
\item{} \alert{Plug \ehelfs{} into perf}: use \ehelfs{} instead of
DWARF to unwind the stack
\item{} Implement \alert{unwinding performance counters} inside perf
\bigskip{}
\item{} Use perf on \alert{hackbench}, a kernel stress-test program
\begin{itemize}
\item Small program
\item Lots of calls
\item Relies on libc, libpthread
\end{itemize}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Results}
\begin{frame}{Time performance}
\small
\centering
\begin{tabular}{l r r r r r}
\toprule
\thead{Unwinding method} & \thead{Frames \\ unwound}
& \thead{Tot.\ time \\ ($\mu s$)}
& \thead{Avg. \\ time / frame \\ ($ns$)}
& \thead{Time \\ ratio} \\
\midrule
\alert{\ehelfs{}}
& 23506 % Frames unwound
& 14837 % Total time
& 631 % Avg time
& 1
\\
\prog{libunwind}, \alert{cached}
& 27058 % Frames unwound
& 441601 % Total time
& 16320 % Avg time
& \alert{25.9}
\\
\prog{libunwind}, \alert{uncached}
& 27058 % Frames unwound
& 671292 % Total time
& 24809 % Avg time
& \alert{39.3}
\\
\bottomrule
\end{tabular}
\end{frame}
\begin{frame}{Space performance}
\begin{center}
\begin{tabular}{r r r r r r}
\toprule
\thead{Object}
& \thead{\% of binary size}
& \thead{Growth factor} \\
\midrule
libc
& 21.88 & 2.41 \\
libpthread
& 43.71 & 2.19 \\
ld
& 22.09 & 2.97 \\
hackbench
& 93.87 & 4.99 \\
\midrule
Total
& 22.81 & \alert{2.44} \\
\bottomrule
\end{tabular}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{}
\setcounter{section}{0}
\begin{frame}{What next?}
\begin{itemize}
\item Implement a release-ready, packageable, easy to use version of
perf with \ehelfs{} and submit it for inclusion
\item{} Measure \alert{C++ exceptions overhead} precisely in common
software
\item{} Implement \alert{\ehelfs{}} support for \alert{C++ runtime}
exception handling, and other systems where unwinding is a
performance bottleneck
\medskip
\item \alert{Outlining} was effective for
compactness\ldots{} Try outlining DWARF bytecode\@?
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\vspace{5mm}
\includegraphics[width=\linewidth]{img/keep_breathing}
\vspace{-1cm}
\begin{center}
\large
\begin{align*}
\textbf{Slides: } &\text{\url{https://tobast.fr/m2/slides.pdf}} \\
\textbf{Report: } &\text{\url{https://tobast.fr/m2/report.pdf}}
\end{align*}
\end{center}
\end{frame}
\end{document}