% vim: spell spelllang=en

% Beamer slide deck: preamble, title frame and table-of-contents frame.
\documentclass[11pt,xcolor={usenames,dvipsnames}]{beamer}
\usetheme{Warsaw}

\usepackage[utf8]{inputenc}
\usepackage[english]{babel}
\usepackage[T1]{fontenc}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{booktabs}
\usepackage{makecell}
\usepackage{ifthen}
\usepackage{colortbl}
% Project-local packages.
% NOTE(review): \reg, \prog and \ehelfs are used in the slides but are not
% defined in this file; presumably they come from these packages -- confirm.
\usepackage{../shared/my_listings}
%\usepackage{../shared/my_hyperref}
\usepackage{../shared/specific}
\usepackage{../shared/common}
\usepackage{../shared/todo}
\usepackage{inconsolata}

% Default style for every code listing.
\lstset{basicstyle=\footnotesize\ttfamily}

% makecell table headers (\thead): centred, small and bold.
\renewcommand\theadalign{c}
\renewcommand\theadfont{\scriptsize\bfseries}

% Remove the navigation symbols and the theme headline.
\setbeamertemplate{navigation symbols}{}
\setbeamertemplate{headline}{}

% \thenalert{text}: plain text on overlay 1, alerted text on overlay 2.
\newcommand{\thenalert}[1]{\only<1>{#1}\only<2>{\alert{#1}}}

% "current/total" frame counter, empty while the frame counter is 0.
% Passed below as the *short author*.
% The trailing % signs keep the line ends from being typeset as spaces.
\newcommand{\slidecountline}{%
    \ifthenelse{\theframenumber = 0}%
        {}%
        {\insertframenumber/\inserttotalframenumber}}

% "<Roman section number> -- <section name>", empty while the section
% counter is 0. Passed below as the *short title*.
\newcommand{\sectionline}{%
    \ifthenelse{\thesection = 0}%
        {}%
        {\Roman{section}~-- \insertsection}}

% Insert a frame showing the section heading in a title-coloured box at the
% start of every section.
\AtBeginSection[]{
    \begin{frame}
        \vfill
        \centering
        \begin{beamercolorbox}[sep=8pt,center,shadow=true,rounded=true]{title}
            \usebeamerfont{title}\insertsectionhead\par%
        \end{beamercolorbox}
        \vfill
    \end{frame}
}

% Minimal listings language used for the gdb session transcript.
\lstdefinelanguage{gdb}{
    morekeywords={gdb},
    sensitive=false,
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% \small is a font-size switch, not a one-argument command: scope it with
% braces instead of writing \small{...}.
\author[\slidecountline]{Théophile \textsc{Bastian} \\
    {\small Under supervision of Francesco Zappa Nardelli}}
\title[\sectionline]
    {Speeding up stack unwinding by compiling DWARF debug data}
\date{March\ --\ August 2018}
%\subject{}
%\logo{}
\institute{Team PARKAS, INRIA, Paris}

\begin{document}

% Title frame; the counter is decremented so this frame is not counted.
\begin{frame}
    \addtocounter{framenumber}{-1}
    \titlepage{}

    \vspace{-2em}
    \begin{center}
        \begin{align*}
            \text{Slides: } &\text{\url{https://tobast.fr/m2/slides.pdf}} \\
            \text{Report: } &\text{\url{https://tobast.fr/m2/report.pdf}}
        \end{align*}
    \end{center}
\end{frame}

% Outline frame (sections only); not counted either.
\begin{frame}{~}
    \addtocounter{framenumber}{-1}
    \tableofcontents[hideallsubsections]
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Stack unwinding data}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Introduction}

% gdb session revealed step by step (\pause is reached through escapechar=|).
% Fragile frame: the listing body starts at column 0 and \end{frame} stands
% alone at the start of its line.
% NOTE(review): the two column widths add up to 1.05\textwidth -- confirm this
% does not produce an overfull box.
\begin{frame}[fragile]{We often use stack unwinding!}
    \begin{columns}[c]
        \begin{column}{0.70\textwidth}
            \begin{lstlisting}[language=gdb, numbers=none, escapechar=|]
Program received signal SIGSEGV.
0x54625 in fct_b at segfault.c:5
5 printf("%l\n", *b);
|\pause|
(gdb) backtrace
#0 0x54625 in fct_b at segfault.c:5
#1 0x54663 in fct_a at segfault.c:10
#2 0x54674 in main at segfault.c:14
|\pause|
(gdb) frame 1
#1 0x54663 in fct_a at segfault.c:10
10 fct_b((int*) a);
|\pause|
(gdb) print a
$1 = 84
\end{lstlisting}

            \vspace{-1em}
            \pause{}
            \begin{center}
                \textbf{\Large How does it work?!}
            \end{center}
        \end{column}
        \begin{column}{0.35\textwidth}
            \pause{}
            \includegraphics[width=0.95\linewidth]{img/stack/call_stack}
        \end{column}
    \end{columns}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Stack frames and unwinding}

\begin{frame}{Call stack and registers}
    \begin{columns}[c]
        \begin{column}{0.55\textwidth}
            \begin{center}
                % \bfseries replaces the obsolete \bf switch.
                \large\bfseries
                How do we get the grandparent RA\@?

                \medskip

                Isn't it as trivial as \texttt{pop()}?

                \vspace{2em}

                \only<2>{We only have \reg{rsp} and \reg{rip}.}
            \end{center}
        \end{column}
        \begin{column}{0.45\textwidth}
            \includegraphics[width=0.95\linewidth]{img/stack/call_stack}
        \end{column}
    \end{columns}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{DWARF tables}

\newcolumntype{a}{>{\columncolor{RedOrange}}l}

% Unwinding table: one row per code location, one column per register.
% The DWARF logo is overlaid on the second overlay.
\begin{frame}{DWARF unwinding data}
    \vspace{2em}

    % \ttfamily replaces the obsolete \tt switch.
    \ttfamily\footnotesize
    \begin{tabular}{
            >{\columncolor{YellowGreen}}l
            >{\columncolor{Thistle}}l
            l l l l l l
            >{\columncolor{Apricot}}l}
        ~LOC    & CFA    & rbx  & rbp  & r12  & r13  & r14  & r15  & ra  \\
        0084950 & rsp+8  & u    & u    & u    & u    & u    & u    & c-8 \\
        0084952 & rsp+16 & u    & u    & u    & u    & u    & c-16 & c-8 \\
        0084954 & rsp+24 & u    & u    & u    & u    & c-24 & c-16 & c-8 \\
        0084956 & rsp+32 & u    & u    & u    & c-32 & c-24 & c-16 & c-8 \\
        0084958 & rsp+40 & u    & u    & c-40 & c-32 & c-24 & c-16 & c-8 \\
        0084959 & rsp+48 & u    & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
        \rowcolor{Aquamarine}
        008495a & rsp+56 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
        0084962 & rsp+64 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
        0084a19 & rsp+56 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
        0084a1d & rsp+48 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
        0084a1e & rsp+40 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
        0084a20 & rsp+32 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
        0084a22 & rsp+24 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
        0084a24 & rsp+16 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
        0084a26 & rsp+8  & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
        0084a30 & rsp+64 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\
    \end{tabular}

    \pause{}
    \vspace{-3cm}
    \hfill\includegraphics[height=3cm, angle=45, origin=c]{img/dwarf_logo}
    \hspace{-1cm}
\end{frame}

% Raw FDE dump; a large "Slow!" is overlaid on the second overlay.
% Fragile frame: listing body at column 0, \end{frame} alone on its line.
\begin{frame}[t, fragile]{The real DWARF}
    \begin{lstlisting}[numbers=none, language=]
00009b30 48 009b34 FDE cie=0000 pc=0084950..0084b37
DW_CFA_advance_loc: 2 to 0000000000084952
DW_CFA_def_cfa_offset: 16
DW_CFA_offset: r15 (r15) at cfa-16
DW_CFA_advance_loc: 2 to 0000000000084954
DW_CFA_def_cfa_offset: 24
DW_CFA_offset: r14 (r14) at cfa-24
DW_CFA_advance_loc: 2 to 0000000000084956
DW_CFA_def_cfa_offset: 32
DW_CFA_offset: r13 (r13) at cfa-32
DW_CFA_advance_loc: 2 to 0000000000084958
DW_CFA_def_cfa_offset: 40
DW_CFA_offset: r12 (r12) at cfa-40
DW_CFA_advance_loc: 1 to 0000000000084959
[...]
\end{lstlisting}
    \begin{itemize}
        \item[\textbf{$\longrightarrow$}]
            \textbf{\alert{constructed} on-demand by a
                \alert{Turing-complete bytecode}!}
    \end{itemize}

    \pause{}
    \vspace{-5.5cm}
    \begin{center}
        % \fontsize only takes effect once \selectfont is issued.
        \bfseries\fontsize{8cm}{1cm}\selectfont
        \colorbox{white}{\alert{Slow!}}
    \end{center}
\end{frame}

\begin{frame}{Why does slow matter?}
    \begin{itemize}
        \item{} After all, we're talking about \alert{debugging procedures}
            run by a \alert{human being} (slower than the machine).
            \ldots{}or are we?
    \end{itemize}

    \pause{}
    \begin{center}
        \textbf{\Large{}No!}
    \end{center}

    \begin{itemize}
        \pause{}\item{} Pretty much any \alert{program analysis tool}
        \pause{}\item{} \alert{Profiling} with polling profilers
        \pause{}\item{} \alert{Exception handling} in C++
    \end{itemize}

    \vspace{2em}
    \begin{center}
        \textbf{\Large{}Debug data is not only for debugging}
    \end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Compiling stack unwinding data ahead-of-time}
\subsection*{}

\begin{frame}{Compilation overview}
    \begin{itemize}
        \item Compiled to \alert{C code}
        \item C code then \alert{compiled to native binary} (gcc)
            \begin{itemize}
                \item[$\leadsto$] gcc optimisations for free
            \end{itemize}
        \item Compiled as \alert{separate \texttt{.so} files}, called
            \ehelfs{}

        \bigskip{}

        \item Morally a \alert{monolithic switch} on IPs
        \item Each case contains assembly that computes a \alert{row of the
            table}
    \end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Example}

\begin{frame}{Compilation example: original C, DWARF}
    \lstinputlisting[language=C]{src/fib7/fib7.cfde}
\end{frame}

\begin{frame}[shrink]{Compilation example: generated C}
    \lstinputlisting[language=C]{src/fib7/fib7.eh_elf_basic.c}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Compilation Strategy}

\begin{frame}{Compilation choices}
    \textbf{In order to keep the compiler \alert{simple} and \alert{easily
        testable}, only a subset of the DWARF5 instruction set is supported.}

    \begin{itemize}
        \item Focus on \alert{x86\_64}
        \item Focus on unwinding return address \\
            \vspace{0.3ex}
            $\leadsto$ \textit{Allows building a backtrace}
            \begin{itemize}
                \item \alert{suitable for perf, not for gdb}
                \item Only supports \alert{unwinding registers}: \reg{rip},
                    \reg{rsp}, \reg{rbp}, \reg{rbx}
                \item Supports the \alert{vast majority} ($> 99.9\%$) of
                    instructions used
                \item Among \alert{4000} randomly sampled files, only
                    \alert{24} contain unsupported instructions
            \end{itemize}
    \end{itemize}
\end{frame}

\begin{frame}{Interface: libunwind}
    \begin{itemize}
        \item \alert{libunwind}: \textit{de facto} standard library for
            unwinding
        \item Relies on DWARF

        \bigskip{}

        \item \texttt{libunwind-eh\_elf}: alternative implementation using
            \ehelfs{}
        \item[$\leadsto$] \alert{alternative implementation} of libunwind,
            almost plug-and-play for existing projects!
            \begin{itemize}
                \item[$\leadsto$] It is \alert{easy} to use \ehelfs{}: just
                    link against the right library!
            \end{itemize}
    \end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Outlining}

\begin{frame}{Size optimisation: outlining}
    \begin{itemize}
        \item This \alert{works}, but \alert{takes space}: about \alert{7
            times larger in size} than regular DWARF\@.
        \item DWARF optimisation strategy: \alert{alter previous row}. \\
            Causes slowness: we cannot do that.
        \item Remark: a lot of lines appear often.
            \begin{itemize}
                \item[$\leadsto$] \textbf{\emph{outline} them!}
            \end{itemize}

        \pause{}

        \item On libc, $20\,827$ rows $\rightarrow$ $302$ outlined
            ($1.5\,\%$)
        \item Turn the big switch into a binary search \alert{if/else tree}
    \end{itemize}

    \pause{}
    \bigskip{}
    \begin{center}
        $\leadsto$ only \textbf{2.5 times bigger than DWARF}
    \end{center}
\end{frame}

\begin{frame}{Example with outlining}
    \lstinputlisting[language=C]{src/fib7/fib7.eh_elf_outline.c}
\end{frame}

\subsection{A word on formalization}

% The specification screenshot is overlaid on the second overlay.
\begin{frame}[t]{A word on formalization}
    \begin{itemize}
        \item First task: \alert{writing semantics} for DWARF, written as
            mapping to C code.
        \item DWARF5 specification: \alert{plain English}, no proper
            semantics
        \item Compiled code is in substance equivalent to semantics
        \item What remains to prove is mostly \alert{simple or classic
            optimisations}
    \end{itemize}

    \pause{}
    \vspace{-3cm}
    \begin{center}
        \includegraphics[width=0.8\linewidth, angle=10]{img/dw_spec.png}
    \end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Benchmarking}

\begin{frame}{Benchmarking requirements}
    \begin{enumerate}
        % The unit is set upright (\mathrm) rather than as a math variable.
        \item Thousands of samples (single unwind: $10\,\mu\mathrm{s}$)
        \item Interesting enough program to unwind: nested functions, complex
            FDEs
        \item Mitigate caching: don't always unwind from the \emph{same}
            point
        \item Yet be fair: don't always unwind from totally different places
        \item Distribute evenly: if possible, also from within libraries
    \end{enumerate}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}{perf instrumentation}
    \textbf{\alert{perf} is the state-of-the-art polling profiler for Linux.}
    \begin{itemize}
        \item{} used to get readings of the time spent in each function
        \item{} works by regularly stopping the program, unwinding its stack,
            then aggregating the gathered data
    \end{itemize}

    \pause{}\bigskip{}

    \textbf{Instrumenting perf matches all the requirements!}
    \begin{itemize}
        \item{} \alert{Plug \ehelfs{} into perf}: use \ehelfs{} instead of
            DWARF to unwind the stack
        \item{} Implement \alert{unwinding performance counters} inside perf

        \bigskip{}

        \item{} Use perf on \alert{hackbench}, a kernel stress-test program
            \begin{itemize}
                \item Small program
                \item Lots of calls
                \item Relies on libc, libpthread
            \end{itemize}
    \end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Results}

\begin{frame}{Time performance}
    \small
    \centering
    % Five columns are used, so five are declared: an unused trailing column
    % would add padding and stretch the booktabs rules past the content.
    \begin{tabular}{l r r r r}
        \toprule
        \thead{Unwinding method} & \thead{Frames \\ unwound}
            & \thead{Tot.\ time \\ ($\mu$s)}
            & \thead{Avg. \\ time / frame \\ (ns)}
            & \thead{Time \\ ratio} \\
        \midrule
        \alert{\ehelfs{}}
            & 23506 % Frames unwound
            & 14837 % Total time
            & 631 % Avg time
            & 1
            \\
        \prog{libunwind}, \alert{cached}
            & 27058 % Frames unwound
            & 441601 % Total time
            & 16320 % Avg time
            & \alert{25.9}
            \\
        \prog{libunwind}, \alert{uncached}
            & 27058 % Frames unwound
            & 671292 % Total time
            & 24809 % Avg time
            & \alert{39.3}
            \\
        \bottomrule
    \end{tabular}
\end{frame}

\begin{frame}{Space performance}
    \begin{center}
        % Three columns are used, so three are declared (see above).
        \begin{tabular}{r r r}
            \toprule
            \thead{Object} & \thead{\% of binary size}
                & \thead{Growth factor} \\
            \midrule
            libc & 21.88 & 2.41 \\
            libpthread & 43.71 & 2.19 \\
            ld & 22.09 & 2.97 \\
            hackbench & 93.87 & 4.99 \\
            \midrule
            Total & 22.81 & \alert{2.44} \\
            \bottomrule
        \end{tabular}
    \end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Unnamed, unnumbered closing section; the section counter is reset to 0 for
% the remaining frames.
\section*{}
\setcounter{section}{0}

\begin{frame}{What next?}
    \begin{itemize}
        \item Implement a release-ready, packageable, easy to use version of
            perf with \ehelfs{} and submit it for inclusion
        \item{} Measure \alert{C++ exceptions overhead} precisely in common
            software
        \item{} Implement \alert{\ehelfs{}} support for \alert{C++ runtime}
            exception handling, and other systems where unwinding is a
            performance bottleneck

        \medskip

        \item \alert{Outlining} was effective for compactness\ldots{} Try
            outlining DWARF bytecode\@?
    \end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Closing frame: picture plus links to the slides and the report.
\begin{frame}
    \vspace{5mm}
    \includegraphics[width=\linewidth]{img/keep_breathing}
    \vspace{-1cm}
    \begin{center}
        \large
        \begin{align*}
            \textbf{Slides: } &\text{\url{https://tobast.fr/m2/slides.pdf}} \\
            \textbf{Report: } &\text{\url{https://tobast.fr/m2/report.pdf}}
        \end{align*}
    \end{center}
\end{frame}

\end{document}