% vim: spell spelllang=en \documentclass[11pt,xcolor={usenames,dvipsnames}]{beamer} \usetheme{Warsaw} \usepackage[utf8]{inputenc} \usepackage[english]{babel} \usepackage[T1]{fontenc} \usepackage{amsmath} \usepackage{amsfonts} \usepackage{amssymb} \usepackage{booktabs} \usepackage{makecell} \usepackage{ifthen} \usepackage{colortbl} \usepackage{tabularx} \usepackage{pifont} \usepackage{multirow} \usepackage{texlib/my_listings} \usepackage{texlib/specific} \usepackage{texlib/common} \usepackage{texlib/todo} \usepackage{inconsolata} \lstset{basicstyle=\footnotesize\ttfamily} \renewcommand\theadalign{c} \renewcommand\theadfont{\scriptsize\bfseries} \setbeamertemplate{navigation symbols}{} \setbeamertemplate{headline}{} \newcommand{\thenalert}[1]{\only<1>{#1}\only<2>{\alert{#1}}} \newcommand{\slidecountline}{ \ifthenelse{\theframenumber = 0} {} {\insertframenumber/\inserttotalframenumber}} \newcommand{\sectionline}{ \ifthenelse{\thesection = 0} {} {\Roman{section}~-- \insertsection}} \newcommand{\cmark}{\color{OliveGreen}\ding{52}} \newcommand{\xmark}{\color{BrickRed}\ding{56}} \AtBeginSection{ \begin{frame} \vfill \centering \begin{beamercolorbox}[sep=8pt,center,shadow=true,rounded=true]{title} \usebeamerfont{title}\insertsectionhead\par% \end{beamercolorbox} \vfill \end{frame} } \lstdefinelanguage{gdb}{ morekeywords={gdb}, sensitive=false, } \newcolumntype{b}{X} \newcolumntype{s}{>{\hsize=.43\hsize}X} \newcommand{\lstinl} {\lstinline[language=C, keepspaces=true, basicstyle=\ttfamily]} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \title[\sectionline] {Reliable and Fast DWARF-based Stack Unwinding} \author[\slidecountline]{\textbf{Théophile Bastian},\\ \textbf{Stephen Kell}, \\ \textbf{Francesco Zappa Nardelli}} \date{} %\subject{} %\logo{} \institute{ENS Paris, University of Kent, Inria} \begin{document} \begin{frame} \addtocounter{framenumber}{-1} \titlepage{} \vspace{-2em} \begin{center} \begin{align*} \text{Slides: } &\text{\todo{add URL for this PDF}} 
\\ \end{align*} \end{center} \end{frame} \begin{frame}{~} \addtocounter{framenumber}{-1} \tableofcontents[hideallsubsections] \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{DWARF and stack unwinding data} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Introduction} \begin{frame}[fragile]{We often use stack unwinding!} \begin{columns}[c] \begin{column}{0.70\textwidth} \begin{lstlisting}[language=gdb, numbers=none, escapechar=|] Program received signal SIGSEGV. 0x54625 in fct_b at segfault.c:5 5 printf("%l\n", *b); |\pause| (gdb) backtrace #0 0x54625 in fct_b at segfault.c:5 #1 0x54663 in fct_a at segfault.c:10 #2 0x54674 in main at segfault.c:14 |\pause| (gdb) frame 1 #1 0x54663 in fct_a at segfault.c:10 10 fct_b((int*) a); |\pause| (gdb) print a $1 = 84 \end{lstlisting} \vspace{-1em} \pause{} \begin{center} \textbf{\Large How does it work?!} \end{center} \end{column} \begin{column}{0.35\textwidth} \pause{} \includegraphics[width=0.95\linewidth]{img/call_stack} \end{column} \end{columns} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Stack frames and unwinding} \begin{frame}{Call stack and registers} \begin{columns}[c] \begin{column}{0.55\textwidth} \begin{center} \large\bf How do we get the grandparent RA\@? \medskip Isn't it as trivial as \texttt{pop()}? 
\vspace{2em} \onslide<2>{We only have \reg{rsp} and \reg{rip}.} \end{center} \end{column} \begin{column}{0.45\textwidth} \includegraphics[width=0.95\linewidth]{img/call_stack} \end{column} \end{columns} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{DWARF tables} \newcolumntype{a}{>{\columncolor{RedOrange}}l} \begin{frame}{DWARF unwinding data} \vspace{2em} \tt \footnotesize \begin{tabular}{ >{\columncolor{YellowGreen}}l >{\columncolor{Thistle}}l l l l l l l >{\columncolor{Apricot}}l} ~LOC & CFA & rbx & rbp & r12 & r13 & r14 & r15 & ra \\ 0084950 & rsp+8 & u & u & u & u & u & u & c-8 \\ 0084952 & rsp+16 & u & u & u & u & u & c-16 & c-8 \\ 0084954 & rsp+24 & u & u & u & u & c-24 & c-16 & c-8 \\ 0084956 & rsp+32 & u & u & u & c-32 & c-24 & c-16 & c-8 \\ 0084958 & rsp+40 & u & u & c-40 & c-32 & c-24 & c-16 & c-8 \\ 0084959 & rsp+48 & u & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\ \rowcolor{Aquamarine} 008495a & rsp+56 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\ 0084962 & rsp+64 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\ 0084a19 & rsp+56 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\ 0084a1d & rsp+48 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\ 0084a1e & rsp+40 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\ 0084a20 & rsp+32 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\ 0084a22 & rsp+24 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\ 0084a24 & rsp+16 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\ 0084a26 & rsp+8 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\ 0084a30 & rsp+64 & c-56 & c-48 & c-40 & c-32 & c-24 & c-16 & c-8 \\ \end{tabular} \pause{} \vspace{-3cm} \hfill\includegraphics[height=3cm, angle=45, origin=c]{img/dwarf_logo} \hspace{-1cm} \end{frame} \begin{frame}[t, fragile]{The real DWARF} \begin{lstlisting}[numbers=none, language=] 00009b30 48 009b34 FDE cie=0000 pc=0084950..0084b37 DW_CFA_advance_loc: 2 to 0000000000084952 DW_CFA_def_cfa_offset: 16 DW_CFA_offset: 
r15 (r15) at cfa-16 DW_CFA_advance_loc: 2 to 0000000000084954 DW_CFA_def_cfa_offset: 24 DW_CFA_offset: r14 (r14) at cfa-24 DW_CFA_advance_loc: 2 to 0000000000084956 DW_CFA_def_cfa_offset: 32 DW_CFA_offset: r13 (r13) at cfa-32 DW_CFA_advance_loc: 2 to 0000000000084958 DW_CFA_def_cfa_offset: 40 DW_CFA_offset: r12 (r12) at cfa-40 DW_CFA_advance_loc: 1 to 0000000000084959 [...] \end{lstlisting} \begin{itemize} \item[\textbf{$\longrightarrow$}] \textbf{\alert{constructed} on demand by a \alert{Turing-complete bytecode}!} \end{itemize} \pause{} \vspace{-6.5cm} \begin{center} \bf \fontsize{8cm}{1cm} \colorbox{white}{\alert{Complex}} \\ \colorbox{white}{\alert{\& slow!}} \end{center} \end{frame} \begin{frame}{Why does slow matter?} \begin{itemize} \item{} After all, we're talking about \alert{debugging procedures} run by a \alert{human being} (slower than the machine). \ldots{}or are we? \end{itemize} \pause{} \begin{center} \textbf{\Large{}No!} \end{center} \begin{itemize} \pause{}\item{} Pretty much any \alert{program analysis tool} \pause{}\item{} \alert{Profiling} with polling profilers \pause{}\item{} \alert{Exception handling} in C++ \end{itemize} \vspace{2em} \begin{center} \textbf{\Large{}Debug data is not only for debugging} \end{center} \vspace{1em} $\leadsto$ we might want \alert{an alternative time/space trade-off} \end{frame} \newcommand{\LinusMailOne}{ ``Sorry, but last time was too f\dots painful. The whole (and only) point of unwinders is to make debugging easy when a bug occurs. 
But \alert{the dwarf unwinder had bugs} itself, or \alert{our dwarf information had bugs}, and in either case it actually turned several trivial bugs into a \alert{total undebuggable hell}.'' } \newcommand{\LinusMailTwo}{ ``If you can \alert{mathematically prove that the unwinder is correct} — even in the presence of bogus and actively incorrect unwinding information — and never ever follows a bad pointer, \alert{I’ll reconsider}.'' } \newcommand{\LinusSource}{ \hfill ---~Linus Torvalds, Kernel mailing list, 2012 } \begin{frame}{A debugging hell: Linux kernel} \LinusMailOne{} \only<1-2>{ \vspace{1em} \LinusSource{} } \vspace{1em} \only<2>{ \begin{center} \Large\bf \alert{This is where we still are!} \end{center} } \only<3>{ \LinusMailTwo{} \vspace{1em} \LinusSource{} } \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Unwinding data as an abstract state} \newcommand{\tblrowval}[4]{#1 & #2 & \only<2->{#3} & \only<2->{#4} \\} \newcommand{\blknote}[1] {\begin{block}{} \centering\large #1 \end{block}} \newcommand{\blklnote}[1] {\begin{block}{} \large #1 \end{block}} \newcommand{\tblhl}{\rowcolor{Tan}} \begin{frame}{Working on an example} \newcommand{\firsttblrows}{ \tblrowval{\hspace{-2ex}<{\bf foo}>:}{}{\textbf{CFA}}{\textbf{ra}} \rowonly<3>{\tblhl{}} \tblrowval{push}{\%r15}{rsp+8}{c-8} \rowonly<4>{\tblhl{}} \tblrowval{push}{\%r14}{rsp+16}{c-8} \rowonly<5>{\tblhl{}} \tblrowval{mov}{\$0x3,\%eax}{rsp+24}{c-8} \rowonly<6>{\tblhl{}} \tblrowval{push}{\%r13}{rsp+24}{c-8} \tblrowval{push}{\%r12}{rsp+32}{c-8} \tblrowval{push}{\%rbp}{rsp+40}{c-8} \tblrowval{push}{\%rbx}{rsp+48}{c-8} \tblrowval{sub}{\$0x68,\%rsp}{rsp+56}{c-8} } \only<-8>{ \begin{table} \ttfamily\large \begin{tabularx}{0.9\linewidth}{ l b >{\columncolor{SkyBlue}}s >{\columncolor{SkyBlue}}s } \firsttblrows{}% \tblrowval{add}{\$0x68,\%rsp}{rsp+160}{c-8} \tblrowval{pop}{\%rbx}{rsp+56}{c-8} \tblrowval{pop}{\%rbp}{rsp+48}{c-8} \end{tabularx} \end{table} \blknote{ 
\centering \begin{overlayarea}{0.9\textwidth}{4.8ex} \only<3>{Upon function call, \alert{ra = *(\reg{rsp})} (ABI)} \only<4>{\texttt{push} decreases \reg{rsp} by 8: % \alert{ra = *(\reg{rsp} + 8)}} \only<5>{and again: % \alert{ra = *(\reg{rsp} + 16)}} \only<6>{This \texttt{mov} leaves \reg{rsp} untouched: % \alert{ra = *(\reg{rsp} + 16)}} \only<7>{The unwinding table can actually be seen as\\ an \alert{abstract interpretation} of the code\ldots} \only<8>{\ldots and thus, for a given run, be \alert{re-computed from scratch}} \end{overlayarea} } } \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Unwinding data synthesis from binaries} \begin{frame}{Why would synthesis be useful?} \begin{itemize} \item As said earlier, \alert{DWARF is complex} \item Some compilers \alert{do not generate it}: hard to \alert{debug} \& \alert{profile}. \item Think of \alert{JIT-compiled assembly} (e.g.\ JVM) \item \ldots{}or even \alert{hand-written inlined assembly}! \begin{itemize} \item Painful enough to write without also bothering with DWARF \item May not even be known by the programmer, breaks gdb \item May be wrong (remember Linus!) \end{itemize} \end{itemize} \end{frame} \begin{frame}{What have we got so far?} We now want to \alert{synthesize unwinding data}. That means \alert{forgetting the blue part of the previous table}. \begin{itemize} \item Upon entering a function, we know (ABI) \[ \cfa = \reg{rsp} + 8 \qquad \ra = *(\cfa - 8) \] \item For each instruction, we know \alert{how it changes \cfa}. \item We assume \alert{\ra{} constant w.r.t.\ \cfa}. 
\begin{itemize} \item[$\leadsto$] only \cfa{} tracking matters (for unwinding) \end{itemize} \item We had a working strategy for a \alert{linear execution} \item We still have to handle \begin{itemize} \item \alert{\cfa{} expression} \item \alert{control flow graph} \end{itemize} \end{itemize} \end{frame} \begin{frame}{\cfa{} expression} Two possibilities: \begin{itemize} \item Either we track \cfa{} w.r.t.\ \reg{rsp} \begin{itemize} \item and update it after each instruction if needed \end{itemize} \item Or \reg{rbp} is used as base pointer: easy \end{itemize} \end{frame} \begin{frame}{Control flow graph} \begin{columns}[c] \column{0.4\textwidth} \lstinputlisting[language=C]{src/cfg/cfg.c} \column{0.30\textwidth} \begin{figure} \centering \includegraphics[width=\textwidth]{src/cfg/cfg.png} \end{figure} \end{columns} \begin{itemize} \item \alert{Upon split} (e.g.\ \texttt{X})\alert{:} nothing special, propagate end state of X to child nodes A and B \item \alert{Upon join} (e.g.\ \texttt{while\_end})\alert{:} check consistency of both input states \begin{itemize} \item If tricky, \texttt{gcc} will have used \reg{rbp}, even with \texttt{-fomit-frame-pointer}. \end{itemize} \end{itemize} \end{frame} \begin{frame}{} \vfill \centering \begin{beamercolorbox}[sep=8pt,center,shadow=true,rounded=true]{title} \Large\bf Demo time! 
\end{beamercolorbox} \vfill \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Unwinding data compilation} \subsection{Compilation ahead-of-time} \begin{frame}{Compilation overview} \begin{itemize} \item Compiled to \alert{C code} \item C code then \alert{compiled to native binary} (gcc) \begin{itemize} \item[$\leadsto$] gcc optimisations for free \end{itemize} \item Compiled as \alert{separate \texttt{.so} files}, called \ehelfs{} \bigskip{} \item Morally a \alert{monolithic switch} on IPs \item Each case contains assembly that computes a \alert{row of the table} \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[shrink]{Compilation example: generated C} \lstinputlisting[language=C]{src/fib7/fib7.eh_elf_basic.c} \pause{} \vspace{1em} \begin{center} The real code is optimised, but boils down to this. \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}{Mostly plug-and-play: libunwind interface} \begin{itemize} \item \alert{libunwind}: \textit{de facto} standard library for unwinding \item Relies on DWARF \bigskip{} \item \texttt{libunwind-eh\_elf}: alternative implementation using \ehelfs{} \item[$\leadsto$] almost \alert{``relink-and-play''} for existing projects! \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Results} \begin{frame}{Time performance} \begin{columns} \begin{column}{1.1\textwidth} \begin{table}[h] \centering \begin{tabular}{l l r r r r r} \toprule & \thead{Unwinding method} & \thead{Frames \\ unwound} & \thead{Tot. time \\ ($\mu s$)} & \thead{Avg. 
\\ time / frame \\ ($ns$)} & \thead{Time ratio} \\ \midrule \midrule \multirow{2}{*}{\rotatebox{90}{\textbf{\prog{Gzip}}~~}} &\alert{\ehelfs{}} & 331523 % Frames unwound & 25930 % Total time & 78 % Avg time & 1 \\ & \prog{libunwind}, \alert{cached} & 331523 % Frames unwound & 403292 % Total time & 1217 % Avg time & \alert{15.6} \\ &\prog{libunwind}, \alert{uncached} & 331523 % Frames unwound & 2197296 % Total time & 6635 % Avg time & \alert{84.7} \\ \midrule \multirow{2}{*}{\rotatebox{90}{\textbf{\prog{hackbench}}}} & \alert{\ehelfs{}} & 152297 % Frames unwound & 12941 % Total time & 84 % Avg time & 1 \\ & \prog{libunwind}, \alert{cached} & 152297 % Frames unwound & 316907 % Total time & 2076 % Avg time & \alert{24.6} \\ & \prog{libunwind}, \alert{uncached} & 152297 % Frames unwound & 982697 % Total time & 6439 % Avg time & \alert{76.3}\vspace{0.8em} \\ \bottomrule \end{tabular} \end{table} \end{column} \end{columns} \end{frame} \begin{frame}{Space overhead} \begin{table}[h] \centering \begin{tabular}{l r r r r} \toprule \thead{Shared object} & \thead{Original \\ \lstinline{.eh\_frame}} & \thead{Generated \\ \lstinline{eh_elf} \lstinline{.text}} & \thead{\% of original \\ program size} & \thead{Growth \\ factor} \\ \midrule find & 21.3 KiB & 68.3 KiB & 46.63 & 3.21 \\ \hfill + libs & 196.6 KiB & 577.2 KiB & 19.75 & 2.94 \\ \hline python3.7 & 160.0 B & 1.4 KiB & 355.98 & 8.33 \\ \hfill + libs & 449.0 KiB & 1.1 MiB & 23.77 & 2.61 \\ \hline gzip & 5.1 KiB & 10.9 KiB & 16.48 & 2.13 \\ \hfill + libs & 143.5 KiB & 413.1 KiB & 24.96 & 2.88 \\ \hline hackbench & 568.0 B & 3.2 KiB & 107.99 & 5.74 \\ \hfill + libs & 150.4 KiB & 439.4 KiB & 26.60 & 2.92 \\ \hline sqlite & 121.7 KiB & 382.8 KiB & 34.68 & 3.14 \\ \hfill + libs & 376.2 KiB & 1.1 MiB & 25.32 & 3.00 \\ \bottomrule \end{tabular} \end{table} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section*{Conclusion} \setcounter{section}{0} \begin{frame}{A fragment of our 
article} The original article \textbf{Reliable and Fast DWARF-based Stack Unwinding} contains \vspace{1em} \begin{itemize} \item{} DWARF unwinding tables validation; \item{} DWARF unwinding tables synthesis; \item{} DWARF-based unwinding speedup. \end{itemize} \vspace{1em} \begin{center} Come and chat if interested! \texttt{:)} \end{center} \end{frame} \end{document}