\section{\cesasme: evaluate and compare state-of-the-art code analyzers}

% Motivating slide: one assembly loop, four tools, conflicting throughput predictions.
% This frame is [fragile] because it contains a listing: beamer requires the
% closing \end{frame} to sit alone on its own line, unindented.
\begin{frame}[fragile]
    \begin{minipage}{0.6\textwidth}
        \begin{center}
            Matrix multiplication:
        \end{center}
        % The listing body is verbatim: it must start on the line after
        % \begin{lstlisting}[...] and its leading whitespace is printed as-is.
        % NOTE(review): line breaks/indentation inside the listing were
        % reconstructed from a collapsed source -- confirm the intended layout.
\begin{lstlisting}[language={[x86masm]Assembler}]
loop:
    movsd (%rcx, %rax), %xmm0
    mulsd %xmm1, %xmm0
    addsd (%rdx, %rax), %xmm0
    movsd %xmm0, (%rdx, %rax)
    addq $8, %rax
    cmpq $0x2260, %rax
    jne loop\end{lstlisting}
    \end{minipage}\hfill\vrule\hfill
    \begin{minipage}{0.38\textwidth}
        \begin{tabular}{l r}
            \llvmmca{}: & 1.5 cycles/iter \\
            \iaca{}: & 2.0 cycles/iter \\
            \ithemal{}: & 2.0 cycles/iter \\
            \uica{}: & 3.0 cycles/iter \\
        \end{tabular}
        \vspace{1em}
        \begin{center}
            % Highlighted on the first slide only, plain bold afterwards.
            \only<1>{\alert{\textbf{Which tool is correct?}}}
            \only<2->{\textbf{Which tool is correct?}}
        \end{center}
    \end{minipage}
    \vfill{}
    \pause{}
    \begin{center}
        \textbf{We lack:}\\
        \hfill\textbf{\alert{Benchmarks}}\hfill\textbf{\alert{Context}}\hfill~
    \end{center}
\end{frame}

% Requirements on the benchmarks (left columns) and how each one is met
% (right column, uncovered one overlay at a time).
\begin{frame}{Generating benchmarks}
    We need benchmarks\ldots \\
    \vspace{1em}
    % The brace group keeps the row stretch and the \litem helper local to this table.
    {\renewcommand*{\arraystretch}{1.2}
    % \litem prints beamer's itemize bullet so that table rows look like list items.
    \newcommand{\litem}{\usebeamertemplate*{itemize item}\hspace{-0.5em}}
    \begin{tabular}{rl l}
        \litem{} & representative of scientific computation & \visible<2->{\alert{Polybench}}\\
        \litem{} & infinite, L1-resident loops & \visible<3->{\alert{``microkernelification''} + verify} \\
        \litem{} & without control flow & \visible<4->{\alert{Polybench}} \\
        \litem{} & stressing diverse resources & \visible<5->{\alert{Polyhedral transformations}} \\
        & & \visible<5->{+ \alert{unrolling} + \alert{compiler options}} \\
        \litem{} & plenty of them & \visible<6->{\alert{Even more} of all those $\nnearrow$} \\
    \end{tabular}
    \let\litem\undefined
    }
    \begin{center}
        % `~' is a non-breaking space in LaTeX, not a printed tilde: use \sim
        % to typeset "approximately". \leadsto is wrapped in math mode, as in
        % the next frame.
        \visible<6->{\textbf{$\leadsto$ yields \alert{$\sim$3500} benchmarks}}
    \end{center}
\end{frame}

% Baseline definition: per-basic-block predictions are weighted by occurrence
% counts and summed to obtain a prediction for the full kernel.
\begin{frame}{In-context baseline: lifting predictions}
    \begin{center}
        \textbf{Consider instead $\kerK$ = \alert{full kernel}, with its context\\
        $\leadsto$ \alert{multiple} basic blocks}
    \end{center}
    \pause
    \begin{itemize}
        \item Measure total kernel time \textbf{in context}
        \item Instrument full kernel $\kerK$: for each basic block, $\operatorname{occur}(\text{bb})$
        \item For each tool
            \begin{itemize}
                \item for each bb, $\operatorname{prediction}(\text{bb})$
                \item \emph{lift} predictions:
                    \[
                        \operatorname{prediction}(\kerK) =
                        \sum_{\text{bb} \in \kerK}
                        \operatorname{occur}(\text{bb}) \times \operatorname{prediction}(\text{bb})
                    \]
            \end{itemize}
    \end{itemize}
    \vfill
    \pause
    \begin{center}
        \textbf{Now we have a baseline.}
    \end{center}
\end{frame}

% Overview figure, pulled in from overview.tex, in a single column that is
% 8pt narrower than the paper width.
\begin{frame}
    \vspace{0.5cm}
    \begin{columns}
        \column{\dimexpr\paperwidth-8pt}
        \centering
        \input{overview.tex}
    \end{columns}
\end{frame}

% Overall results: boxplot in the middle, remarks on the right; the left
% minipage only holds a non-breaking space and acts as a spacer.
\begin{frame}{First results (Intel Skylake on Grid5000)}
    \begin{columns}
        \column{\dimexpr\paperwidth-8pt}
        \centering
        \begin{minipage}[c]{0.27\textwidth}
            ~
        \end{minipage}
        \hfill
        \begin{minipage}[c]{0.4\textwidth}
            \centering
            % NOTE(review): graphicx has no native SVG reader; presumably a
            % conversion rule or pre-converted file is set up elsewhere -- confirm.
            \includegraphics[width=\textwidth]{overall_analysis_boxplot.svg}\\
        \end{minipage}
        \hfill
        \begin{minipage}[c]{0.27\textwidth}
            \centering
            % `>' is typeset in math mode: in text mode under the OT1 encoding
            % it prints an inverted question mark.
            {\small\textit{Outliers $>$ 250\,\% trimmed}} \\
            \vspace{2em}
            {\small\textit{Associated table in\\ supplementary material}}
        \end{minipage}
        \begin{center}
            \textbf{\alert{Severely worse} than previous evaluations!}\\
            \textbf{\hspace{0.7cm}Harness broken?\hfill{}Harder benchmarks?\hfill{}Previously undetected weaknesses?\hspace{0.7cm}~}
        \end{center}
    \end{columns}
\end{frame}

% Diagnosis slide: contrasts a reduction kernel with a map kernel.
% [fragile] frame: keep \end{frame} alone on its line, unindented.
\begin{frame}[fragile]{Searching for areas of improvement}
    \begin{itemize}
        \item{} Tools often wrong on the \emph{same} rows
            \begin{itemize}
                \item \llvmmca{}, \iaca{} and \uica{} share 80\,\% of their worst 30\,\%
            \end{itemize}
        \item{} Often \texttt{-O1} rows
    \end{itemize}
    \begin{center}
        \textbf{Crucial difference:}
    \end{center}
    % Colour helpers used from inside the listings below, between the
    % (§ ... §) delimiters.
    \newcommand{\lsthlA}[1]{\texttt{\color[HTML]{df018a}#1}}
    \newcommand{\lsthlB}[1]{\texttt{\color[HTML]{d88900}#1}}
    \begin{minipage}[t]{0.47\textwidth}
        \begin{center}
            \textbf{{\color{red}Bad}\onslide<2->{: reduction}}
        \end{center}
        \vspace{-1em}
        % NOTE(review): line breaks inside this listing were reconstructed
        % from a collapsed source -- confirm the intended layout.
\begin{lstlisting}[language={[ANSI]C}]
for((§\lsthlA{c3}§))
    tmp[(§\lsthlB{c1}§)] += A[c1][c3] * x[c3];
\end{lstlisting}
    \end{minipage}
    \hfill\vrule\hfill
    \begin{minipage}[t]{0.47\textwidth}
        \begin{center}
            \textbf{{\color[HTML]{008f0c}Good}\onslide<2->{: map}}
        \end{center}
        \vspace{-1em}
        % NOTE(review): line breaks inside this listing were reconstructed
        % from a collapsed source -- confirm the intended layout.
\begin{lstlisting}[language={[ANSI]C}]
for((§\lsthlA{c3}§))
    A[c1][(§\lsthlA{c3}§)] += u1[c1] * v1[c3]
        + u2[c1] * v2[c3];
\end{lstlisting}
    \end{minipage}
    \begin{center}
        \onslide<3->{\alert{\textbf{Dependencies through memory!}}}
    \end{center}
\end{frame}

% Results once memory-carried dependencies are pruned; same three-minipage
% layout as the "First results" frame, with a wider plot.
\begin{frame}{Pruning memory-carried dependencies (Intel Skylake on Grid5000)}
    \begin{columns}
        \column{\dimexpr\paperwidth-8pt}
        \centering
        \begin{minipage}[c]{0.24\textwidth}
            ~
        \end{minipage}
        \hfill
        \begin{minipage}[c]{0.5\textwidth}
            \centering
            % NOTE(review): graphicx has no native SVG reader; presumably a
            % conversion rule or pre-converted file is set up elsewhere -- confirm.
            \includegraphics[width=\textwidth]{nomemdeps_boxplot.svg}\\
        \end{minipage}
        \hfill
        \begin{minipage}[c]{0.24\textwidth}
            \centering{}
            % `>' is typeset in math mode so it prints correctly under OT1.
            {\small\textit{Outliers $>$ 200\,\% \\ trimmed}}
        \end{minipage}
    \end{columns}
    \begin{center}
        \textbf{\alert{Closer to expected results}}
    \end{center}
\end{frame}