\section{Foundations}

\begin{frame}{Bird's-eye view of a CPU}
    \centering
    \includegraphics[height=0.94\textheight]{cpu_big_picture.svg}
\end{frame}

\begin{frame}{Possible bottlenecks}
    \begin{columns}
        \begin{column}{0.37\textwidth}
            \begin{center}
                \includegraphics[width=\textwidth]{cpu_big_picture_truncate.svg}
            \end{center}
        \end{column}
        \hfill
        \begin{column}{0.62\textwidth}
            \begin{tightitemize}{0pt}
                \begin{itemize}
                    \item \alert{Frontend:} \uops{} not issued fast enough
                        \bigskip
                    \item \alert{Backend:} saturated execution units
                        \bigskip
                    \item \alert{Dependencies:} computation is stalled waiting for previous results
                \end{itemize}
            \end{tightitemize}
        \end{column}
    \end{columns}
\end{frame}

%\begin{frame}{Dependencies and the ROB}
%    \begin{columns}
%        \begin{column}{0.35\textwidth}
%            \begin{center}
%                \includegraphics[width=\textwidth]{cpu_frontend.svg}
%            \end{center}
%        \end{column}
%        \hfill
%        \begin{column}{0.64\textwidth}
%            \begin{tightitemize}{0pt}
%                \begin{itemize}
%                    \item Dependencies can stall execution
%                    \item Maybe instructions further down can be executed right now?
%                \end{itemize}
%                \begin{center}
%                    \textbf{\alert{$\to$ Out-of-Order CPUs}}
%                \end{center}
%                \begin{itemize}
%                    \item ROB: circular buffer of \uops{}
%                    \item First possible instruction is issued
%                \end{itemize}
%            \end{tightitemize}
%        \end{column}
%    \end{columns}
%\end{frame}

%\begin{frame}{How do we get insights from this complex system?}
%    \textbf{Hardware counters}
%    \begin{itemize}
%        \item Built-in hardware, counters gathered at runtime
%        \item Very accurate
%        \item Available data varies from model to model
%        \item May not even be available at all
%    \end{itemize}
%
%    \textbf{Simulation?}
%    \begin{itemize}
%        \item A modern CPU is \alert{$\sim$\,100e9 transistors}: very complex
%            models!
%        \item Very expensive, even for manufacturers for design validation
%        \item CPU design is an industrial secret $\leadsto$ not available anyway
%        \item \ldots{}\ie{} not feasible.
%    \end{itemize}
%\end{frame}

\begin{frame}{Code analyzers}
    \begin{itemize}
        \item Tools that predict the performance of a piece of assembly
        \item Feature microarchitectural models
        \item Most often static analyzers
        \item Predict at least the \emph{reverse-throughput} $\cyc{\kerK}$ of a
            kernel $\kerK$ (cycles per iteration)
        \item May derive further useful metrics, \eg{} bottlenecks, by
            inspecting their model at will
    \end{itemize}
\end{frame}

\begin{frame}{What can be analyzed?}
    Pieces of code referred to as \alert{``microkernels''}:
    \begin{itemize}
        \item body of an (assumed) infinite loop;
        \item in steady-state;
        \item straight-line code (branches assumed not taken);
        \item L1-resident (memory model is out of scope).
    \end{itemize}
    Reasonable hypotheses for the category of codes worth optimizing this way!
\end{frame}

\begin{frame}{Existing code analyzers}
    \begin{itemize}
        \item Intel \alert{\iaca{}}: proprietary, Intel CPUs only. First
            ``good'' code analyzer, now deprecated. Was (is?) widely used.
        \item \alert{\llvmmca{}}: FOSS, production-grade, many
            microarchitectures. Based on data from the \texttt{llvm} compiler.
        \item \alert{\uica{}} and \alert{\uopsinfo{}}: research, good accuracy.
            Intel CPUs.
        \item \alert{\ithemal{}}: machine-learning based.
    \end{itemize}
    \bigskip
    Except Ithemal, \alert{all} are (to some extent) based on manually made models!\\
    \pause{}
    \bigskip{}
    \begin{center}
        \textbf{\alert{Ambition:}} \alert{automated} model generation.
    \end{center}
\end{frame}

\begin{frame}{When I started my PhD\ldots}
    \centering
    %\includegraphics[height=0.9\textheight]{patate_placeholder.jpg}
    \includegraphics[height=0.9\textheight]{sota_potato.svg}
\end{frame}