% phd-defense/slides/20_foundations/main.tex

\section{Foundations}

\begin{frame}{Bird's eye view of a CPU}
    \centering
    \includegraphics[height=0.94\textheight]{cpu_big_picture.svg}
\end{frame}

\begin{frame}{Possible bottlenecks}
    \begin{columns}
        \begin{column}{0.37\textwidth}
            \begin{center}
                \includegraphics[width=\textwidth]{cpu_big_picture_truncate.svg}
            \end{center}
        \end{column}
        \hfill
        \begin{column}{0.62\textwidth}
            \begin{tightitemize}{0pt}
                \begin{itemize}
                    \item \alert{Frontend:} \uops{} not issued fast enough;
                        issuing faster would speed up computation;
                        \bigskip

                    \item \alert{Backend:} saturated execution units; adding
                        more units would speed up computation;
                        \bigskip

                    \item \alert{Dependencies:} computation is stalled waiting
                        for previous results; removing data dependencies would
                        speed up computation.
                \end{itemize}
            \end{tightitemize}
        \end{column}
    \end{columns}

    \pause{}
    \begin{center}
        \textbf{\alert{These pieces can (mostly) be modeled independently!}}
    \end{center}
\end{frame}

\begin{frame}{Dependencies and the ROB}
    \begin{itemize}
        \item Dependencies can stall execution
        \item Maybe instructions further down can be executed right now?
        \item ROB: circular buffer of \uops{}
        \item First possible instruction is issued
    \end{itemize}
\end{frame}

\begin{frame}{How do we get insights from this complex system?}
    \textbf{Hardware counters}
    \begin{itemize}
        \item Built-in hardware, counters gathered at runtime
        \item Very accurate
        \item Available data varies from model to model
        \item May not even be available at all
    \end{itemize}

    \textbf{Simulation?}
    \begin{itemize}
        \item A modern CPU is \alert{$\sim$\,100~billion transistors}: very
            complex models!
        \item Very expensive, even for manufacturers for design validation
        \item CPU design is an industrial secret $\leadsto$ not available anyway
        \item \ldots{}\ie{} not feasible.
    \end{itemize}
\end{frame}

\begin{frame}{Enter code analyzers}
    \begin{itemize}
        \item Tools that predict performance of a piece of assembly code on a
            given CPU
        \item Feature microarchitectural models
        \item Most often static analyzers
        \item Predict at least the \emph{reverse-throughput} $\cyc{\kerK}$ of a
            kernel $\kerK$ (cycles per iteration)
        \item May derive further useful metrics, \eg{} bottlenecks, by
            inspecting their model at will
    \end{itemize}
\end{frame}

\begin{frame}{What can be analyzed?}
    Pieces of code referred to as \alert{``microkernels''}:

    \begin{itemize}
        \item body of an (assumed) infinite loop;
        \item in steady-state;
        \item straight-line code (branches assumed not taken);
        \item L1-resident (memory model is out of scope).
    \end{itemize}

    Reasonable hypotheses for the category of codes worth optimizing this way!
\end{frame}

\begin{frame}{Existing code analyzers}
    \begin{itemize}
        \item Intel \alert{\iaca{}}: proprietary, Intel CPUs only. First
            ``good'' code analyzer, now deprecated. Was (is?) widely used.
        \item \alert{\llvmmca{}}: FOSS, production-grade, many
            microarchitectures. Based on data from the \texttt{llvm} compiler.
        \item \alert{\uica{}} and \alert{\uopsinfo{}}: research, good accuracy.
            Intel CPUs.
        \item \alert{\ithemal{}}: machine-learning based.
        \item \alert{\gus{}}: instrumentation-based code analyzer (not
            static) $\leadsto$ slow. Access to more information. Made in the
            CORSE team.
    \end{itemize}

    Except for \ithemal{}, \alert{all} are (to some extent) based on
    manually-made models!
\end{frame}

\begin{frame}{When I started my PhD\ldots}
    \centering
    \includegraphics[height=0.9\textheight]{patate_placeholder.jpg}
\end{frame}
Init project 2024-11-19 00:12:26 +01:00			`\section{Foundations}`
Some writeup 2024-11-20 12:54:09 +01:00
			`\begin{frame}{Bird's eye view of a CPU}`
			`\centering`
			`\includegraphics[height=0.94\textheight]{cpu_big_picture.svg}`
			`\end{frame}`

			`\begin{frame}{Possible bottlenecks}`
			`\begin{columns}`
Further writeup 2024-11-20 23:08:08 +01:00			`\begin{column}{0.37\textwidth}`
Some writeup 2024-11-20 12:54:09 +01:00			`\begin{center}`
			`\includegraphics[width=\textwidth]{cpu_big_picture_truncate.svg}`
			`\end{center}`
			`\end{column}`
			`\hfill`
Further writeup 2024-11-20 23:08:08 +01:00			`\begin{column}{0.62\textwidth}`
Some writeup 2024-11-20 12:54:09 +01:00			`\begin{tightitemize}{0pt}`
			`\begin{itemize}`
Further writeup 2024-11-20 23:08:08 +01:00			`\item \alert{Frontend:} \uops{} not issued fast enough;`
Some writeup 2024-11-20 12:54:09 +01:00			`issuing faster would speed up computation;`
			`\bigskip`

			`\item \alert{Backend:} saturated execution units; adding`
			`more units would speed up computation;`
			`\bigskip`

			`\item \alert{Dependencies:} computation is stalled waiting`
			`for previous results; removing data dependencies would`
			`speed up computation.`
			`\end{itemize}`
			`\end{tightitemize}`
			`\end{column}`
			`\end{columns}`
Further writeup 2024-11-20 23:08:08 +01:00
			`\pause{}`
			`\begin{center}`
			`\textbf{\alert{These pieces can (mostly) be modeled independently!}}`
			`\end{center}`
Some writeup 2024-11-20 12:54:09 +01:00			`\end{frame}`

			`\begin{frame}{Dependencies and the ROB}`
			`\begin{itemize}`
			`\item Dependencies can stall execution`
			`\item Maybe instructions further down can be executed right now?`
			`\item ROB: circular buffer of \uops{}`
			`\item First possible instruction is issued`
			`\end{itemize}`
			`\end{frame}`

			`\begin{frame}{How do we get insights from this complex system?}`
			`\textbf{Hardware counters}`
			`\begin{itemize}`
			`\item Built-in hardware, counters gathered at runtime`
			`\item Very accurate`
			`\item Available data varies from model to model`
			`\item May not even be available at all`
			`\end{itemize}`

			`\textbf{Simulation?}`
			`\begin{itemize}`
			`\item A modern CPU is \alert{$\sim$\,100e9 transistors}: very complex`
			`models!`
			`\item Very expensive, even for manufacturers for design validation`
			`\item CPU design is industrial secret $\leadsto$ not available anyway`
			`\item \ldots{}\ie{} not feasible.`
			`\end{itemize}`
			`\end{frame}`

			`\begin{frame}{Enter code analyzers}`
			`\begin{itemize}`
			`\item Tools that predict performance of a piece of assembly code on a`
			`given CPU`
			`\item Features microarchitectural models`
			`\item Most often static analyzers`
Further writeup, start A72 2024-11-22 14:14:32 +01:00			`\item Predict at least the \emph{reverse-throughput} $\cyc{\kerK}$ of a`
			`kernel $\kerK$ (cycles per iteration)`
Some writeup 2024-11-20 12:54:09 +01:00			`\item May derive further useful metrics, \eg{} bottlenecks, by`
			`inspecting their model at will`
			`\end{itemize}`
			`\end{frame}`

			`\begin{frame}{What can be analyzed?}`
			Pieces of code referred as \alert{``microkernels''}:

			`\begin{itemize}`
			`\item body of an (assumed) infinite loop;`
			`\item in steady-state;`
			`\item straight-line code (branches assumed not taken);`
			`\item L1-resident (memory model is out of scope).`
			`\end{itemize}`

			`Reasonable hypotheses for the category of codes worth optimizing this way!`
			`\end{frame}`

			`\begin{frame}{Existing code analyzers}`
			`\begin{itemize}`
Further writeup 2024-11-20 23:08:08 +01:00			`\item Intel \alert{\iaca{}}: proprietary, Intel CPUs only. First`
			``good'' code analyzer, now deprecated. Was (is?) widely used.
Some writeup 2024-11-20 12:54:09 +01:00			`\item \alert{\llvmmca{}}: FOSS, production-grade, many`
			`microarchitectures. Based on data from the \texttt{llvm} compiler.`
			`\item \alert{\uica{}} and \alert{\uopsinfo{}}: research, good accuracy.`
Further writeup 2024-11-20 23:08:08 +01:00			`Intel CPUs.`
			`\item \alert{\ithemal{}}: machine-learning based.`
Some writeup 2024-11-20 12:54:09 +01:00			`\item \alert{\gus{}}: instrumentation-based code analyzer (not`
			`static) $\leadsto$ slow. Access to mode information. Made in the`
			`CORSE team.`
			`\end{itemize}`

			`Except Ithemal, \alert{all} are (to some extent) based on manually-made`
			`models!`
			`\end{frame}`
Further writeup 2024-11-20 23:08:08 +01:00
			`\begin{frame}{When I started my PhD\ldots}`
			`\centering`
			`\includegraphics[height=0.9\textheight]{patate_placeholder.jpg}`
			`\end{frame}`