phd-defense/slides/20_foundations/main.tex
2024-12-01 21:21:08 +01:00

128 lines
4.4 KiB
TeX

% ---------------------------------------------------------------------------
% Section "Foundations": background on CPU microarchitecture and code analyzers.
% ---------------------------------------------------------------------------
\section{Foundations}
% Opening slide: a single centered figure scaled to 94% of the text height.
% NOTE(review): plain graphicx cannot read .svg files directly; presumably the
% build converts them or the preamble handles the extension -- confirm.
\begin{frame}{Bird's eye view of a CPU}
  \centering
  \includegraphics[height=0.94\textheight]{cpu_big_picture.svg}
\end{frame}
% Two-column slide: the truncated CPU diagram on the left, the three
% bottleneck families (frontend, backend, dependencies) on the right.
\begin{frame}{Possible bottlenecks}
  \begin{columns}
    % Left column (37% of the text width): the diagram.
    \begin{column}{0.37\textwidth}
      \begin{center}
        \includegraphics[width=\textwidth]{cpu_big_picture_truncate.svg}
      \end{center}
    \end{column}
    \hfill
    % Right column (62% of the text width): the bullet list.
    % tightitemize and \uops are not defined in this excerpt.
    \begin{column}{0.62\textwidth}
      \begin{tightitemize}{0pt}
        \begin{itemize}
          \item \alert{Frontend:} \uops{} not issued fast enough
          \bigskip
          \item \alert{Backend:} saturated execution units
          \bigskip
          \item \alert{Dependencies:} computation is stalled waiting for previous results
        \end{itemize}
      \end{tightitemize}
    \end{column}
  \end{columns}
\end{frame}
%\begin{frame}{Dependencies and the ROB}
% \begin{columns}
% \begin{column}{0.35\textwidth}
% \begin{center}
% \includegraphics[width=\textwidth]{cpu_frontend.svg}
% \end{center}
% \end{column}
% \hfill
% \begin{column}{0.64\textwidth}
% \begin{tightitemize}{0pt}
% \begin{itemize}
% \item Dependencies can stall execution
% \item Maybe instructions further down can be executed right now?
% \end{itemize}
% \begin{center}
% \textbf{\alert{$\to$ Out-of-Order CPUs}}
% \end{center}
% \begin{itemize}
% \item ROB: circular buffer of \uops{}
% \item First possible instruction is issued
% \end{itemize}
% \end{tightitemize}
% \end{column}
% \end{columns}
%\end{frame}
%\begin{frame}{How do we get insights from this complex system?}
% \textbf{Hardware counters}
% \begin{itemize}
% \item Built-in hardware, counters gathered at runtime
% \item Very accurate
% \item Available data varies from model to model
% \item May not even be available at all
% \end{itemize}
%
% \textbf{Simulation?}
% \begin{itemize}
% \item A modern CPU is \alert{$\sim$\,100e9 transistors}: very complex
% models!
% \item Very expensive, even for manufacturers for design validation
% \item CPU design is an industrial secret $\leadsto$ not available anyway
% \item \ldots{}\ie{} not feasible.
% \end{itemize}
%\end{frame}
% Slide describing code analyzers: what they take as input, what they rely
% on, and what they output.
% \cyc, \kerK and \eg are macros not defined in this excerpt.
\begin{frame}{Code analyzers}
  \begin{itemize}
    \item Tools that predict the performance of a piece of assembly
    \item Feature microarchitectural models
    \item Most often static analyzers
    \item Predict at least the \emph{reverse-throughput} $\cyc{\kerK}$ of a
      kernel $\kerK$ (cycles per iteration)
    \item May derive further useful metrics, \eg{} bottlenecks, by
      inspecting their model at will
  \end{itemize}
\end{frame}
% Slide listing the assumptions made on the analyzed code ("microkernels"):
% loop body, steady state, straight-line, L1-resident.
\begin{frame}{What can be analyzed?}
  Pieces of code referred to as \alert{``microkernels''}:
  \begin{itemize}
    \item body of an (assumed) infinite loop;
    \item in steady-state;
    \item straight-line code (branches assumed not taken);
    \item L1-resident (memory model is out of scope).
  \end{itemize}
  Reasonable hypotheses for the category of codes worth optimizing this way!
\end{frame}
% Slide surveying existing code analyzers, followed by the stated ambition
% (automated model generation), which appears on a second overlay.
% Tool names go through their macros (\iaca, \llvmmca, \uica, \uopsinfo,
% \ithemal), which are not defined in this excerpt.
\begin{frame}{Existing code analyzers}
  \begin{itemize}
    \item Intel \alert{\iaca{}}: proprietary, Intel CPUs only. First
      ``good'' code analyzer, now deprecated. Was (is?) widely used.
    \item \alert{\llvmmca{}}: FOSS, production-grade, many
      microarchitectures. Based on data from the \texttt{llvm} compiler.
    \item \alert{\uica{}} and \alert{\uopsinfo{}}: research, good accuracy.
      Intel CPUs.
    \item \alert{\ithemal{}}: machine-learning based.
  \end{itemize}
  \bigskip
  Except for \ithemal{}, \alert{all} are (to some extent) based on manually made
  models!\\
  % Everything after \pause is placed on the next overlay.
  \pause{}
  \bigskip{}
  \begin{center}
    \textbf{\alert{Ambition:}} \alert{automated} model generation.
  \end{center}
\end{frame}
% Closing slide of this excerpt: a single centered image scaled to 90% of
% the text height.
\begin{frame}{When I started my PhD\ldots}
  \centering
  % Earlier placeholder image, kept commented out:
  %\includegraphics[height=0.9\textheight]{patate_placeholder.jpg}
  \includegraphics[height=0.9\textheight]{sota_potato.svg}
\end{frame}