2024-11-19 00:12:26 +01:00
|
|
|
\section{Foundations}
|
2024-11-20 12:54:09 +01:00
|
|
|
|
|
|
|
\begin{frame}{Bird's eye view of a CPU}
|
|
|
|
\centering
|
|
|
|
\includegraphics[height=0.94\textheight]{cpu_big_picture.svg}
|
|
|
|
\end{frame}
|
|
|
|
|
|
|
|
\begin{frame}{Possible bottlenecks}
|
|
|
|
\begin{columns}
|
|
|
|
\begin{column}{0.4\textwidth}
|
|
|
|
\begin{center}
|
|
|
|
\includegraphics[width=\textwidth]{cpu_big_picture_truncate.svg}
|
|
|
|
\end{center}
|
|
|
|
\end{column}
|
|
|
|
\hfill
|
|
|
|
\begin{column}{0.58\textwidth}
|
|
|
|
\begin{tightitemize}{0pt}
|
|
|
|
\begin{itemize}
|
|
|
|
\item \alert{Frontend:} \uops{} issued not fast enough;
|
|
|
|
issuing faster would speed up computation;
|
|
|
|
\bigskip
|
|
|
|
|
|
|
|
\item \alert{Backend:} saturated execution units; adding
|
|
|
|
more units would speed up computation;
|
|
|
|
\bigskip
|
|
|
|
|
|
|
|
\item \alert{Dependencies:} computation is stalled waiting
|
|
|
|
for previous results; removing data dependencies would
|
|
|
|
speed up computation.
|
|
|
|
\end{itemize}
|
|
|
|
\end{tightitemize}
|
|
|
|
\end{column}
|
|
|
|
\end{columns}
|
|
|
|
\end{frame}
|
|
|
|
|
|
|
|
\begin{frame}{Dependencies and the ROB}
|
|
|
|
\begin{itemize}
|
|
|
|
\item Dependencies can stall execution
|
|
|
|
\item Maybe instructions further down can be executed right now?
|
|
|
|
\item ROB: circular buffer of \uops{}
|
|
|
|
\item First possible instruction is issued
|
|
|
|
\end{itemize}
|
|
|
|
\end{frame}
|
|
|
|
|
|
|
|
\begin{frame}{How do we get insights from this complex system?}
|
|
|
|
\textbf{Hardware counters}
|
|
|
|
\begin{itemize}
|
|
|
|
\item Built-in hardware, counters gathered at runtime
|
|
|
|
\item Very accurate
|
|
|
|
\item Available data varies from model to model
|
|
|
|
\item May not even be available at all
|
|
|
|
\end{itemize}
|
|
|
|
|
|
|
|
\textbf{Simulation?}
|
|
|
|
\begin{itemize}
|
|
|
|
\item A modern CPU is \alert{$\sim$\,100e9 transistors}: very complex
|
|
|
|
models!
|
|
|
|
\item Very expensive, even for manufacturers for design validation
|
|
|
|
\item CPU design is an industrial secret $\leadsto$ not available anyway
|
|
|
|
\item \ldots{}\ie{} not feasible.
|
|
|
|
\end{itemize}
|
|
|
|
\end{frame}
|
|
|
|
|
|
|
|
\begin{frame}{Enter code analyzers}
|
|
|
|
\begin{itemize}
|
|
|
|
\item Tools that predict performance of a piece of assembly code on a
|
|
|
|
given CPU
|
|
|
|
\item Feature microarchitectural models
|
|
|
|
\item Most often static analyzers
|
|
|
|
\item May derive further useful metrics, \eg{} bottlenecks, by
|
|
|
|
inspecting their model at will
|
|
|
|
\end{itemize}
|
|
|
|
\end{frame}
|
|
|
|
|
|
|
|
\begin{frame}{What can be analyzed?}
|
|
|
|
Pieces of code referred to as \alert{``microkernels''}:
|
|
|
|
|
|
|
|
\begin{itemize}
|
|
|
|
\item body of an (assumed) infinite loop;
|
|
|
|
\item in steady-state;
|
|
|
|
\item straight-line code (branches assumed not taken);
|
|
|
|
\item L1-resident (memory model is out of scope).
|
|
|
|
\end{itemize}
|
|
|
|
|
|
|
|
Reasonable hypotheses for the category of codes worth optimizing this way!
|
|
|
|
\end{frame}
|
|
|
|
|
|
|
|
\begin{frame}{Existing code analyzers}
|
|
|
|
\begin{itemize}
|
|
|
|
\item Intel \alert{\iaca{}}: proprietary and only compatible with
|
|
|
|
Intel. First ``good'' code analyzer, now deprecated. Was (is?)
|
|
|
|
widely used.
|
|
|
|
\item \alert{\llvmmca{}}: FOSS, production-grade, many
|
|
|
|
microarchitectures. Based on data from the \texttt{llvm} compiler.
|
|
|
|
\item \alert{\uica{}} and \alert{\uopsinfo{}}: research, good accuracy.
|
|
|
|
Intel.
|
|
|
|
\item \alert{\ithemal{}}: machine-learning based. Not so accurate.
|
|
|
|
\item \alert{\gus{}}: instrumentation-based code analyzer (not
|
|
|
|
static) $\leadsto$ slow. Access to more information. Made in the
|
|
|
|
CORSE team.
|
|
|
|
\end{itemize}
|
|
|
|
|
|
|
|
Except Ithemal, \alert{all} are (to some extent) based on manually-made
|
|
|
|
models!
|
|
|
|
\end{frame}
|