diff --git a/slides/10_popularization/main.tex b/slides/10_popularization/main.tex index f0badc4..0c029b5 100644 --- a/slides/10_popularization/main.tex +++ b/slides/10_popularization/main.tex @@ -4,7 +4,7 @@ \centering \includegraphics[width=\textwidth]{fugaku.jpg} - \emph{Le supercalculateur \emph{Fugaku} ---~© Riken} + \emph{Le supercalculateur \emph{Fugaku} ---~© RIKEN} \end{frame} \begin{frame}[c]{} @@ -18,3 +18,58 @@ \includegraphics[height=0.9\textheight]{serveur_supermicro.png}\\ \emph{Un serveur ---~© Supermicro} \end{frame} + +\begin{frame} + \centering + %\includegraphics[width=\textwidth]{fugaku.jpg} + + \emph{Un processeur} +\end{frame} + +\begin{frame}{Quelques ordres de grandeur} + + \begin{columns} + \begin{column}{0.32\textwidth} + \begin{block}{Nombres} + \begin{itemize} + \item{} \emph{Fugaku}~: 158\,976 CPUs + \end{itemize} + \end{block} + \end{column}\hfill + \begin{column}{0.32\textwidth} + \begin{block}{Coût} + \begin{itemize} + \item{} un processeur~: $\sim$~100--1\,000\,€ + \item{} un serveur~: $\sim$~10\,000--50\,000\,€ + \item{} \textit{Fugaku}~: 1 milliard~\$ + \end{itemize} + \end{block} + \end{column}\hfill + \begin{column}{0.32\textwidth} + \begin{block}{Consommation} + \begin{itemize} + \item{} un serveur~: $\sim$~500\,W + \item{} \textit{Fugaku}~: 30--40\,MW ($\sim$~5\,\% d'un + réacteur nucléaire) + \end{itemize} + \end{block} + \end{column} + \end{columns} + + \hfill + \begin{center} + \textbf{$\rightarrow$ gagner quelques \% de performance, c'est très + rentable~!} + \end{center} +\end{frame} + +\begin{frame} + \todo{} + \begin{itemize} + \item De quel genre de programme on parle~? (Calcul scientifique + ---~météo, simulation d'océans, …~---, IA, \ldots) + \item Comment gagner de la perf~? 
+ \item Les 3 bottlenecks + \item Les code analyzers + \end{itemize} +\end{frame} diff --git a/slides/20_foundations/main.tex b/slides/20_foundations/main.tex index dc16c09..043a691 100644 --- a/slides/20_foundations/main.tex +++ b/slides/20_foundations/main.tex @@ -1 +1,105 @@ \section{Foundations} + +\begin{frame}{Bird's eye view of a CPU} + \centering + \includegraphics[height=0.94\textheight]{cpu_big_picture.svg} +\end{frame} + +\begin{frame}{Possible bottlenecks} + \begin{columns} + \begin{column}{0.4\textwidth} + \begin{center} + \includegraphics[width=\textwidth]{cpu_big_picture_truncate.svg} + \end{center} + \end{column} + \hfill + \begin{column}{0.58\textwidth} + \begin{tightitemize}{0pt} + \begin{itemize} + \item \alert{Frontend:} \uops{} issued not fast enough; + issuing faster would speed up computation; + \bigskip + + \item \alert{Backend:} saturated execution units; adding + more units would speed up computation; + \bigskip + + \item \alert{Dependencies:} computation is stalled waiting + for previous results; removing data dependencies would + speed up computation. + \end{itemize} + \end{tightitemize} + \end{column} + \end{columns} +\end{frame} + +\begin{frame}{Dependencies and the ROB} + \begin{itemize} + \item Dependencies can stall execution + \item Maybe instructions further down can be executed right now? + \item ROB: circular buffer of \uops{} + \item First possible instruction is issued + \end{itemize} +\end{frame} + +\begin{frame}{How do we get insights from this complex system?} + \textbf{Hardware counters} + \begin{itemize} + \item Built-in hardware, counters gathered at runtime + \item Very accurate + \item Available data varies from model to model + \item May not even be available at all + \end{itemize} + + \textbf{Simulation?} + \begin{itemize} + \item A modern CPU is \alert{$\sim$\,100e9 transistors}: very complex + models! 
+ \item Very expensive, even for manufacturers for design validation + \item CPU design is an industrial secret $\leadsto$ not available anyway + \item \ldots{}\ie{} not feasible. + \end{itemize} +\end{frame} + +\begin{frame}{Enter code analyzers} + \begin{itemize} + \item Tools that predict performance of a piece of assembly code on a + given CPU + \item Feature microarchitectural models + \item Most often static analyzers + \item May derive further useful metrics, \eg{} bottlenecks, by + inspecting their model at will + \end{itemize} +\end{frame} + +\begin{frame}{What can be analyzed?} + Pieces of code referred to as \alert{``microkernels''}: + + \begin{itemize} + \item body of an (assumed) infinite loop; + \item in steady-state; + \item straight-line code (branches assumed not taken); + \item L1-resident (memory model is out of scope). + \end{itemize} + + Reasonable hypotheses for the category of codes worth optimizing this way! +\end{frame} + +\begin{frame}{Existing code analyzers} + \begin{itemize} + \item Intel \alert{\iaca{}}: proprietary and only compatible with + Intel. First ``good'' code analyzer, now deprecated. Was (is?) + widely used. + \item \alert{\llvmmca{}}: FOSS, production-grade, many + microarchitectures. Based on data from the \texttt{llvm} compiler. + \item \alert{\uica{}} and \alert{\uopsinfo{}}: research, good accuracy. + Intel. + \item \alert{\ithemal{}}: machine-learning based. Not so accurate. + \item \alert{\gus{}}: instrumentation-based code analyzer (not + static) $\leadsto$ slow. Access to more information. Made in the + CORSE team. + \end{itemize} + + Except Ithemal, \alert{all} are (to some extent) based on manually-made + models! 
+\end{frame} diff --git a/slides/30_frontend/main.tex b/slides/30_frontend/main.tex new file mode 100644 index 0000000..eb775e1 --- /dev/null +++ b/slides/30_frontend/main.tex @@ -0,0 +1 @@ +\section{A frontend model for the Cortex A72} diff --git a/slides/40_cesasme/main.tex b/slides/40_cesasme/main.tex new file mode 100644 index 0000000..23eef3f --- /dev/null +++ b/slides/40_cesasme/main.tex @@ -0,0 +1 @@ +\section{\cesasme: evaluate and compare state-of-the-art code analyzers} diff --git a/slides/50_staticdeps/main.tex b/slides/50_staticdeps/main.tex new file mode 100644 index 0000000..4171483 --- /dev/null +++ b/slides/50_staticdeps/main.tex @@ -0,0 +1 @@ +\section{\staticdeps: static extraction of memory-carried dependencies} diff --git a/slides/60_a72combined/main.tex b/slides/60_a72combined/main.tex new file mode 100644 index 0000000..1c47dcf --- /dev/null +++ b/slides/60_a72combined/main.tex @@ -0,0 +1 @@ +\section{The \acombined{} model: wrapping it all up} diff --git a/slides/assets/imgs/20_foundations/cpu_big_picture.svg b/slides/assets/imgs/20_foundations/cpu_big_picture.svg new file mode 100644 index 0000000..497a5bc --- /dev/null +++ b/slides/assets/imgs/20_foundations/cpu_big_picture.svg @@ -0,0 +1,964 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FRONTEND + BACKEND + + + + + + + + Decoder + + + + + + Renamer + + + + + + + + + + Instructionflow + + + Reorder buffer (ROB) + + + + Issue + + + + + Port 1 + + + + + + + Port 2 + + + + + + + Port … + + + + + + + Port n + + + + Pipelined executionunits + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + REGISTERFILE + + + + L1CACHE + + + + + + + + RETIREBUFFER + + + + diff --git a/slides/assets/imgs/20_foundations/cpu_big_picture.svg.pdf b/slides/assets/imgs/20_foundations/cpu_big_picture.svg.pdf new file mode 100644 index 0000000..b172a73 Binary files /dev/null and 
b/slides/assets/imgs/20_foundations/cpu_big_picture.svg.pdf differ diff --git a/slides/assets/imgs/20_foundations/cpu_big_picture_truncate.svg b/slides/assets/imgs/20_foundations/cpu_big_picture_truncate.svg new file mode 100644 index 0000000..ffb7fba --- /dev/null +++ b/slides/assets/imgs/20_foundations/cpu_big_picture_truncate.svg @@ -0,0 +1,862 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FRONTEND + BACKEND + + + + + + + + Decoder + + + + + + Renamer + + + + + + + + + + Instructionflow + + + Reorder buffer (ROB) + + + + Issue + + + + + Port 1 + + + + + + + Port 2 + + + + + + + Port … + + + + + + + Port n + + + + Pipelined executionunits + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/slides/assets/imgs/20_foundations/cpu_big_picture_truncate.svg.pdf b/slides/assets/imgs/20_foundations/cpu_big_picture_truncate.svg.pdf new file mode 100644 index 0000000..3830ae9 Binary files /dev/null and b/slides/assets/imgs/20_foundations/cpu_big_picture_truncate.svg.pdf differ diff --git a/slides/include/macros.tex b/slides/include/macros.tex index 7b711d3..77d48f2 100644 --- a/slides/include/macros.tex +++ b/slides/include/macros.tex @@ -1,8 +1,8 @@ \newcommand{\uop}{$\mu$OP} \newcommand{\uops}{\uop{}s} -\newcommand{\eg}{\textit{eg.}} -\newcommand{\ie}{\textit{ie.}} +\newcommand{\eg}{e.g.} +\newcommand{\ie}{i.e.} \newcommand{\wrt}{\textit{wrt.}} \newcommand{\kerK}{\mathcal{K}} diff --git a/slides/include/packages.tex b/slides/include/packages.tex index d0dc2ef..2a2be1c 100644 --- a/slides/include/packages.tex +++ b/slides/include/packages.tex @@ -39,3 +39,12 @@ \subincludefrom{#1}{main.tex} \resetgraphicspath{} } + +% Temporarily set leftmargini +\newlength{\STASHleftmargini} +\newenvironment{tightitemize}[1]{ + \setlength{\STASHleftmargini}{\leftmargini} + \setlength{\leftmargini}{#1} +}{ + \setlength{\leftmargini}{\STASHleftmargini} +} diff 
--git a/slides/main.tex b/slides/main.tex index 7b50b0c..c26c525 100644 --- a/slides/main.tex +++ b/slides/main.tex @@ -21,7 +21,7 @@ Université Grenoble Alpes, inria} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{document} - +\selectlanguage{english} \maketitle{} \begin{frame} @@ -29,7 +29,17 @@ Université Grenoble Alpes, inria} \tableofcontents[hideallsubsections] \end{frame} +\selectlanguage{french} \importchapter{10_popularization} +\selectlanguage{english} \importchapter{20_foundations} +\importchapter{30_frontend} +\importchapter{40_cesasme} +\importchapter{50_staticdeps} +\importchapter{60_a72combined} + +\begin{frame}[standout] + \Large{}Questions? +\end{frame} \end{document}