\section{A frontend model for the Cortex A72}

\begin{frame}{The Cortex A72}
\begin{itemize}
    \item{} Low-power ARM CPU
    \item{} CPU of the Raspberry Pi 4: easily available
    \item{} AArch64, NEON SIMD
        \medskip{}
    \item{} ARM CPUs not usually modeled!
    \item{} Backend modeled by \palmed{}
\end{itemize}
\end{frame}

\begin{frame}
\centering
\includegraphics[width=0.9\textwidth]{A72_pipeline_diagram.svg}
\todo{Dispatch queues}
\end{frame}

\begin{frame}{Manual model}
\begin{itemize}
    \item Goal: manually craft a frontend model
    \item Try to follow methods that can be automated
    \item Propose a parametric model for future work, leaving question
        marks on some sections
\end{itemize}
\end{frame}

\begin{frame}{Proposed parametric model}
\vfill
\centering
\begin{minipage}[t][0.7\textheight][c]{0.53\textwidth}
    \centering
    Globally,
    \vfill
    \includegraphics[width=\textwidth]{parametric_model-frontend.svg}
    \vfill~ % I hate LaTeX
\end{minipage}
\hfill\vrule\hfill
\begin{minipage}[t][0.7\textheight][c]{0.43\textwidth}
    \centering
    For each instruction,
    \vfill
    \includegraphics[width=\textwidth]{parametric_model-insn.svg}
    \vfill~
\end{minipage}
\vfill
In {\color{red}\textbf{red}}, parameters of the model.
\end{frame}

\begin{frame}{Counting \uops{}}
For an instruction $i$, denote \alert{$\mucount{i}$} its number of \uops{}.
\begin{itemize}
    \item{} For $k \in \nat$, construct (if possible) a kernel $\kerK_k$:
        \begin{itemize}
            \item instruction $i$ + $k$ ``simple'' instructions (one \uop)
            \item frontend-bound:
                \[ \cyc{\kerK_k} = \dfrac{k + \mucount{i}}{3} \]
        \end{itemize}
    \item{} For well-chosen $k_0$, we should have
        \[ \cyc{\kerK_{k_0}} + \sfrac{1}{3} = \cyc{\kerK_{k_0+1}} \]
    \item{} Measure to verify
        \bigskip
    \item{} If so,
        \textbf{\[ \mucount{i} = 3 \cyc{\kerK_{k_0}} - k_0 \]}
\end{itemize}
\end{frame}

\begin{frame}{Evaluation: comparison to bare \palmed}
\begin{itemize}
    \item Add a frontend to \palmed{}:
        \[ \cyc{\kerK}_{\text{pred.}} = \max(\texttt{palmed}(\kerK), \texttt{frontend}(\kerK)) \]
    \item Reuse evaluation suite of \palmed{}: SPEC CPU 2017 + Polybench
    \item Compare to \llvmmca{}
\end{itemize}
\end{frame}

\begin{frame}{Results}
\centering
\begin{tabular}{l l c r r r r}
    \toprule
    & & & \multirow{2}{*}{\llvmmca{}} & \multicolumn{3}{c}{\palmed{} with frontend\ldots} \\
    & & & & none & linear & disp.\ queues \\
    \midrule{}
    \multirow{3}{*}{SPEC} & Cov. & (\%) & 100.00 & \na{} & 97.21 & 97.16 \\
    & Err. & (\%) & 9.0 & 20.1 & 6.2 & 4.6 \\
    & $\ktau$ & (1)& 0.83 & 0.88 & 0.91 & 0.93 \\
    \midrule
    \multirow{3}{*}{Polybench} & Cov. & (\%) & 100.00& \na{} & 99.33 & 99.33 \\
    & Err. & (\%) & 13.9 & 12.6 & 8.1 & 8.0 \\
    & $\ktau$ & (1)& 0.47 & 0.82 & 0.88 & 0.90 \\
    \bottomrule
\end{tabular}
\end{frame}

\begin{frame}{Limitations}
\begin{itemize}
    \item Parts of this model were entirely manually solved (\eg{} \# of
        dispatch queues)
    \item Evaluation based on \palmed{} suite: biased
    \item Must be tested on other architectures!
\end{itemize}
\end{frame}