From 175cb8cef32f7aea8e4f09b631ba73302d7b4a05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?= Date: Thu, 21 Sep 2023 17:48:47 +0200 Subject: [PATCH] A72: nocross model: start writeup --- .../40_A72-frontend/30_manual_frontend.tex | 44 ++++ .../timeline_front_cross-uncrossed.svg | 109 +++++++++ .../40_A72-frontend/timeline_front_cross.svg | 126 ++++++++++ .../timeline_front_nocross.svg | 225 ++++++++++++++++++ 4 files changed, 504 insertions(+) create mode 100644 manuscrit/assets/imgs/40_A72-frontend/timeline_front_cross-uncrossed.svg create mode 100644 manuscrit/assets/imgs/40_A72-frontend/timeline_front_cross.svg create mode 100644 manuscrit/assets/imgs/40_A72-frontend/timeline_front_nocross.svg diff --git a/manuscrit/40_A72-frontend/30_manual_frontend.tex b/manuscrit/40_A72-frontend/30_manual_frontend.tex index 4ac8ffc..ba56ccc 100644 --- a/manuscrit/40_A72-frontend/30_manual_frontend.tex +++ b/manuscrit/40_A72-frontend/30_manual_frontend.tex @@ -180,3 +180,47 @@ model mapping each supported instruction of the ISA to its \uop{} count. which is consistent. We conclude that $\mucount i = 3\cyc{\kerK_3} = 4-2 = 2$. \end{example} + + +\subsection{Bubbles in the pipeline} + +The frontend, however, does not always exhibit a purely linear behaviour. We +consider for instance the kernel $\kerK =$ \lstarmasm{ADDV_FD_H_VN_V_8H} $+ +3\times\basic{Int01}$; for the rest of this chapter, we refer to +\lstarmasm{ADDV_FD_H_VN_V_8H} as simply \lstarmasm{ADDV} when not stated +otherwise. + +Backend-wise, \texttt{ADDV} fully loads \texttt{FP1} and \texttt{FP01}, while +$\basic{Int01}$ half-loads \texttt{Int01}. The port most loaded by $\kerK$ is +thus \texttt{Int01}, with a load of $1\,\sfrac{1}{2}$. We then expect +$\cycB{\kerK} = 1\,\sfrac{1}{2}$. + +Frontend-wise, \texttt{ADDV} decomposes into two \uops{}, while $\basic{Int01}$ +decomposes into a single \uop{}; thus, $\mucount{}\kerK = 5$. We then expect +$\cycF{\kerK} = 1\,\sfrac{2}{3}$. 
+ +As the frontend dominates the backend, we expect $\cyc{\kerK} = \cycF{\kerK} = +1\,\sfrac{2}{3}$. However, in reality, we measure $\cyc{\kerK} = 2.01 \simeq 2$ +cycles. + +\medskip{} + +From then on, we strive to find a model that could reliably predict, given a +kernel, how many cycles it requires to execute, frontend-wise, in a +steady state. + +\subsubsection{No-cross model} + +\begin{figure} + \centering + \includegraphics[width=0.7\linewidth]{timeline_front_nocross.svg} +\end{figure} + + +On the x86-64 architectures they analyzed, \uica{}'s authors find that the +CPU's predecoder might cause an instruction's \uops{} to be postponed to the +next cycle if it is pre-decoded across a cycle boundary~\cite{uica} (§4.1). + +We hypothesize that the same kind of effect could postpone an instruction's +\uops{} until the next cycle if its \uops{} would cross a cycle boundary +otherwise, as illustrated in \qtodo{ref}. diff --git a/manuscrit/assets/imgs/40_A72-frontend/timeline_front_cross-uncrossed.svg b/manuscrit/assets/imgs/40_A72-frontend/timeline_front_cross-uncrossed.svg new file mode 100644 index 0000000..3cb8b8e --- /dev/null +++ b/manuscrit/assets/imgs/40_A72-frontend/timeline_front_cross-uncrossed.svg @@ -0,0 +1,109 @@ + + + + + + + + + + + + + + + + + + Frontend + + + diff --git a/manuscrit/assets/imgs/40_A72-frontend/timeline_front_cross.svg b/manuscrit/assets/imgs/40_A72-frontend/timeline_front_cross.svg new file mode 100644 index 0000000..5e9804a --- /dev/null +++ b/manuscrit/assets/imgs/40_A72-frontend/timeline_front_cross.svg @@ -0,0 +1,126 @@ + + + + + + + + + + + + + + + + + + + + Frontend + + + + + + diff --git a/manuscrit/assets/imgs/40_A72-frontend/timeline_front_nocross.svg b/manuscrit/assets/imgs/40_A72-frontend/timeline_front_nocross.svg new file mode 100644 index 0000000..6c39bab --- /dev/null +++ b/manuscrit/assets/imgs/40_A72-frontend/timeline_front_nocross.svg @@ -0,0 +1,225 @@ + + + + + + + + + + + + + + + + + + + + + + Frontend + + + + + + 
+ + + + + + + + Frontend + + + + + + + +