From 124e17ebf2393e229cba7e222535333e3122b509 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?= <contact@tobast.fr> Date: Thu, 14 Sep 2023 18:45:53 +0200 Subject: [PATCH] Palmed: resource mapping --- manuscrit/00_opening/90_notations.tex | 2 +- manuscrit/30_palmed/10_resource_models.tex | 221 ++++++++++++++++++++- manuscrit/include/macros.tex | 2 +- manuscrit/include/packages.tex | 1 + manuscrit/main.tex | 3 + 5 files changed, 220 insertions(+), 9 deletions(-) diff --git a/manuscrit/00_opening/90_notations.tex b/manuscrit/00_opening/90_notations.tex index f8ef2ec..066b266 100644 --- a/manuscrit/00_opening/90_notations.tex +++ b/manuscrit/00_opening/90_notations.tex @@ -9,7 +9,7 @@ Throughout this whole document, the following notations are used. \textbf{Notation} & \textbf{Meaning} & \textbf{(See also)} \\ \midrule $\cyc{\kerK}$ & - Throughput of $\kerK$, in cycles per repetition of $\kerK$, in steady state + Reciprocal throughput of $\kerK$, in cycles per occurrence of $\kerK$. & \qtodo{ref} \\ \bottomrule \end{tabular} diff --git a/manuscrit/30_palmed/10_resource_models.tex b/manuscrit/30_palmed/10_resource_models.tex index 48b1960..2e128c1 100644 --- a/manuscrit/30_palmed/10_resource_models.tex +++ b/manuscrit/30_palmed/10_resource_models.tex @@ -1,19 +1,226 @@ \section{Resource models} +\subsection{Usual representation: tripartite disjunctive graph} + As we saw earlier in \qtodo{ref}, the behaviour of a CPU's backend can be, throughput-wise, characterized by the behaviour of its ports. Thus, a throughput model of the backend consists in a mapping of the ISA's instructions to execution ports of the backend, called a \emph{port mapping}. The mapping, however, is not direct: we also saw in \qtodo{ref} that -instructions are themselves broken down into a number of \uops{}, which all -have to be executed. Each of those \uops{} are then scheduled on one of the -compatible execution ports of the CPU. 
A port mapping, thus, is actually a -tripartite graph: a first layer mapping instructions to \uops{}, followed by a -second layer mapping \uops{} to ports. In -\autoref{fig:port_mapping_excerpt_skx_tri}, +instructions are themselves broken down into a number of micro-operations +(\uops{}), which all have to be executed. Each of those \uops{} is then +scheduled on one of the compatible execution ports of the CPU\@. A port +mapping, thus, is actually a tripartite graph: a first layer mapping +instructions to \uops{}, followed by a second layer mapping \uops{} to ports. +In \autoref{fig:sample_port_mapping}, we show such a port mapping for a few +x86-64 instructions on the SKL-SP microarchitecture. The \uopsinfo{} +framework~\cite{uopsinfo}, for instance, produces such a model: each +instruction's mapping is described as a string, \eg{} +\texttt{VCVTT}\footnote{The precise variant is \texttt{VCVTTSD2SI (R32, XMM)}} +is described as \texttt{1*p0+1*p01}. + \begin{figure} \centering - \includegraphics[width=\textwidth]{p016_tri.svg} + \includegraphics[width=0.65\textwidth]{p016_tri.svg} + \caption{\label{fig:sample_port_mapping}Port mapping and maximum port throughput + for a few SKL-SP instructions.} \end{figure} + +We also saw that on modern CPUs, ports and computation units are most of the +time fully-pipelined; that is, each port can execute a \uop{} each cycle, even +though actually executing a \uop{} may take multiple cycles. Thus, instruction +latencies are not needed to compute the throughput of a kernel without +dependencies in steady-state, and a port mapping is sufficient. + +As some \uops{} are compatible with multiple ports, the number of cycles +required to run one occurrence of a kernel is not trivial. An assignment, for a +given kernel, of its constitutive \uops{} to ports, is a \emph{schedule} +---~the number of cycles taken by a kernel with a fixed schedule is +well-defined. 
The throughput of a kernel is defined as the throughput under an +optimal schedule for this kernel. + +\begin{example}[Kernel throughputs with port mappings] + The kernel $\kerK_1 = \texttt{DIVPS} + \texttt{BSR} + \texttt{JMP}$ can + complete in one cycle: $\cyc{\kerK_1} = 1$. Indeed, according to the port + mapping in \autoref{fig:sample_port_mapping}, each of those + instructions is decoded into a single \uop{}, each compatible with a + single, distinct port. Thus, the three instructions can be issued in + parallel in one cycle. + + The same goes for $\kerK_2 = \texttt{ADDSS} + \texttt{BSR}$, although it is + a bit less trivial. Both instructions decode to a single \uop{}. + \texttt{BSR} can only be executed by port $p_1$, while \texttt{ADDSS} can + be executed either by port $p_0$ or $p_1$: by picking $p_0$, both + instructions can be executed in a single cycle in steady state, hence + $\cyc{\kerK_2} = 1$. + + The kernel $\kerK_3 = \texttt{ADDSS} + 2\times\texttt{BSR}$, however, needs + at least two cycles to be executed: \texttt{BSR} can only be executed on + port $p_1$, which can execute at most one \uop{} per cycle. $\cyc{\kerK_3} = + 2$. + + The instruction \texttt{ADDSS} alone, however, can be executed twice per + cycle: once on $p_0$ and once on $p_1$. The kernel $\kerK_4 = + 2\times\texttt{ADDSS} + \texttt{BSR}$ can thus be executed in 1.5 cycles on + average: $\cyc{\kerK_4} = 1.5$. + + \medskip + + The following tables present an optimal schedule for each kernel + $\kerK_2, \kerK_3, \kerK_4$. Each row represents a cycle. 
+ + \begin{minipage}[t]{0.3\textwidth} + \centering + $\kerK_2$ + \smallskip + + \begin{tabular}{c c} + \toprule + $p_0$ & $p_1$ \\ + \midrule + \texttt{ADDSS} & \texttt{BSR} \\ + \texttt{ADDSS} & \texttt{BSR} \\ + \multicolumn{2}{c}{$\vdots$}\\ + \bottomrule + \end{tabular} + \end{minipage}\hfill\begin{minipage}[t]{0.3\textwidth} + \centering + $\kerK_3$ + \smallskip + + \begin{tabular}{c c} + \toprule + $p_0$ & $p_1$ \\ + \midrule + \texttt{ADDSS} & \texttt{BSR} \\ + $\emptyset$ & \texttt{BSR} \\ + \texttt{ADDSS} & \texttt{BSR} \\ + $\emptyset$ & \texttt{BSR} \\ + \multicolumn{2}{c}{$\vdots$}\\ + \bottomrule + \end{tabular} + \end{minipage}\hfill\begin{minipage}[t]{0.3\textwidth} + \centering + $\kerK_4$ + \smallskip + + \begin{tabular}{c c} + \toprule + $p_0$ & $p_1$ \\ + \midrule + \texttt{ADDSS} & \texttt{BSR} \\ + \texttt{ADDSS} & \texttt{BSR} \\ + \texttt{ADDSS} & \texttt{ADDSS} \\ + \multicolumn{2}{c}{$\vdots$}\\ + \bottomrule + \end{tabular} + \end{minipage} +\end{example} + +Finding the throughput of the kernels presented above is easy enough, as the +kernels involve few \uops{} compatible with many ports. 
However, in the general +case, finding an optimal schedule becomes more complicated; in fact, it can be +expressed as a flow problem.\todo{refnec?} + +\subsection{Dual representation: conjunctive resource mapping} + +\begin{figure}[b] + \centering + + \begin{subfigure}[b]{0.65\textwidth}\centering + \includegraphics[width=\textwidth]{p016_bi.svg} + \caption{Full resource mapping}\label{fig:sample_resource_mapping} + \end{subfigure}\hfill + \begin{subfigure}[b]{0.30\textwidth}\centering + \includegraphics[width=0.9\textwidth]{p016_norm.svg} + \caption{Normalized}\label{fig:norm_sample_resource_mapping} + \end{subfigure} + \caption{Abstract resource mapping + (conjunctive form) and maximum resource throughput for a few SKL-SP +instructions.} +\end{figure} + +The method behind Palmed is based on the observation that a port mapping admits +a dual representation, where the bottom layer is not expressed as an ``or'', +but instead as an ``and''. + +In this dual model, an instruction such as \texttt{ADDSS} does not use +\emph{either} $p_0$ or $p_1$, but instead uses once the combined resource +$r_{01}$, which has a throughput of 2. Instructions such as \texttt{BSR}, using +only $p_1$, are using \emph{both} $r_1$ and $r_{01}$. In +\autoref{fig:sample_resource_mapping}, we present the resource mapping +equivalent to the port mapping presented in \autoref{fig:sample_port_mapping}. +We then normalize this graph to resources with a unitary throughput by dividing +each edge's weight by its corresponding resource throughput. The normalized +mapping for \texttt{ADDSS} and \texttt{BSR} is presented in +\autoref{fig:norm_sample_resource_mapping}. + +The construction of this dual model, and its equivalence to the original, +disjunctive model is detailed in the extended version of the full article on +Palmed~\cite{palmed}. + +Finding the throughput of a kernel with this conjunctive representation does not +require the solving of an optimisation problem. 
The number of cycles required +by a kernel is simply the maximum load over all resources. + +\begin{example} + The throughputs of the previous kernels can be computed using the + conjunctive resource model instead. + + \begin{minipage}[t]{0.3\textwidth} + \centering + $\kerK_2$ + \smallskip + + \begin{tabular}{l r r r} + \toprule + & $r_0$ & $r_1$ & $r_{01}$ \\ + \midrule + \texttt{ADDSS} & & & $\sfrac{1}{2}$ \\ + \texttt{BSR} & & 1 & $\sfrac{1}{2}$ \\ + \midrule + Total & 0 & 1 & 1 \\ + \bottomrule + \end{tabular} + \end{minipage}\hfill\begin{minipage}[t]{0.3\textwidth} + \centering + $\kerK_3$ + \smallskip + + \begin{tabular}{l r r r} + \toprule + & $r_0$ & $r_1$ & $r_{01}$ \\ + \midrule + \texttt{ADDSS} & & & $\sfrac{1}{2}$ \\ + $2\times$\texttt{BSR} & & 2 & 1 \\ + \midrule + Total & 0 & 2 & 1.5 \\ + \bottomrule + \end{tabular} + \end{minipage}\hfill\begin{minipage}[t]{0.3\textwidth} + \centering + $\kerK_4$ + \smallskip + + \begin{tabular}{l r r r} + \toprule + & $r_0$ & $r_1$ & $r_{01}$ \\ + \midrule + $2\times$\texttt{ADDSS} & & & 1 \\ + \texttt{BSR} & & 1 & $\sfrac{1}{2}$ \\ + \midrule + Total & 0 & 1 & 1.5 \\ + \bottomrule + \end{tabular} + \end{minipage} +\end{example} + +The drawback of this conjunctive model, however, is that it generates a +theoretically combinatorial number of new resources. This, however, does not happen +in practice: a combined resource is only necessary if at least one \uop{} +is supported by this set of combined ports. On real processors, ports are not +random, but instead have a well-defined set of functions, \eg{} arithmetics, +memory access, etc. Thus, only a very limited number of combined resources are +necessary. 
diff --git a/manuscrit/include/macros.tex b/manuscrit/include/macros.tex index 55e4557..2c31232 100644 --- a/manuscrit/include/macros.tex +++ b/manuscrit/include/macros.tex @@ -1,4 +1,4 @@ -\newcommand{\uop}{micro-operation} +\newcommand{\uop}{$\mu$OP} \newcommand{\uops}{\uop{}s} \newcommand{\eg}{\textit{eg.}} diff --git a/manuscrit/include/packages.tex b/manuscrit/include/packages.tex index 87a1c87..833755f 100644 --- a/manuscrit/include/packages.tex +++ b/manuscrit/include/packages.tex @@ -7,6 +7,7 @@ \usepackage{amsmath} \usepackage{amsfonts} \usepackage{amsthm} +\usepackage{xfrac} \usepackage{csquotes} \usepackage[dvipsnames]{xcolor} \usepackage{makecell} diff --git a/manuscrit/main.tex b/manuscrit/main.tex index dca126c..d68208b 100644 --- a/manuscrit/main.tex +++ b/manuscrit/main.tex @@ -20,4 +20,7 @@ \importchapter{60_staticdeps} \importchapter{99_conclusion} +\printbibliography{} +\addcontentsline{toc}{chapter}{Bibliography} + \end{document}