Notations: introduce references
This commit is contained in:
parent
d3fe719105
commit
5914a5a165
4 changed files with 49 additions and 32 deletions
|
@ -1,31 +1,47 @@
|
|||
\chapter*{Notations}
|
||||
\addcontentsline{toc}{chapter}{Notations}
|
||||
|
||||
Throughout this whole document, the following notations are used.
|
||||
Throughout this whole document, the following non-standard notations are used.
|
||||
|
||||
\begin{center}
|
||||
\begin{tabular}{c p{0.65\textwidth} p{0.15\textwidth}}
|
||||
\toprule
|
||||
\textbf{Notation} & \textbf{Meaning} & \textbf{(See also)} \\
|
||||
\midrule
|
||||
$\cyc{\kerK}$ &
|
||||
Reciprocal throughput of $\kerK$, in cycles per occurrence of $\kerK$.
|
||||
& §\ref{def:cyc_kerK} \\
|
||||
$\cycB{\kerK}$ &
|
||||
Reciprocal throughput of $\kerK$ if it was only limited by the
|
||||
$\cyc{\kerK}$
|
||||
& Reciprocal throughput of $\kerK$, in cycles per occurrence of
|
||||
$\kerK$.
|
||||
& §\ref{def:cyc_kerK} \\
|
||||
$\cycmes{\kerK}{n}$
|
||||
& Measured reciprocal throughput of $\kerK$, over $n$ iterations of
|
||||
$\kerK$. When there is no ambiguity and $n$ is sufficiently large,
|
||||
we often write $\cyc{\kerK}$ instead.
|
||||
& §\ref{def:cycmes_kerK} \\
|
||||
$\cycB{\kerK}$
|
||||
& Reciprocal throughput of $\kerK$ if it was only limited by the
|
||||
CPU's backend.
|
||||
& \qtodo{ref} \\
|
||||
$\cycF{\kerK}$ &
|
||||
Reciprocal throughput of $\kerK$ if it was only limited by the
|
||||
& §\ref{def:cycB} \\
|
||||
$\cycF{\kerK}$
|
||||
& Reciprocal throughput of $\kerK$ if it was only limited by the
|
||||
CPU's frontend.
|
||||
& \qtodo{ref} \\
|
||||
$\kerK^n$ &
|
||||
$\kerK$ repeated $n$ times.
|
||||
& §\ref{not:kerK_N} \\
|
||||
$\mucount{}i$ &
|
||||
Number of \uops{} the instruction $i$ is decoded into. This can be
|
||||
extended to a kernel: $\mucount{}\kerK$.
|
||||
& \qtodo{ref} \\
|
||||
& §\ref{def:cycF} \\
|
||||
$C(\kerK)$
|
||||
& Number of cycles of a kernel $\kerK$.
|
||||
& §\ref{def:ker_cycles} \\
|
||||
$\kerK^n$
|
||||
& $\kerK$ repeated $n$ times.
|
||||
& §\ref{not:kerK_N} \\
|
||||
$\operatorname{IPC}(\kerK)$
|
||||
& Instructions Per Cycle in the execution of the kernel $\kerK$, in
|
||||
steady state, averaged.
|
||||
& §\ref{def:ipc} \\
|
||||
$\mucount{}i$
|
||||
& Number of \uops{} the instruction $i$ is decoded into. This can
|
||||
be extended to a kernel: $\mucount{}\kerK$.
|
||||
& §\ref{def:mucount} \\
|
||||
$\tau_K$
|
||||
& Kendall's $\tau$ coefficient of correlation.
|
||||
& §\ref{ssec:palmed_eval_metrics}, \cite{kendalltau} \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
|
|
|
@ -297,7 +297,7 @@ define this notion here more formally.
|
|||
of $\kerK$ concatenated $n$ times.
|
||||
\end{notation}
|
||||
|
||||
\begin{definition}[$C(\kerK)$]
|
||||
\begin{definition}[$C(\kerK)$]\label{def:ker_cycles}
|
||||
The \emph{number of cycles} of a kernel $\kerK$ is defined, \emph{in
|
||||
steady-state}, as the number of elapsed cycles from the moment the first
|
||||
instruction of $\kerK$ starts to be decoded to the moment the last
|
||||
|
@ -474,7 +474,7 @@ stead.
|
|||
|
||||
\medskip
|
||||
|
||||
\begin{definition}[Throughput of a kernel]
|
||||
\begin{definition}[Throughput of a kernel]\label{def:ipc}
|
||||
The \emph{throughput} of a kernel $\kerK$, measured in \emph{instructions
|
||||
per cycle}, or IPC, is defined as the number of instructions in $\kerK$, divided
|
||||
by the steady-state execution time of $\kerK$:
|
||||
|
@ -486,7 +486,7 @@ stead.
|
|||
In the literature or in analyzers' reports, the throughput of a kernel is often
|
||||
referred to as its \emph{IPC} (its unit).
|
||||
|
||||
\begin{notation}[Experimental measure of $\cyc{\kerK}$]
|
||||
\begin{notation}[Experimental measure of $\cyc{\kerK}$]\label{def:cycmes_kerK}
|
||||
We denote by $\cycmes{\kerK}{n}$ the experimental measure of $\kerK$, realized
|
||||
by:
|
||||
\begin{itemize}
|
||||
|
|
|
@ -48,7 +48,7 @@ To evaluate \palmed{}, the same kernel is run:
|
|||
The raw results are saved (as a Python \pymodule{pickle} file) for reuse and
|
||||
archival.
|
||||
|
||||
\subsection{Metrics extracted}
|
||||
\subsection{Metrics extracted}\label{ssec:palmed_eval_metrics}
|
||||
|
||||
As \palmed{} internally works with Instructions Per Cycle (IPC) metrics, and as
|
||||
all these tools are also able to provide results in IPC, the most natural
|
||||
|
|
|
@ -66,17 +66,18 @@ distinction.
|
|||
For each of these ports, we denote by $\basic{p}$ the basic instruction for
|
||||
port \texttt{p}; \eg{}, $\basic{Int01}$ is \lstarmasm{ADC_RD_X_RN_X_RM_X}.
|
||||
|
||||
\paragraph{Counting the micro-ops of an instruction.} There are three main
|
||||
sources of bottleneck for a kernel $\kerK$: backend, frontend and dependencies.
|
||||
When measuring the execution time with \pipedream{}, we eliminate (as far as
|
||||
possible) the dependencies, leaving us with only backend and frontend. We denote by
|
||||
$\cycF{\kerK}$ the execution time of $\kerK$ if it was only limited by its
|
||||
frontend, and $\cycB{\kerK}$ the execution time of $\kerK$ if it was only
|
||||
limited by its backend. If we consider a kernel $\kerK$ that is simple enough
|
||||
to exhibit a purely linear frontend behaviour ---~that is, the frontend's
|
||||
throughput is a linear function of the number of \uops{} in the kernel~---, we
|
||||
then know that either $\cyc{\kerK} = \cycF{\kerK}$ or $\cyc{\kerK} =
|
||||
\cycB{\kerK}$.
|
||||
\paragraph{Counting the micro-ops of an
|
||||
instruction.}\label{def:cycB}\label{def:cycF}\label{def:mucount} There are
|
||||
three main sources of bottleneck for a kernel $\kerK$: backend, frontend and
|
||||
dependencies. When measuring the execution time with \pipedream{}, we
|
||||
eliminate (as far as possible) the dependencies, leaving us with only backend
|
||||
and frontend. We denote by $\cycF{\kerK}$ the execution time of $\kerK$ if it was
|
||||
only limited by its frontend, and $\cycB{\kerK}$ the execution time of $\kerK$
|
||||
if it was only limited by its backend. If we consider a kernel $\kerK$ that is
|
||||
simple enough to exhibit a purely linear frontend behaviour ---~that is, the
|
||||
frontend's throughput is a linear function of the number of \uops{} in the
|
||||
kernel~---, we then know that either $\cyc{\kerK} = \cycF{\kerK}$ or
|
||||
$\cyc{\kerK} = \cycB{\kerK}$.
|
||||
|
||||
For a given instruction $i$ and for a certain $k \in \nat$, we then construct a
|
||||
kernel $\kerK_k$
|
||||
|
|
Loading…
Reference in a new issue