Notations: introduce references

2024-03-28 16:11:56 +01:00 · 2024-03-28 16:11:56 +01:00 · 5914a5a165
commit 5914a5a165
parent d3fe719105
4 changed files with 49 additions and 32 deletions
--- a/manuscrit/00_opening/90_notations.tex
+++ b/manuscrit/00_opening/90_notations.tex
@@ -1,31 +1,47 @@
 \chapter*{Notations}
 \addcontentsline{toc}{chapter}{Notations}

-Throughout this whole document, the following notations are used.
+Throughout this whole document, the following non-standard notations are used.

 \begin{center}
    \begin{tabular}{c p{0.65\textwidth} p{0.15\textwidth}}
        \toprule
        \textbf{Notation} & \textbf{Meaning} & \textbf{(See also)} \\
        \midrule
-        $\cyc{\kerK}$ &
-            Reciprocal throughput of $\kerK$, in cycles per occurrence of $\kerK$.
+        $\cyc{\kerK}$
+            & Reciprocal throughput of $\kerK$, in cycles per occurrence of
+              $\kerK$.
            & §\ref{def:cyc_kerK} \\
-        $\cycB{\kerK}$ &
-            Reciprocal throughput of $\kerK$ if it was only limited by the
+        $\cycmes{\kerK}{n}$
+            & Measured reciprocal throughput of $\kerK$, over $n$ iterations of
+            $\kerK$. When there is no ambiguity and $n$ is sufficiently large,
+            we often write $\cyc{\kerK}$ instead.
+            & §\ref{def:cycmes_kerK} \\
+        $\cycB{\kerK}$
+            & Reciprocal throughput of $\kerK$ if it were only limited by the
            CPU's backend.
-                      & \qtodo{ref} \\
-        $\cycF{\kerK}$ &
-            Reciprocal throughput of $\kerK$ if it was only limited by the
+            & §\ref{def:cycB} \\
+        $\cycF{\kerK}$
+            & Reciprocal throughput of $\kerK$ if it were only limited by the
            CPU's frontend.
-                      & \qtodo{ref} \\
-        $\kerK^n$ &
-            $\kerK$ repeated $n$ times.
+            & §\ref{def:cycF} \\
+        $C(\kerK)$
+            & Number of cycles of a kernel $\kerK$.
+            & §\ref{def:ker_cycles} \\
+        $\kerK^n$
+            & $\kerK$ repeated $n$ times.
            & §\ref{not:kerK_N} \\
-        $\mucount{}i$ &
-            Number of \uops{} the instruction $i$ is decoded into. This can be
-            extended to a kernel: $\mucount{}\kerK$.
-                      & \qtodo{ref} \\
+        $\operatorname{IPC}(\kerK)$
+            & Average number of instructions executed per cycle by the kernel
+            $\kerK$, in steady state.
+            & §\ref{def:ipc} \\
+        $\mucount{}i$
+            & Number of \uops{} the instruction $i$ is decoded into. This can
+            be extended to a kernel: $\mucount{}\kerK$.
+            & §\ref{def:mucount} \\
+        $\tau_K$
+            & Kendall's $\tau$ coefficient of correlation.
+            & §\ref{ssec:palmed_eval_metrics}, \cite{kendalltau} \\
        \bottomrule
    \end{tabular}
 \end{center}
--- a/manuscrit/20_foundations/20_code_analyzers.tex
+++ b/manuscrit/20_foundations/20_code_analyzers.tex
@@ -297,7 +297,7 @@ define this notion here more formally.
    of $\kerK$ concatenated $n$ times.
 \end{notation}

-\begin{definition}[$C(\kerK)$]
+\begin{definition}[$C(\kerK)$]\label{def:ker_cycles}
    The \emph{number of cycles} of a kernel $\kerK$ is defined, \emph{in
    steady-state}, as the number of elapsed cycles from the moment the first
    instruction of $\kerK$ starts to be decoded to the moment the last
@@ -474,7 +474,7 @@ stead.

 \medskip

-\begin{definition}[Throughput of a kernel]
+\begin{definition}[Throughput of a kernel]\label{def:ipc}
    The \emph{throughput} of a kernel $\kerK$, measured in \emph{instructions
    per cycle}, or IPC, is defined as the number of instructions in $\kerK$, divided
    by the steady-state execution time of $\kerK$:
@@ -486,7 +486,7 @@ stead.
 In the literature or in analyzers' reports, the throughput of a kernel is often
 referred to as its \emph{IPC} (its unit).

-\begin{notation}[Experimental measure of $\cyc{\kerK}$]
+\begin{notation}[Experimental measure of $\cyc{\kerK}$]\label{def:cycmes_kerK}
    We note $\cycmes{\kerK}{n}$ the experimental measure of $\kerK$, realized
    by:
    \begin{itemize}
--- a/manuscrit/30_palmed/40_palmed_results.tex
+++ b/manuscrit/30_palmed/40_palmed_results.tex
@@ -48,7 +48,7 @@ To evaluate \palmed{}, the same kernel is run:
 The raw results are saved (as a Python \pymodule{pickle} file) for reuse and
 archival.

-\subsection{Metrics extracted}
+\subsection{Metrics extracted}\label{ssec:palmed_eval_metrics}

 As \palmed{} internally works with Instructions Per Cycle (IPC) metrics, and as
 all these tools are also able to provide results in IPC, the most natural
--- a/manuscrit/40_A72-frontend/30_manual_frontend.tex
+++ b/manuscrit/40_A72-frontend/30_manual_frontend.tex
@@ -66,17 +66,18 @@ distinction.
 For each of these ports, we note $\basic{p}$ the basic instruction for
 port \texttt{p}; \eg{}, $\basic{Int01}$ is \lstarmasm{ADC_RD_X_RN_X_RM_X}.

-\paragraph{Counting the micro-ops of an instruction.} There are three main
-sources of bottleneck for a kernel $\kerK$: backend, frontend and dependencies.
-When measuring the execution time with \pipedream{}, we eliminate (as far as
-possible) the dependencies, leaving us with only backend and frontend. We note
-$\cycF{\kerK}$ the execution time of $\kerK$ if it was only limited by its
-frontend, and $\cycB{\kerK}$ the execution time of $\kerK$ if it was only
-limited by its backend. If we consider a kernel $\kerK$ that is simple enough
-to exhibit a purely linear frontend behaviour ---~that is, the frontend's
-throughput is a linear function of the number of \uops{} in the kernel~---, we
-then know that either $\cyc{\kerK} = \cycF{\kerK}$ or $\cyc{\kerK} =
-\cycB{\kerK}$.
+\paragraph{Counting the micro-ops of an
+instruction.}\label{def:cycB}\label{def:cycF}\label{def:mucount} There are
+three main sources of bottleneck for a kernel $\kerK$: backend, frontend and
+dependencies.  When measuring the execution time with \pipedream{}, we
+eliminate (as far as possible) the dependencies, leaving us with only backend
+and frontend. We denote by $\cycF{\kerK}$ the execution time of $\kerK$ if it
+were only limited by its frontend, and by $\cycB{\kerK}$ the execution time of
+$\kerK$ if it were only limited by its backend. If we consider a kernel $\kerK$ that is
+simple enough to exhibit a purely linear frontend behaviour ---~that is, the
+frontend's throughput is a linear function of the number of \uops{} in the
+kernel~---, we then know that either $\cyc{\kerK} = \cycF{\kerK}$ or
+$\cyc{\kerK} = \cycB{\kerK}$.

 For a given instruction $i$ and for a certain $k \in \nat$, we then construct a
 kernel $\kerK_k$