Start writeup of defs; very tentative remark/proof

2024-01-06 23:53:31 +01:00 · 2024-01-06 23:53:31 +01:00 · d25a1476ae
commit d25a1476ae
parent 61252c8450
3 changed files with 66 additions and 7 deletions
--- a/manuscrit/00_opening/90_notations.tex
+++ b/manuscrit/00_opening/90_notations.tex
@ -10,7 +10,7 @@ Throughout this whole document, the following notations are used.
        \midrule
        $\cyc{\kerK}$ &
            Reciprocal throughput of $\kerK$, in cycles per occurrence of $\kerK$.
-                      & \qtodo{ref} \\
+                      & §\ref{def:cyc_kerK} \\
        $\cycB{\kerK}$ &
            Reciprocal throughput of $\kerK$ if it was only limited by the
            CPU's backend.
@ -19,6 +19,9 @@ Throughout this whole document, the following notations are used.
            Reciprocal throughput of $\kerK$ if it was only limited by the
            CPU's frontend.
                      & \qtodo{ref} \\
+        $\kerK^n$ &
+            $\kerK$ repeated $n$ times.
+                      & §\ref{not:kerK_N} \\
        $\mucount{}i$ &
            Number of \uops{} the instruction $i$ is decoded into. This can be
            extended to a kernel: $\mucount{}\kerK$.
--- a/manuscrit/20_foundations/20_code_analyzers.tex
+++ b/manuscrit/20_foundations/20_code_analyzers.tex
@ -174,17 +174,24 @@ $\cyc{\kerK}$ as the \emph{reciprocal throughput} of $\kerK$, that is, how many
 cycles $\kerK$ will require to complete its execution in steady-state. We
 define this notion here more formally.

-\begin{definition}[Reciprocal throughput of a kernel]
+\begin{notation}[$\kerK^n$]\label{not:kerK_N}
+    Given a kernel $\kerK$ and a positive integer $n \in \nat^*$, we denote by
+    $\kerK^n$ the kernel $\kerK$ repeated $n$ times, that is, the instructions
+    of $\kerK$ concatenated $n$ times.
+\end{notation}
+
+\begin{definition}[Reciprocal throughput of a kernel]\label{def:cyc_kerK}
    The \emph{reciprocal throughput} of a kernel $\kerK$, noted $\cyc{\kerK}$
    and measured in \emph{cycles per iteration}, is also called the
    steady-state execution time of a kernel.

-    It is defined as the
-    number of cycles, \emph{in steady-state}, from the moment the first
-    instruction of the kernel starts to be decoded to the moment the last
-    instruction of the kernel is issued.
+    Let us denote by $C(\kerK)$ the number of cycles, \emph{in steady-state}, from the
+    moment the first instruction of $\kerK$ starts to be decoded to the
+    moment the last instruction of $\kerK$ is issued.

-    This number may not be an integer if, \eg{}, \todo{}
+    We then define \[
+        \cyc{\kerK} = \min_{n \in \nat^*} \left( \dfrac{C(\kerK^n)}{n} \right).
+    \]
 \end{definition}

 Due to the pipelined nature of execution units, this means that the same
@ -192,12 +199,56 @@ instruction of each iteration of $\kerK$ will be retired ---~\ie{} yield its
 result~--- every steady-state execution time. For this reason, the execution
 time is measured until the last instruction is issued, not retired.

+We define this as the minimum over concatenated kernels because subsequent
+kernel iterations may ``share'' a cycle.
+
+\begin{example}
+    Let $\kerK$ be a kernel of three instructions, and assume that a given processor can only
+issue two instructions per cycle, but has no other bottleneck for $\kerK$.
+Then, $C(\kerK) = 2$, as three
+instructions cannot be issued in a single cycle; yet $C(\kerK^2) = 3$, as six
+instructions can be issued in only three cycles. Thus, in this case,
+$\cyc{\kerK} = 1.5$.
+\end{example}
+
+\begin{remark}
+    Although we define $\cyc{\kerK}$ as the minimum over $\nat^*$, this minimum
+    is always reached after aggregating a bounded number of repetitions of $\kerK$.
+
+    Indeed, as the number of resources that can be shared between instructions
+    in a processor is finite (and relatively small, usually on the order of
+    magnitude of 10), and the number of possible states of each is also finite
+    (and also small), the total number of possible states of a processor at the
+    end of a kernel iteration cannot be higher than the product of those
+    numbers of states ---~and is usually way smaller, given that only a portion
+    of those resources is used by a kernel.
+
+    Thus, by the pigeonhole principle, and as each state depends only on the
+    previous one, the sequence of states reached at the end of each iteration
+    of $\kerK$ is periodic, of some period $p \in \nat^*$. Take $r_0$ realizing
+    $\min_{0 < r \leq p}\left(\sfrac{C(\kerK^r)}{r}\right)$. As we are by hypothesis in
+    steady-state already, we have for any $n \in \nat^*$ such that $n = kp+r$,
+    $0 < r \leq p$, $k, r \in \nat$,
+    \begin{align*}
+        C(\kerK^n) &= k \cdot C(\kerK^p) + C(\kerK^r) \\
+            &\geq k p \cdot \dfrac{C(\kerK^{r_0})}{r_0}
+                + r \cdot \dfrac{C(\kerK^{r_0})}{r_0} \\
+            &= n \cdot \dfrac{C(\kerK^{r_0})}{r_0} \\
+        \implies \dfrac{C(\kerK^n)}{n} &\geq \dfrac{C(\kerK^{r_0})}{r_0}
+    \end{align*}
+    \todo{}
+\end{remark}
+
+\medskip
+
 Throughout this manuscript, we mostly use reciprocal throughput as a metric, as
 we find it more relevant from an optimisation point of view ---~an opinion we
 detail in \autoref{chap:CesASMe}. However, the
 \emph{throughput} of a kernel is most widely used in the literature in its
 stead.

+\medskip
+
 \begin{definition}[Throughput of a kernel]
    The \emph{throughput} of a kernel $\kerK$, measured in \emph{instructions
    per cycle}, or IPC, is defined as the number of instructions in $\kerK$, divided
--- a/manuscrit/include/leftrules.sty
+++ b/manuscrit/include/leftrules.sty
@ -39,3 +39,8 @@
 \surroundwithmdframed[linewidth=1.5pt,
 	linecolor=LimeGreen,
 	bottomline=false,topline=false,rightline=false]{example}
+
+\newenvironment{remark}[1][]{\linedenvTop{Remark}{#1}\vspace{-0.8em}}{\linedenvBot}
+\surroundwithmdframed[linewidth=1.5pt,
+	linecolor=ForestGreen,
+	bottomline=false,topline=false,rightline=false]{remark}