From b5b0296102e6d2d3d0f451c1effe797131c8912e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?= <contact@tobast.fr> Date: Sun, 1 Sep 2024 16:05:21 +0200 Subject: [PATCH] Proof-read chapter 5 (staticdeps) --- manuscrit/60_staticdeps/10_types_of_deps.tex | 35 ++++++++----- manuscrit/60_staticdeps/20_dynamic.tex | 20 ++++---- .../60_staticdeps/30_static_principle.tex | 50 +++++++++---------- manuscrit/60_staticdeps/35_rob_proof.tex | 26 ++++++---- manuscrit/60_staticdeps/40_staticdeps.tex | 7 +-- manuscrit/60_staticdeps/42_staticdeps_alg.tex | 16 +++--- manuscrit/60_staticdeps/50_eval.tex | 14 +++--- manuscrit/60_staticdeps/99_conclusion.tex | 6 ++- manuscrit/90_wrapping_up/main.tex | 22 ++++---- manuscrit/include/packages.tex | 1 + 10 files changed, 107 insertions(+), 90 deletions(-) diff --git a/manuscrit/60_staticdeps/10_types_of_deps.tex b/manuscrit/60_staticdeps/10_types_of_deps.tex index 713654e..7897b6c 100644 --- a/manuscrit/60_staticdeps/10_types_of_deps.tex +++ b/manuscrit/60_staticdeps/10_types_of_deps.tex @@ -20,7 +20,7 @@ down into four categories: For instance, in the kernel presented in the introduction of this chapter, the first instruction (\lstxasm{add \%rax, \%rbx}) reads its first operand, the -register \reg{rax}, and both reads and write its second operand \reg{rbx}. The +register \reg{rax}, and both reads and writes its second operand \reg{rbx}. The second \lstxasm{add} has the same behaviour. Thus, as \reg{rbx} is written at line 1, and read at line 2, there is a read-after-write dependency between the two. @@ -51,7 +51,7 @@ however, other channels. As we saw in the introduction to this chapter, as well as in the previous chapter, dependencies can also be \emph{memory-carried}, in more or less straightforward ways, such as in the examples from -\autoref{lst:mem_carried_exn}, where the last line always depend on the first. +\autoref{lst:mem_carried_exn}, where the last line always depends on the first. \begin{lstfloat}[h!] 
\begin{minipage}[t]{0.32\linewidth} @@ -67,8 +67,8 @@ add -8(%rbx), %rcx\end{lstlisting} \end{minipage}\hfill \begin{minipage}[t]{0.32\linewidth} \begin{lstlisting}[language={[x86masm]Assembler}] -lea 16(%rbx), %r10 add %rax, (%rbx) +lea 16(%rbx), %r10 add -16(%r10), %rcx\end{lstlisting} \end{minipage}\hfill \caption{Examples of memory-carried dependencies.}\label{lst:mem_carried_exn} @@ -94,27 +94,36 @@ with a large emphasis on memory-carried dependencies. \paragraph{Presence of loops.} The previous examples were all pieces of \emph{straight-line code} in which a dependency arose. However, many dependencies are actually \emph{loop-carried}, such as those in -\autoref{lst:loop_carried_exn}. +\autoref{lst:loop_carried_exn}. In \autoref{lst:loop_carried_exn:sumA}, line 2 +depends on the previous iteration's line 2 as \reg{r10} is read, then written +back. In \autoref{lst:loop_carried_exn:quasiFibo}, line 3 depends on line 2 of +the same iteration; but line 2 also depends on line 3 two iterations ago by +reading \lstxasm{-16(\%rbx, \%r10)}. \begin{lstfloat} -\begin{minipage}[t]{0.48\linewidth} - \begin{lstlisting}[language={[x86masm]Assembler}] -# Compute sum(A), %rax points to A + \centering + + \begin{sublstfloat}[b]{0.48\linewidth} + \begin{lstlisting}[language={[x86masm]Assembler}] loop: add (%rax), %r10 add $8, %rax jmp loop \end{lstlisting} -\end{minipage}\hfill -\begin{minipage}[t]{0.48\linewidth} - \begin{lstlisting}[language={[x86masm]Assembler}] -# Compute B[i] = A[i] + B[i-2] +\caption{Compute the sum of array \lstxasm{A}'s terms in \reg{r10}. 
\reg{rax} points to + \lstxasm{A}.}\label{lst:loop_carried_exn:sumA} + \end{sublstfloat} + \hfill + \begin{sublstfloat}[b]{0.48\linewidth} + \begin{lstlisting}[language={[x86masm]Assembler}] loop: mov -16(%rbx, %r10), (%rbx, %r10) add (%rax, %r10), (%rbx, %r10) add $8, %r10 jmp loop \end{lstlisting} -\end{minipage}\hfill -\caption{Examples of loop-carried dependencies.}\label{lst:loop_carried_exn} + \caption{Compute \lstxasm{B[i] = A[i] + B[i-2]}. \reg{rax} points to + \lstxasm{A}, \reg{rbx} points to \lstxasm{B}.}\label{lst:loop_carried_exn:quasiFibo} + \end{sublstfloat} + \caption{Examples of loop-carried dependencies.}\label{lst:loop_carried_exn} \end{lstfloat} diff --git a/manuscrit/60_staticdeps/20_dynamic.tex b/manuscrit/60_staticdeps/20_dynamic.tex index 2c43e6e..871e69f 100644 --- a/manuscrit/60_staticdeps/20_dynamic.tex +++ b/manuscrit/60_staticdeps/20_dynamic.tex @@ -5,18 +5,20 @@ the actual data dependencies occurring throughout an execution. While such analyzers are often too slow to use in practice, they can be used as a baseline to evaluate static alternatives. -Due to its complexity, \gus{} is, however, ill-suited for the implementation of -a simple experiment if one is not already familiar with its codebase. For this -reason, we implement \depsim{}, a dynamic analysis tool based on top of +As it is a complex tool performing a wide range of analyses, \gus{} is, +however, unnecessarily complex to simply serve as a baseline. For the same +reason, it is also impractically slower than a simple dynamic analysis. For +this reason, we implement \depsim{}, a dynamic analysis tool based on top of \valgrind{}, whose goal is to report dependencies encountered at runtime. \subsection{Valgrind} Most low-level developers and computer scientists know -\valgrind{}~\cite{tool:valgrind} as a memory -analysis tool, reporting bad memory accesses, memory leaks and the like. 
-However, this is only the \texttt{memcheck} tool built into \valgrind{}, while the -whole program is actually a binary instrumentation framework. +\valgrind{}~\cite{tool:valgrind} as a memory analysis tool, reporting bad +memory accesses, memory leaks and the like. However, this is only a small part +of \valgrind{} ---~the \texttt{memcheck} tool. The whole program is actually a +binary instrumentation framework, upon which the famous \texttt{memcheck} is +built. \valgrind{} supports a wide variety of platforms, including x86-64 and ARM. However, at the time of the writing, it supports AVX2, but does not yet @@ -41,7 +43,7 @@ for some assembly code, independently of the Valgrind framework. \subsection{Depsim}\label{ssec:depsim} -The tool we write to extract runtime-gathered dependencies, \depsim{}, is +The tool we wrote to extract runtime-gathered dependencies, \depsim{}, is able to extract dependencies through both registers, memory and temporary variables ---~in its intermediate representation, Valgrind keeps some values assigned to temporary variables in static single-assignment (SSA) form. @@ -92,5 +94,5 @@ avoid excessive instrumentation slowdown. We further annotate every write to the shadow memory with the timestamp at which it occurred. Whenever a dependency should be added, we first check that the dependency has not expired ---~that is, that it is not older than a given -threshold. This threshold is tunable for each run --~and may be set to infinity +threshold. This threshold is tunable for each run ---~and may be set to infinity to keep every dependency. 
diff --git a/manuscrit/60_staticdeps/30_static_principle.tex b/manuscrit/60_staticdeps/30_static_principle.tex index 98c3b81..862cf30 100644 --- a/manuscrit/60_staticdeps/30_static_principle.tex +++ b/manuscrit/60_staticdeps/30_static_principle.tex @@ -1,28 +1,24 @@ \section{Static dependencies detection}\label{ssec:staticdeps_detection} -Depending on the type of dependencies considered, it is more or less difficult -to statically detect them. +Depending on their type, some dependencies are significantly harder to +statically detect than others. -\paragraph{Register-carried dependencies in straight-line code.} This case is -the easiest to statically detect, and is most often supported by code analyzers ----~for instance, \llvmmca{} supports it. The same strategy that was used to -dynamically find dependencies in \autoref{ssec:depsim} can still be used: a -shadow register file simply keeps track of which instruction last wrote each -register. +\textbf{Register-carried dependencies}, when in straight-line code, can be +detected by keeping track of which instruction last wrote each register in a +\emph{shadow register file}. This is most often supported by code analyzers +---~for instance, \llvmmca{} and \uica{} support it. -\paragraph{Register-carried, loop-carried dependencies.} Loop-carried -dependencies can, to some extent, be detected the same way. As the basic block -is always assumed to be the body of an infinite loop, a straight-line analysis -can be performed on a duplicated kernel. This strategy is \eg{} adopted by -\osaca{}~\cite{osaca2} (§II.D). +Loop-carried dependencies can, to some extent, be detected the same way. As the +basic block is always assumed to be the body of an infinite loop, a +straight-line analysis can be performed on a duplicated kernel. This strategy +is \eg{} adopted by \osaca{}~\cite{osaca2} (§II.D). 
When dealing only with +register accesses, this strategy is always sufficient: as each iteration always +executes the same basic block, it is not possible for an instruction to depend +on another instruction two iterations earlier or more. -When dealing only with register accesses, this -strategy is always sufficient: as each iteration always executes the same basic -block, it is not possible for an instruction to depend on another instruction -two iterations earlier or more. +\smallskip -\paragraph{Memory-carried dependencies in straight-line code.} Memory -dependencies, however, are significantly harder to tackle. While basic +\textbf{Memory-carried dependencies}, however, are significantly harder to tackle. While basic heuristics can handle some simple cases, in the general case two main difficulties arise: \begin{enumerate}[(i)] @@ -41,12 +37,14 @@ difficulties arise: Tracking memory-carried dependencies is, to the best of our knowledge, not done in code analyzers, as our results in \autoref{chap:CesASMe} suggests. -\paragraph{Loop-carried, memory-carried dependencies.} While the strategy -previously used for register-carried dependencies is sufficient to detect -loop-carried dependencies from one occurrence to the next one, it is not -sufficient at all times when the dependencies tracked are memory-carried. For -instance, in the second example from \autoref{lst:loop_carried_exn}, an -instruction depends on another two iterations ago. +\smallskip{} + +While the strategy previously used for register-carried dependencies is +sufficient to detect loop-carried dependencies from one occurrence to the next +one, it is not sufficient at all times when the dependencies tracked are +memory-carried. For instance, in the second example from +\autoref{lst:loop_carried_exn}, an instruction depends on another two +iterations ago. 
Dependencies can reach arbitrarily old iterations of a loop: in this example, \lstxasm{-8192(\%rbx, \%r10)} may be used to reach 1\,024 iterations back. @@ -55,7 +53,7 @@ necessarily \emph{relevant} from a performance analysis point of view. Indeed, if an instruction $i_2$ depends on a result previously produced by an instruction $i_1$, this dependency is only relevant if it is possible that $i_1$ is not yet completed when $i_2$ is considered for issuing ---~else, the -result is already produced, and $i_2$ needs not wait to execute. +result is already produced, and $i_2$ need never wait to execute. The reorder buffer (ROB) of a CPU can be modelled as a sliding window of fixed size over \uops{}. In particular, if a \uop{} $\mu_1$ is not yet retired, the diff --git a/manuscrit/60_staticdeps/35_rob_proof.tex b/manuscrit/60_staticdeps/35_rob_proof.tex index d0d9127..7e0f4bf 100644 --- a/manuscrit/60_staticdeps/35_rob_proof.tex +++ b/manuscrit/60_staticdeps/35_rob_proof.tex @@ -7,11 +7,11 @@ performance}\label{ssec:staticdeps:rob_proof} for the subtrace $\left(I_r\right)_{p < r < p'}$. \end{definition} -\begin{theorem}[Long distance dependencies]\label{thm.longdist} - There exists $R \in \nat$, only dependent of microarchitectural parameters, - such that the presence or absence of a dependency between two instructions - that are separated by at least $R-1$ other \uops{} has no impact on the - performance of this kernel. +\begin{theorem}[Long distance dependencies]\label{thm.longdist} Given a kernel + $\kerK$, there exists $R \in \nat$, only dependent on microarchitectural + parameters, such that the presence or absence of a dependency between two + instructions that are separated by at least $R-1$ other \uops{} has no + impact on the performance of this kernel. More formally, let $\kerK$ be a kernel of $n$ instructions. Let $(I_p)_{p \in \nat}$ be the trace of $\kerK$'s instructions executed in a loop. For @@ -29,7 +29,10 @@ the reorder buffer. 
It contains only decoded \uops{}. Let us denote $i_d$ the \uop{} at position $d$ in the reorder buffer. Assume $i_d$ just got decoded. - We have that for every $q$ and $q'$ in $[0,R)$: + + \nopagebreak{}As the buffer is a circular FIFO, we have that for every $q$ + and $q'$ in + $[0,R)$: \[ (q-d-1) \% R<(q'-d-1) \% R\ \iff \ i_q \text{ was decoded before } i_{q'} @@ -45,7 +48,8 @@ buffer}: \begin{postulate}[Full reorder buffer] Let us denote by $i_d$ the \uop{} that just got decoded. - The reorder buffer is said to be full if for $q=(d+1) \% R$, \uop{} $i_q$ is not retired yet. + The reorder buffer is said to be \emph{full} if for $q=(d+1) \% R$, \uop{} + $i_q$ is not retired yet. If the reorder buffer is full, then instruction decoding is stalled. \end{postulate} @@ -58,10 +62,10 @@ To prove the theorem above, we need to state that any two in-flight \uops{} are For any instruction $I_p$, we denote as $Q_p$ the range of indices such that $(i_q)_{q\in Q_p}$ are the \uops{} obtained from the decoding of $I_p$. -Note that in practice, we may not have $\bigcup{}_p Q_p = [0, n)$, as \eg{} -branch mispredictions may introduce unwanted \uops{} in the pipeline. However, -as the worst case for the lemma below occurs when no such ``spurious'' \uops{} -are present, we may safely ignore such occurrences. +Note that in practice, it is possible that we do not have $\bigcup{}_p Q_p = +[0, n)$, as \eg{} branch mispredictions may introduce unwanted \uops{} in the +pipeline. However, as the worst case for the lemma below occurs when no such +``spurious'' \uops{} are present, we may safely ignore such occurrences. 
\begin{lemma}[Distance of in-flight \uops{}] For any pair of instructions $(I_p,I_{p'})$, and two corresponding \uops{}, diff --git a/manuscrit/60_staticdeps/40_staticdeps.tex b/manuscrit/60_staticdeps/40_staticdeps.tex index e1a5a52..7b803ce 100644 --- a/manuscrit/60_staticdeps/40_staticdeps.tex +++ b/manuscrit/60_staticdeps/40_staticdeps.tex @@ -1,8 +1,9 @@ \section{Staticdeps} The static analyzer we present, \staticdeps{}, only aims to tackle the -difficulty~\ref{memcarried_difficulty_arith} mentioned above: tracking -dependencies across arbitrarily complex pointer arithmetic. +difficulty~(\ref{memcarried_difficulty_arith}) mentioned in +\autoref{ssec:staticdeps_detection}: tracking dependencies across arbitrarily +complex pointer arithmetic. To do so, \staticdeps{} works at the basic-block level, unrolled enough times to fill the reorder buffer as detailed above; this way, arbitrarily @@ -19,7 +20,7 @@ two arbitrary expressions can be costly. \caption{The \staticdeps{} algorithm}\label{alg:staticdeps} \end{algorithm} -Instead, we use an heuristic based on random values. We consider the set $\calR +Instead, we use a heuristic based on random values. We consider the set $\calR = \left\{0, 1, \ldots, 2^{64}-1\right\}$ of values representable by a 64-bits unsigned integer; we extend this set to $\bar\calR = \calR \cup \{\bot\}$, where $\bot$ denotes an invalid value. 
We then proceed as previously for diff --git a/manuscrit/60_staticdeps/42_staticdeps_alg.tex b/manuscrit/60_staticdeps/42_staticdeps_alg.tex index be22e43..5ad3321 100644 --- a/manuscrit/60_staticdeps/42_staticdeps_alg.tex +++ b/manuscrit/60_staticdeps/42_staticdeps_alg.tex @@ -13,7 +13,7 @@ \smallskip{} \Function{read\_memory}{address} - \State{} \textbf{Assert} address \neq{} ⊥ + \State{} \textbf{Assert} address \neq{} $\bot$ \If{address \not\in{} shadow\_memory} \State{} shadow\_memory[address] $\gets$ \Call{fresh}{} \EndIf{} @@ -21,7 +21,7 @@ \EndFunction{} \Function{read\_register}{register} - \Comment{likewise, without dependency tracking} + \State{} \ldots \Comment{Likewise, without dependency tracking} \EndFunction{} \smallskip{} @@ -36,13 +36,13 @@ \EndIf{} \State{} \Return{} \Call{read\_memory}{addr} \ElsIf{expr == IntegerArithmeticOp(operator, op1, …, opN)} - \If{\Call{expr\_value}{op\_i} == ⊥ for any i} - \State{} \Return{} ⊥ + \If{\Call{expr\_value}{op\_i} == $\bot$ for any i} + \State{} \Return{} $\bot$ \EndIf{} - \State\Return{} semantics(operator)(\Comment{provided by Valgrind's Vex} + \State\Return{} semantics(operator)(\Comment{Provided by Valgrind's Vex} \State{} \quad \Call{expr\_value}{op1}, …, \Call{expr\_value}{opN}) \Else{} - \Return{} ⊥ + \Return{} $\bot$ \EndIf{} \EndFunction{} @@ -52,8 +52,8 @@ \State{} shadow\_register[reg] \gets{} \Call{expr\_value}{rhs} \ElsIf{lhs == Memory(addr\_expr)} \State{} addr \gets{} \Call{expr\_value}{addr\_expr} - \State{} last\_wrote\_at[addr] \gets{}(cur\_iter, cur\_instruction) - \State{} shadow\_memory[addr] <- \Call{expr\_value}{rhs} + \State{} last\_wrote\_at[addr] \gets{} (cur\_iter, cur\_instruction) + \State{} shadow\_memory[addr] \gets{} \Call{expr\_value}{rhs} \ElsIf{\ldots} \Comment{Etc.} \EndIf{} diff --git a/manuscrit/60_staticdeps/50_eval.tex b/manuscrit/60_staticdeps/50_eval.tex index 6e3294d..daddef9 100644 --- a/manuscrit/60_staticdeps/50_eval.tex +++ b/manuscrit/60_staticdeps/50_eval.tex @@ 
-220,8 +220,8 @@ analyzers. \subsection{Enriching \uica{}'s model} -To estimate the real gain in performance debugging scenarios, however, we -integrate \staticdeps{} into \uica{}. +To estimate the real gain in performance debugging scenarios, we integrate +\staticdeps{} into \uica{}. There is, however, a discrepancy between the two tools: while \staticdeps{} works at the assembly instruction level, \uica{} works at the \uop{} level. In @@ -233,11 +233,11 @@ for the \staticdeps{} analysis. We bridge this gap in a conservative way: whenever two instructions $i_1, i_2$ are found to be dependant, we add a dependency between each couple $\mu_1 \in -i_1, \mu_2 \in i_2$. This approximation is thus pessimistic, and should predict -execution times biased towards a slower computation kernel. A finer model, or a -finer (conservative) filtering of which \uops{} must be considered dependent ----~\eg{} a memory dependency can only come from a memory-related \uop{}~--- -may enhance the accuracy of our integration. +i_1, \mu_2 \in i_2$. This approximation is thus largely pessimistic, and should +predict execution times biased towards a slower computation kernel. A finer +model, or a finer (conservative) filtering of which \uops{} must be considered +dependent ---~\eg{} a memory dependency can only come from a memory-related +\uop{}~--- may enhance the accuracy of our integration. \begin{table} \centering diff --git a/manuscrit/60_staticdeps/99_conclusion.tex b/manuscrit/60_staticdeps/99_conclusion.tex index e16c381..bc8e282 100644 --- a/manuscrit/60_staticdeps/99_conclusion.tex +++ b/manuscrit/60_staticdeps/99_conclusion.tex @@ -17,8 +17,10 @@ context} pointed out in the previous chapter. \medskip{} Our evaluation of \staticdeps{} against a dynamic analysis baseline, -\depsim{}, shows that it only finds about 60\,\% of the existing dependencies. 
-We however enrich \uica{} with \staticdeps{}, and find that it performs on the +\depsim{}, shows that it finds between 95\,\% and 98\,\% of the existing +dependencies, depending on the metric used, giving us good confidence in the +reliability of \staticdeps{}. +We further enrich \uica{} with \staticdeps{}, and find that it performs on the full \cesasme{}'s dataset as well as \uica{} alone on the pruned dataset of \cesasme{}, removing memory-carried bottlenecks. From this, we conclude that \staticdeps{} is very successful at finding the data dependencies through diff --git a/manuscrit/90_wrapping_up/main.tex b/manuscrit/90_wrapping_up/main.tex index 7339630..749226d 100644 --- a/manuscrit/90_wrapping_up/main.tex +++ b/manuscrit/90_wrapping_up/main.tex @@ -44,15 +44,15 @@ create a full analyzer implementing this idea, such as what we did with \palmed{ for backend models, or such as \uica{}'s implementation, focusing on frontend analysis. -In hindsight, we advocate for the emergence of such a modular code analyzer. -It would maybe not be as convenient or well-integrated as ``production-ready'' -code analyzers, such as \llvmmca{} ---~which is packaged for Debian. It could, -however, greatly simplify the academic process of trying a new idea on any of -the three main models, by decorrelating them. It would also ease the -comparative evaluation of those ideas, while eliminating many of the discrepancies -between experimental setups that make an actual comparison difficult ---~the -reason that prompted us to make \cesasme{} in \autoref{chap:CesASMe}. Indeed, -with such a modular tool, it would be easy to run the same experiment, in the -same conditions, while only changing \eg{} the frontend model but keeping a -well-tried backend model. +In hindsight, we advocate for the emergence of such a modular code analyzer. 
It +would maybe not be as convenient or well-integrated as ``production-ready'' +code analyzers, such as \llvmmca{} ---~which is officially packaged for Debian. +It could, however, greatly simplify the academic process of trying a new idea +on any of the three main models, by decorrelating them. It would also ease the +comparative evaluation of those ideas, while eliminating many of the +discrepancies between experimental setups that make an actual comparison +difficult ---~the reason that prompted us to make \cesasme{} in +\autoref{chap:CesASMe}. Indeed, with such a modular tool, it would be easy to +run the same experiment, in the same conditions, while only changing \eg{} the +frontend model but keeping a well-tried backend model. diff --git a/manuscrit/include/packages.tex b/manuscrit/include/packages.tex index 6912d49..7c0245f 100644 --- a/manuscrit/include/packages.tex +++ b/manuscrit/include/packages.tex @@ -88,6 +88,7 @@ \newfloat{lstfloat}{htbp}{lop} \floatname{lstfloat}{Listing} \def\lstfloatautorefname{Listing} +\DeclareCaptionSubType{lstfloat} \newfloat{algorithm}{htbp}{lop} \floatname{algorithm}{Algorithm}