From 3511d27516bb27fcd4579540b3f98f65493c6077 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?= <contact@tobast.fr> Date: Thu, 28 Sep 2023 17:45:31 +0200 Subject: [PATCH] Staticdeps: up to evaluation (not yet started) --- manuscrit/50_CesASMe/main.tex | 2 +- manuscrit/60_staticdeps/10_types_of_deps.tex | 17 +++-- manuscrit/60_staticdeps/20_dynamic.tex | 40 ++++++++++ .../60_staticdeps/30_static_principle.tex | 75 +++++++++++++++++++ manuscrit/60_staticdeps/40_staticdeps.tex | 63 ++++++++++++++++ manuscrit/biblio/misc.bib | 7 ++ manuscrit/include/macros.tex | 2 + 7 files changed, 200 insertions(+), 6 deletions(-) diff --git a/manuscrit/50_CesASMe/main.tex b/manuscrit/50_CesASMe/main.tex index f67f3d9..0c75c78 100644 --- a/manuscrit/50_CesASMe/main.tex +++ b/manuscrit/50_CesASMe/main.tex @@ -1,5 +1,5 @@ \chapter{A more systematic approach to throughput prediction performance -analysis: \cesasme{}} +analysis: \cesasme{}}\label{chap:CesASMe} \input{00_intro.tex} \input{02_measuring_exec_time.tex} diff --git a/manuscrit/60_staticdeps/10_types_of_deps.tex b/manuscrit/60_staticdeps/10_types_of_deps.tex index 41b279a..713654e 100644 --- a/manuscrit/60_staticdeps/10_types_of_deps.tex +++ b/manuscrit/60_staticdeps/10_types_of_deps.tex @@ -50,9 +50,10 @@ however, other channels. As we saw in the introduction to this chapter, as well as in the previous chapter, dependencies can also be \emph{memory-carried}, in more or less -straightforward ways, such as in the following examples, where the last line -always depend on the first: +straightforward ways, such as in the examples from +\autoref{lst:mem_carried_exn}, where the last line always depends on the first. +\begin{lstfloat}[h!] 
\begin{minipage}[t]{0.32\linewidth} \begin{lstlisting}[language={[x86masm]Assembler}] add %rax, (%rbx) @@ -70,6 +71,8 @@ lea 16(%rbx), %r10 add %rax, (%rbx) add -16(%r10), %rcx\end{lstlisting} \end{minipage}\hfill +\caption{Examples of memory-carried dependencies.}\label{lst:mem_carried_exn} +\end{lstfloat} \smallskip{} @@ -90,8 +93,10 @@ with a large emphasis on memory-carried dependencies. \paragraph{Presence of loops.} The previous examples were all pieces of \emph{straight-line code} in which a dependency arose. However, many -dependencies are actually \emph{loop-carried}, such as the following: +dependencies are actually \emph{loop-carried}, such as those in +\autoref{lst:loop_carried_exn}. +\begin{lstfloat} \begin{minipage}[t]{0.48\linewidth} \begin{lstlisting}[language={[x86masm]Assembler}] # Compute sum(A), %rax points to A @@ -103,11 +108,13 @@ loop: \end{minipage}\hfill \begin{minipage}[t]{0.48\linewidth} \begin{lstlisting}[language={[x86masm]Assembler}] -# Compute B[i] = A[i] + B[i-1] +# Compute B[i] = A[i] + B[i-2] loop: - mov -8(%rbx, %r10), (%rbx, %r10) + mov -16(%rbx, %r10), (%rbx, %r10) add (%rax, %r10), (%rbx, %r10) add $8, %r10 jmp loop \end{lstlisting} \end{minipage}\hfill +\caption{Examples of loop-carried dependencies.}\label{lst:loop_carried_exn} +\end{lstfloat} diff --git a/manuscrit/60_staticdeps/20_dynamic.tex b/manuscrit/60_staticdeps/20_dynamic.tex index caa5b75..a2725b5 100644 --- a/manuscrit/60_staticdeps/20_dynamic.tex +++ b/manuscrit/60_staticdeps/20_dynamic.tex @@ -38,3 +38,43 @@ Valgrind, which will re-compile it to a native binary before running it. While this intermediate representation, called \vex{}, is convenient to instrument a binary, it may further be used as a way to obtain \emph{semantics} for some assembly code, independently of the Valgrind framework. 
+ +\subsection{Depsim}\label{ssec:depsim} + +The tool we wrote to extract runtime-gathered dependencies, \depsim{}, is +able to extract dependencies through registers, memory and temporary +variables ---~in its intermediate representation, Valgrind keeps some values +assigned to temporary variables in static single-assignment (SSA) form. +It however supports a flag to detect only memory-carried dependencies, as this +will be useful to evaluate our static algorithm later. + +As \depsim{} is a dynamic tool, the distinction between straight-line code and loop-carried +dependencies is irrelevant, as the analysis follows the actual program flow. + +\medskip{} + +In order to track dependencies, each basic block of the program is +instrumented. Dependencies are stored as a hash table and represented as a +pair of source and destination program counters; they are mapped to a number of +encountered occurrences. + +Dependencies through temporaries are, by construction, local to a single +basic block ---~they are thus statically detected at instrumentation time. At +runtime, the occurrence count of those dependencies is updated whenever the +basic block is executed. + +For both register- and memory-carried dependencies, each write is instrumented +by adding a runtime write to a \emph{shadow} register file or memory, noting +that the written register or memory address was last written at the current +program counter. Each read, in turn, is instrumented by adding a fetch to this +shadow register file or memory, retrieving the last program counter at which +this location was written to; the dependency count between this program counter +and the current program counter is then incremented. + +In practice, the shadow register file is simply implemented as an array +holding, for each register id, the last program counter that wrote to this +location. The shadow memory is instead implemented as a hash table. + +At the end of the run, all the dependencies retrieved are reported. 
Care is +taken to translate back the runtime program counters to addresses in the +original ELF files, using the running process' memory map. diff --git a/manuscrit/60_staticdeps/30_static_principle.tex b/manuscrit/60_staticdeps/30_static_principle.tex index cffcb8a..72a0578 100644 --- a/manuscrit/60_staticdeps/30_static_principle.tex +++ b/manuscrit/60_staticdeps/30_static_principle.tex @@ -1 +1,76 @@ \section{Static dependencies detection} + +Depending on the type of dependencies considered, it is more or less difficult +to statically detect them. + +\paragraph{Register-carried dependencies in straight-line code.} This case is +the easiest to statically detect, and is most often supported by code analyzers +---~for instance, \llvmmca{} supports it. The same strategy that was used to +dynamically find dependencies in \autoref{ssec:depsim} can still be used: a +shadow register file simply keeps track of which instruction last wrote each +register. + +\paragraph{Register-carried, loop-carried dependencies.} Loop-carried +dependencies can, to some extent, be detected the same way. As the basic block +is always assumed to be the body of an infinite loop, a straight-line analysis +can be performed on a duplicated kernel. This strategy is \eg{} adopted by +\osaca{}~\cite{osaca2} (§II.D). + +When dealing only with register accesses, this +strategy is always sufficient: as each iteration always executes the same basic +block, it is not possible for an instruction to depend on another instruction +two iterations earlier or more. + +\paragraph{Memory-carried dependencies in straight-line code.} Memory +dependencies, however, are significantly harder to tackle. 
While basic +heuristics can handle some simple cases, in the general case two main +difficulties arise: +\begin{enumerate}[(i)] + \item{}\label{memcarried_difficulty_alias} pointers may \emph{alias}, \ie{} + point to the same address or array; for instance, if \reg{rax} points + to an array, it may be that \reg{rbx} points to $\reg{rax} + 8$, making + the detection of such a dependency difficult; + \item{}\label{memcarried_difficulty_arith} arbitrary arithmetic operations + may be performed on pointers, possibly through diverting paths: \eg{} + it might be necessary to detect that $\reg{rax} + (16 \ll 2)$ is identical + to $\reg{rax} + 128 / 2$; this requires semantics for assembly + instructions and tracking formal expressions across register values + ---~and possibly even memory. +\end{enumerate} + +Tracking memory-carried dependencies is, to the best of our knowledge, not done +in code analyzers, as our results in \autoref{chap:CesASMe} suggest. + +\paragraph{Loop-carried, memory-carried dependencies.} While the strategy +previously used for register-carried dependencies is sufficient to detect +loop-carried dependencies from one occurrence to the next one, it is not +sufficient at all times when the dependencies tracked are memory-carried. For +instance, in the second example from \autoref{lst:loop_carried_exn}, an +instruction depends on another from two iterations earlier. + +Dependencies can reach arbitrarily old iterations of a loop: in this example, +\lstxasm{-8192(\%rbx, \%r10)} may be used to reach 1\,024 iterations back. +However, while far-reaching dependencies may \emph{exist}, they are not +necessarily \emph{relevant} from a performance analysis point of view. Indeed, +if an instruction $i_2$ depends on a result previously produced by an +instruction $i_1$, this dependency is only relevant if it is possible that +$i_1$ is not yet completed when $i_2$ is considered for issuing ---~else, the +result is already produced, and $i_2$ need not wait to execute. 
+ +The reorder buffer (ROB) of a CPU can be modelled as a sliding window of fixed +size over \uops{}. In particular, if a \uop{} $\mu_1$ is not yet retired, the +ROB may not contain \uops{} more than the ROB's size ahead of $\mu_1$. This is +in particular also true for instructions, as the vast majority of instructions +decode to at least one \uop{}\footnote{Some \texttt{mov} instructions from + register to register may, for instance, only have an impact on the renamer; +no \uops{} are dispatched to the backend.}. + +A possible solution to detect loop-carried dependencies in a kernel $\kerK$ is +thus to unroll it until it contains about $\card{\text{ROB}} + +\card{\kerK}$ instructions. This ensures that every instruction in the last kernel can find +dependencies reaching up to $\card{\text{ROB}}$ instructions back. + +On Intel CPUs, the reorder buffer holds 224 \uops{} on Skylake (2015), +and 512 \uops{} on Golden Cove (2021)~\cite{wikichip_intel_rob_size}. These +sizes are small enough to reasonably use this solution without excessive +slowdown. diff --git a/manuscrit/60_staticdeps/40_staticdeps.tex b/manuscrit/60_staticdeps/40_staticdeps.tex index 77dc877..2ccef69 100644 --- a/manuscrit/60_staticdeps/40_staticdeps.tex +++ b/manuscrit/60_staticdeps/40_staticdeps.tex @@ -1 +1,64 @@ \section{The \staticdeps{} heuristic} + +The static analyzer we present, \staticdeps{}, only aims to tackle the +difficulty~\ref{memcarried_difficulty_arith} mentioned above: tracking +dependencies across arbitrarily complex pointer arithmetic. + +To do so, \staticdeps{} works at the basic-block level, unrolled enough times +to fill the reorder buffer as detailed above; this way, arbitrarily +long-reaching relevant loop-carried dependencies can be detected. + +This problem could be solved using symbolic calculus algorithms. However, those +algorithms are not straightforward to implement, and the equality test between +two arbitrary expressions can be costly. 
+ +\medskip{} +Instead, we use a heuristic based on random values. We consider the set $\calR += \left\{0, 1, \ldots, 2^{64}-1\right\}$ of values representable by a 64-bit +unsigned integer; we extend this set to $\bar\calR = \calR \cup \{\bot\}$, +where $\bot$ denotes an invalid value. We then proceed as previously for +register-carried dependencies, applying the following principles. + +\smallskip{} +\begin{itemize} + \item{} Whenever an unknown value is read, either from a register or from + memory, generate a fresh value from $\calR$, uniformly sampled at + random. This value is saved to a shadow register file or memory, and + will be used again the next time this same data is accessed. + + \item{} Whenever an integer arithmetic operation is encountered, compute + the result of the operation and save the result to the shadow register + file or memory. + + \item{} Whenever another kind of operation, or an operation that is + unsupported, is encountered, save the destination operand as $\bot$; + this operation is assumed to not be valid pointer arithmetic. + Operations on $\bot$ always yield $\bot$ as a result. + + \item{} Whenever writing to a memory location, compute the written address + using the above principles, and proceed as with a dynamic analysis, + keeping track of the instruction that last wrote to a memory address. + + \item{} Whenever reading from a memory location, compute the read address + using the above principles, and generate a dependency from the current + instruction to the instruction that last wrote to this address (if + known). +\end{itemize} + +The semantics needed to compute encountered operations are obtained by lifting +the kernel's assembly to \valgrind{}'s \vex{} intermediate representation. + +\medskip{} + +This first analysis provides us with a raw list of dependencies across +iterations of the considered basic block. 
We then ``re-roll'' the unrolled +kernel by transcribing each dependency to a triplet $(\texttt{source\_insn}, +\texttt{dest\_insn}, \Delta{}k)$, where the first two elements are the source +and destination instructions of the dependency \emph{in the original, +non-unrolled kernel}, and $\Delta{}k$ is the number of iterations of the kernel +between the source and destination instructions of the dependency. + +Finally, we filter out spurious dependencies: each dependency found should +occur for each kernel iteration $i$ at which $i + \Delta{}k$ is within bounds. +If the dependency is found for fewer than $80\,\%$ of those iterations, the +dependency is declared spurious and is dropped. diff --git a/manuscrit/biblio/misc.bib b/manuscrit/biblio/misc.bib index 8ce47b0..36d7ead 100644 --- a/manuscrit/biblio/misc.bib +++ b/manuscrit/biblio/misc.bib @@ -133,3 +133,10 @@ howpublished={\url{https://www.arm.com/company/news/2023/09/building-the-future-of-computing-on-arm}}, } +@misc{wikichip_intel_rob_size, + title={Intel Details Golden Cove: Next-Generation Big Core For Client and Server SoCs}, + author={{WikiChip}}, + year=2021, + month=08, + howpublished={\url{https://fuse.wikichip.org/news/6111/intel-details-golden-cove-next-generation-big-core-for-client-and-server-socs/}} +} diff --git a/manuscrit/include/macros.tex b/manuscrit/include/macros.tex index 8be9a2a..7aa4105 100644 --- a/manuscrit/include/macros.tex +++ b/manuscrit/include/macros.tex @@ -21,6 +21,8 @@ \newcommand{\mucount}{\#_{\mu}} \newcommand{\ceil}[1]{\left\lceil{} #1 \right\rceil{}} +\newcommand{\card}[1]{\left| #1 \right|} + % Names \newcommand{\fgruber}{Fabian \textsc{Gruber}}