From 625f21739a013ff45757d10927e2ce71e7daa9d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Tue, 26 Sep 2023 16:39:12 +0200
Subject: [PATCH] Palmed: first writeup of Benchsuite-bb

---
 manuscrit/30_palmed/35_benchsuite_bb.tex  | 202 ++++++++++++++++++++++
 manuscrit/30_palmed/40_palmed_results.tex |  63 -------
 manuscrit/30_palmed/main.tex              |   1 +
 manuscrit/biblio/bench_suites.bib         |  11 ++
 manuscrit/include/macros.tex              |   1 +
 manuscrit/include/packages.tex            |   5 +
 6 files changed, 220 insertions(+), 63 deletions(-)
 create mode 100644 manuscrit/30_palmed/35_benchsuite_bb.tex

diff --git a/manuscrit/30_palmed/35_benchsuite_bb.tex b/manuscrit/30_palmed/35_benchsuite_bb.tex
new file mode 100644
index 0000000..9d38a7b
--- /dev/null
+++ b/manuscrit/30_palmed/35_benchsuite_bb.tex
@@ -0,0 +1,202 @@
+\section{Finding basic blocks to evaluate \palmed{}}
+
+In the context described above, my main task within the \palmed{} project was
+to build a system able to evaluate a produced mapping on a given architecture.
+
+Some tools, such as \pmevo{}~\cite{PMEvo}, use randomly-sampled basic blocks
+for their evaluation. However, random generation may yield basic blocks that
+are not representative of the various workloads our model might be used on.
+Thus, while arbitrarily or randomly generated microbenchmarks were well suited
+to the data acquisition phase needed to generate the model, the kernels on
+which the model would be evaluated could not be arbitrary, but had to come
+from real-world programs.
+
+\subsection{Benchmark suites}
+
+Models generated by \palmed{} are meant to be used on basic blocks that are
+computationally intensive ---~so that the backend is actually the relevant
+resource to monitor, as opposed to \eg{} frontend- or input/output-bound
+code~--- and running in steady-state ---~that is, the body of a loop long
+enough to be reasonably considered infinite for performance modelling
+purposes. The basic blocks used to evaluate \palmed{} should thus be
+reasonably close to these criteria.
+
+For this reason, we evaluate \palmed{} on basic blocks extracted from
+two well-known benchmark suites instead: Polybench and SPEC CPU 2017.
+
+\paragraph{Polybench} is a suite of benchmarks built out of 30 kernels of
+numerical computation~\cite{bench:polybench}. Its benchmarks are
+domain-specific and centered around scientific computation, mathematical
+computation, image processing, etc. As the computation kernels are
+clearly identifiable in the source code, extracting the relevant basic blocks
+is easy, which fits our purpose well. It is written in C. Although it is not
+under a free/libre software license, it is free to use and open-source.
+
+\paragraph{SPEC CPU 2017} is a suite of benchmarks meant to be CPU
+intensive~\cite{bench:spec}. It is composed of both integer and floating-point
+based benchmarks, extracted from (mainly open source) real-world software, such
+as \texttt{gcc}, \texttt{imagemagick}, \ldots{} Its main purpose is to obtain
+metrics and compare CPUs on a unified workload; it is however commonly used
+throughout the literature to evaluate compilers, optimizers, code analyzers,
+etc. It is split into four variants: integer and floating-point, combined with
+speed ---~time to perform a single task~--- and rate ---~throughput for
+performing a flow of tasks. Most benchmarks exist in both speed and rate mode.
+
+The SPEC suite is under a paid license, and cannot be redistributed, which
+makes peer-review and replication of experiments ---~\eg{} for artifact
+review~--- complicated.
+
+\subsection{Manually extracting basic blocks}
+
+Our first approach, which we used to extract basic blocks from the two
+benchmark suites introduced above for the evaluation included in our
+\palmed{} article~\cite{palmed}, was largely manual. We use different
+---~though similar~--- approaches for Polybench and SPEC\@.
+
+In the case of Polybench, we compile multiple versions of each benchmark
+(\texttt{-O2}, \texttt{-O3} and tiled using the Pluto
+optimizer~\cite{tool:pluto}). We then use \qemu~\cite{tool:qemu} to extract
+\textit{translation blocks} ---~very akin to basic blocks~--- and an occurrence
+count for each of those. We finally select the basic blocks that have enough
+occurrences to be loop bodies.
+
+In the case of SPEC, we replace \qemu{} with the Linux \perf{} profiler, as
+individual benchmarks of the suite are heavier than Polybench benchmarks,
+making \qemu{}'s instrumentation overhead impractical. While \perf{} provides
+us with occurrence statistics, it does not chunk the program into basic blocks;
+we use an external disassembler and heuristics on the instructions to do this
+chunking. We describe both aspects ---~profiling and chunking~--- in more
+detail below.
+
+Altogether, this method generates, for x86-64 processors, 13\,778 SPEC-based
+and 2\,664 Polybench-based basic blocks.
+
+\subsection{Automating basic block extraction}
+
+This manual method, however, has multiple drawbacks. It is, obviously, tedious
+to manually compile and run a benchmark suite, then extract basic blocks using
+one of two collections of scripts depending on which suite is used. It is also
+impractical that the two benchmark suites are scraped using very similar, yet
+different techniques: as the basic blocks are not chunked using the same code,
+they might have slightly different properties.
+
+Most importantly, this manual extraction is not reproducible. This comes with
+two problems.
+\begin{itemize}
+    \item If the dataset were to be lost, or if another researcher wanted to
+        reproduce our results, the exact same dataset could not be recreated.
+        The same general procedure could be followed again, but the code and
+        scripts would have to be rewritten, and the manually typed,
+        undocumented shell commands recovered.
+    \item The same consideration applies to porting the dataset to another ISA.
+        Indeed, as the dataset consists of assembly-level basic blocks, it
+        cannot be transferred to another ISA: it has to be re-generated from
+        source-level benchmarks. This poses the same problems as the first
+        point.
+\end{itemize}
+
+This second point particularly motivated us to automate the basic block
+extraction procedure when \palmed{} ---~and the underlying \pipedream{}~---
+were extended to produce mappings for ARM processors.
+
+\medskip{}
+
+Our automated extraction tool, \benchsuitebb{}, is able to extract basic blocks
+from Polybench and SPEC. Although we do not use it to evaluate \palmed{}, it
+also supports the extraction of basic blocks from Rodinia~\cite{bench:rodinia},
+a benchmark suite targeting heterogeneous computing and featuring various
+common kernels, such as K-means, backpropagation, BFS, \ldots{}
+
+For the most part, \benchsuitebb{} implements the manual approach used for
+SPEC\@. On top of an abstraction layer meant to unify the interface to all
+benchmark suites, it executes the various compiled binaries while profiling
+them through \perf{}, and chunks the relevant parts into basic blocks using a
+disassembler.
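+
+As an illustration, this driver logic may be sketched in Python as follows.
+The sketch is deliberately simplified and hypothetical: the helper names are
+made up for the example, and it recovers the samples through
+\lstbash{perf script}, whereas the actual implementation parses the raw trace
+described below.
+
+\begin{lstlisting}[language=Python]
+import subprocess
+from collections import Counter
+
+def sample_pcs(binary, freq=1000):
+    """Run `binary' under perf, sampling the program counter at a
+    fixed frequency (hypothetical, simplified sketch)."""
+    subprocess.run(
+        ["perf", "record", "-F", str(freq), "-o", "perf.data", "--", binary],
+        check=True,
+    )
+    # One sampled instruction pointer per line, in hexadecimal
+    result = subprocess.run(
+        ["perf", "script", "-i", "perf.data", "-F", "ip"],
+        check=True, capture_output=True, text=True,
+    )
+    return [int(field, 16) for field in result.stdout.split()]
+
+def pc_histogram(binary):
+    """Occurrence count of each sampled PC, to be mapped to basic blocks."""
+    return Counter(sample_pcs(binary))
+\end{lstlisting}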
+
+\paragraph{Profiling with \perf{}.}
+The \perf{} profiler~\cite{tool:perf} is part of the Linux kernel. It works by
+sampling the current program counter (as well as the stack, if requested, to
+obtain a stack trace), either upon event occurrences ---~such as a number of
+elapsed CPU cycles, context switches or cache misses~--- or simply at a fixed,
+user-defined frequency.
+
+In our case, we use the latter mode to uniformly sample the program counter
+across a run. We recover the output of the profiling as a \textit{raw trace}
+with \lstbash{perf report -D}.
+
+\paragraph{ELF navigation: \texttt{pyelftools} and \texttt{capstone}.}
+To trace these program counter samples back to basic blocks, we then need to
+chunk the relevant sections of the ELF binary into basic blocks. For this,
+we use two tools: \texttt{pyelftools} and \texttt{capstone}.
+
+The \texttt{pyelftools} Python library is able to parse and decode much of the
+information contained in an ELF file. In our case, it allows us to find the
+\texttt{.text} section of the input binary, search for symbols, find the symbol
+containing a given program counter, extract the raw assembled bytes between two
+addresses, etc.
+
+The \texttt{capstone} disassembler, on the other hand, allows us to disassemble
+a portion of machine code back to assembly. It supports many ISAs, among
+which x86-64 and ARM, the two ISAs we investigate in this manuscript. It is
+able to extract relevant details out of an instruction: which operands,
+registers, \ldots{} it uses; which broader group of instructions it belongs to;
+etc. These groups of instructions are particularly useful in our case, as they
+allow us to find control flow instructions without writing ISA-specific code.
+These control-altering instructions are jumps, calls and returns. We are
+also able to trace a (relative) jump to its jump site, enabling us later to
+have a finer definition of basic blocks.
+
+\begin{algorithm}
+    \begin{algorithmic}
+        \Function{bbsOfSymbol}{symbol} \Comment{Memoized, computed only once
+        per symbol}
+            \State instructions $\gets$ disassemble(bytesFor(symbol))
+            \Comment{Uses both pyelftools and capstone}
+
+            \State flowSites $\gets \emptyset$
+            \State jumpSites $\gets \emptyset$
+
+            \For{instr $\in$ instructions}
+                \If{isControlFlow(instr)}
+                    \State flowSites $\gets \text{flowSites} \cup \{\text{next(instr).addr}\}$
+                    \If{isJump(instr)}
+                        \State jumpSites $\gets \text{jumpSites} \cup \{\text{instr.jump\_addr}\}$
+                    \EndIf
+                \EndIf
+            \EndFor
+
+            \State \textbf{return} instructions.splitAt($\text{flowSites} \cup
+            \text{jumpSites}$)
+        \EndFunction
+
+        \medskip{}
+
+        \Function{bbsOfPcs}{pcs}
+            \State occurrences $\gets \{\}$
+            \For{pc $\in$ pcs}
+                \State bbs $\gets$ bbsOfSymbol(symbolOfPc(pc))
+                \State bb $\gets$ bisect(pc, bbs)
+                \State $\text{occurrences}[\text{bb}] ++$
+            \EndFor
+            \State \textbf{return} occurrences
+        \EndFunction
+    \end{algorithmic}
+
+    \caption{Basic block extraction procedure, given a \perf{}-obtained list of
+    program counters.}\label{alg:bb_extr_procedure}
+\end{algorithm}
+
+\paragraph{Extracting basic blocks.} We describe the basic block extraction,
+given the \perf{}-provided list of sampled program counters, in
+\autoref{alg:bb_extr_procedure}. For each program counter, we find the ELF
+symbol it belongs to, and decompose this whole symbol into basic blocks ---~we
+memoize this step to do it only once per symbol. We then bisect the list of
+obtained basic blocks to find the one containing the current PC, and count the
+occurrences of each block.
+
+To split a symbol into basic blocks, we determine, using \texttt{capstone}, its
+set of \emph{flow sites} and \emph{jump sites}. The former is the set of
+addresses just after a control flow instruction, while the latter is the set of
+addresses to which jump instructions may jump. We then split the
+straight-line code of the symbol using the union of both sets as boundaries.
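+
+A simplified Python version of this splitting is sketched below. It assumes
+x86-64 code, and a hypothetical helper \texttt{bytes\_for}, which would rely
+on \texttt{pyelftools} to return the raw bytes and start address of a symbol:
+
+\begin{lstlisting}[language=Python]
+import capstone
+from capstone import x86
+
+md = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
+md.detail = True  # required to query instruction groups and operands
+
+CONTROL_FLOW = (capstone.CS_GRP_JUMP, capstone.CS_GRP_CALL,
+                capstone.CS_GRP_RET)
+
+def split_into_bbs(symbol):
+    """Split a symbol into basic blocks (lists of instructions)."""
+    code, base_addr = bytes_for(symbol)  # hypothetical pyelftools helper
+    instructions = list(md.disasm(code, base_addr))
+
+    flow_sites, jump_sites = set(), set()
+    for insn in instructions:
+        if any(insn.group(grp) for grp in CONTROL_FLOW):
+            flow_sites.add(insn.address + insn.size)  # address right after
+        if insn.group(capstone.CS_GRP_JUMP):
+            operand = insn.operands[0]
+            if operand.type == x86.X86_OP_IMM:  # direct (relative) jump
+                jump_sites.add(operand.imm)
+
+    # Split the symbol's straight-line code at each boundary site
+    boundaries = flow_sites | jump_sites
+    bbs, current = [], []
+    for insn in instructions:
+        if insn.address in boundaries and current:
+            bbs.append(current)
+            current = []
+        current.append(insn)
+    if current:
+        bbs.append(current)
+    return bbs
+\end{lstlisting}
+
+Counting occurrences then amounts to bisecting, for each sampled program
+counter, the sorted list of basic block start addresses, as in
+\textsc{bbsOfPcs} above.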
diff --git a/manuscrit/30_palmed/40_palmed_results.tex b/manuscrit/30_palmed/40_palmed_results.tex
index 49f6ddc..92a3bc0 100644
--- a/manuscrit/30_palmed/40_palmed_results.tex
+++ b/manuscrit/30_palmed/40_palmed_results.tex
@@ -3,69 +3,6 @@
 The main contribution I made to \palmed{} is its evaluation harness and
 procedure. \todo{}
 
-\subsection{Basic blocks from benchmark suites}
-
-Models generated by \palmed{} are meant to be used on basic blocks that are
-computationally intensive ---~so that the backend is actually the relevant
-resource to monitor, compared to \eg{} frontend- or input/output-bound code~---,
-running in steady-state ---~that is, which is the body of a loop long enough to
-be reasonably considered infinite for performance modelling purposes. The basic
-blocks used to evaluate \palmed{} should thus be reasonably close from these
-criteria.
-
-Some tools, such as \pmevo{}~\cite{PMEvo}, use randomly-sampled basic blocks
-for their evaluation. This approach, however, may yield basic blocks that do
-not fit in those criteria; furthermore, it may not be representative of
-real-life code on which the users of the tool expect it to be accurate.
-
-For this reason, we evaluate \palmed{} on basic blocks extracted from
-two well-known benchmark suites instead: Polybench and SPEC CPU 2017.
-
-\paragraph{Polybench} is a suite of benchmarks built out of 30 kernels of
-numerical computation~\cite{bench:polybench}. Its benchmarks are
-domain-specific and centered around scientific computation, mathematical
-computation, image processing, etc. As the computation kernels are
-clearly identifiable in the source code, extracting the relevant basic blocks
-is easy, and fits well for our purpose. It is written in C language. Although
-it is not under a free/libre software license, it is free to use and
-open-source.
-
-We compile multiple versions of each benchmark (\texttt{-O2}, \texttt{-O3} and
-tiled using the Pluto optimizer~\cite{tool:pluto}), then extract the basic
-block corresponding to the benchmarks' kernels using \qemu~\cite{tool:qemu},
-gathering translation blocks and occurrence statistics.
-
-\paragraph{SPEC CPU 2017} is a suite of benchmarks meant to be CPU
-intensive~\cite{bench:spec}. It is composed of both integer and floating-point
-based benchmarks, extracted from (mainly open source) real-world software, such
-as \texttt{gcc}, \texttt{imagemagick}, \ldots{} Its main purpose is to obtain
-metrics and compare CPUs on a unified workload; it is however commonly used
-throughout the literature to evaluate compilers, optimizers, code analyzers,
-etc. It is split into four variants: integer and floating-point, combined with
-speed ---~time to perform a single task~--- and rate ---~throughput for
-performing a flow of tasks. Most benchmarks exist in both speed and rate mode.
-The SPEC suite is under a paid license, and cannot be redistributed, which
-makes peer-review and replication of experiments ---~\eg{} for artifact
-review~--- complicated.
-
-In the case of SPEC, there is no clear kernel available for each benchmark;
-extracting basic blocks to evaluate \palmed{} is not trivial. We manually
-extract the relevant basic blocks using a profiling-based approach with Linux
-\perf{}, as the \qemu{}-based solution used for Polybench would be too costly
-for SPEC\@.
-
-\bigskip{}
-
-Altogether, this method generates, for x86-64 processors, 13\,778 SPEC-based
-and 2\,664 polybench-based basic blocks.
-
-\medskip{}
-
-We automate and describe in detail the \perf{}-based method later in
-\qtodo{ref}; however, this automation allows us to also extract basic blocks
-from these same benchmark suites, compiled for ARMv8a.
-
-
 \subsection{Evaluation harness}
 
 We implement into \palmed{} an evaluation harness to evaluate it both against
diff --git a/manuscrit/30_palmed/main.tex b/manuscrit/30_palmed/main.tex
index 370d472..e185278 100644
--- a/manuscrit/30_palmed/main.tex
+++ b/manuscrit/30_palmed/main.tex
@@ -4,5 +4,6 @@
 \input{10_resource_models.tex}
 \input{20_palmed_design.tex}
 \input{30_pipedream.tex}
+\input{35_benchsuite_bb.tex}
 \input{40_palmed_results.tex}
 \input{50_other_contributions.tex}
diff --git a/manuscrit/biblio/bench_suites.bib b/manuscrit/biblio/bench_suites.bib
index 7c4c3c0..8f0ce75 100644
--- a/manuscrit/biblio/bench_suites.bib
+++ b/manuscrit/biblio/bench_suites.bib
@@ -45,3 +45,14 @@ keywords = {program characterization, dynamic analysis, code optimization},
 location = {Montr\'{e}al, QC, Canada},
 series = {CGO 2023}
 }
+
+@inproceedings{bench:rodinia,
+  author    = {Che, Shuai and Boyer, Michael and Meng, Jiayuan and Tarjan,
+               David and Sheaffer, Jeremy W. and Lee, Sang-Ha and Skadron,
+               Kevin},
+  booktitle = {2009 IEEE International Symposium on Workload Characterization
+               (IISWC)},
+  title     = {Rodinia: A benchmark suite for heterogeneous computing},
+  year      = {2009},
+  pages     = {44-54},
+  doi       = {10.1109/IISWC.2009.5306797}
+}
diff --git a/manuscrit/include/macros.tex b/manuscrit/include/macros.tex
index 0c2f98f..2494c42 100644
--- a/manuscrit/include/macros.tex
+++ b/manuscrit/include/macros.tex
@@ -44,6 +44,7 @@
 \newcommand{\bhive}{\texttt{BHive}}
 \newcommand{\anica}{\texttt{AnICA}}
 \newcommand{\cesasme}{\texttt{CesASMe}}
+\newcommand{\benchsuitebb}{\texttt{benchsuite-bb}}
 
 \newcommand{\gdb}{\texttt{gdb}}
diff --git a/manuscrit/include/packages.tex b/manuscrit/include/packages.tex
index bcce0f1..d9b49dc 100644
--- a/manuscrit/include/packages.tex
+++ b/manuscrit/include/packages.tex
@@ -26,6 +26,7 @@
 \usepackage{wrapfig}
 \usepackage{float}
 \usepackage{tikz}
+\usepackage{algpseudocode}
 \usepackage[bottom]{footmisc} % footnotes are below floats
 \usepackage[final]{microtype}
@@ -85,3 +86,7 @@
 \newfloat{lstfloat}{htbp}{lop}
 \floatname{lstfloat}{Listing}
 \def\lstfloatautorefname{Listing}
+
+\newfloat{algorithm}{htbp}{lop}
+\floatname{algorithm}{Algorithm}
+\def\algorithmautorefname{Algorithm}