Parametric frontend: add Fabrice's suggestions
This commit is contained in:
parent
7dc4ec9935
commit
8c0e5e4710
2 changed files with 49 additions and 13 deletions
|
@ -92,10 +92,10 @@ may prove to be a huge frontend slowdown, especially when such instructions
|
|||
cross an instruction cache line boundary~\cite{uica}.
|
||||
|
||||
Processors implementing ISAs subject to decoding bottlenecks typically also
|
||||
feature a decoded \uop{} cache. The typical hit rate of this cache is about
|
||||
80\%~\cites[Section
|
||||
B.5.7.2]{ref:intel64_software_dev_reference_vol1}{dead_uops}. However,
|
||||
code analyzers are concerned with loops and, more generally, hot code portions.
|
||||
feature a decoded \uop{} cache, or \emph{decoded stream buffer} (DSB). The
|
||||
typical hit rate of this cache is about 80\%~\cites[Section
|
||||
B.5.7.2]{ref:intel64_software_dev_reference_vol1}{dead_uops}. However, code
|
||||
analyzers are concerned with loops and, more generally, hot code portions.
|
||||
Under such conditions, we expect this cache, once hot in steady-state, to be
|
||||
very close to a 100\% hit rate. In this case, only the dispatch throughput will
|
||||
be limiting, and modeling the decoding bottlenecks becomes irrelevant.
|
||||
|
@ -109,12 +109,30 @@ be investigated if the model does not reach the expected accuracy.
|
|||
|
||||
\begin{itemize}
|
||||
|
||||
\item{} Intel CPUs use a Loop Stream Detector (LSD) to keep
|
||||
in the decode queue a whole loop's body of \uops{} if the frontend detects that a
|
||||
\item{} We introduced the DSB (\uop{} cache) just above. This model
|
||||
considers that the DSB will never be the cause of a bottleneck and
|
||||
that, instead, the number of dispatched \uops{} per cycle will always
|
||||
be the bottleneck first. This might not be true, as DSBs are complex in
|
||||
themselves already~\cite{uica}.
|
||||
|
||||
\item{} Intel CPUs use a Loop Stream Detector (LSD) to keep in the decode
|
||||
queue a whole loop's body of \uops{} if the frontend detects that a
|
||||
small enough loop is repeated~\cite{uica, dead_uops}. In this case,
|
||||
\uops{} are repeatedly streamed from the decode queue, without even the
|
||||
necessity to hit a cache. We are unaware of
|
||||
other architectures with such a feature.
|
||||
necessity to hit a cache. We are unaware of similar features in other
|
||||
commercial processors. In embedded programming, however, \emph{hardware
|
||||
loops} --~which are set up explicitly by the programmer~-- achieve,
|
||||
among other things, the same goal~\cite{hardware_loops_patent}.
|
||||
|
||||
\item{} The \emph{branch predictor} of a CPU is responsible for guessing,
|
||||
before the actual logic is computed, whether a conditional jump will be
|
||||
taken. A misprediction forces the frontend to re-populate its queues
|
||||
with instructions from the branch actually taken and typically stalls
|
||||
the pipeline for several cycles~\cite{branch_pred_penalty}. Our model,
|
||||
however, does not include a branch predictor for much the same reason
|
||||
that it does not include complex decoders: in steady-state, in a hot
|
||||
code portion, we expect the branch predictor to always predict
|
||||
correctly.
|
||||
|
||||
\item{} In reality, there is an intermediary step between instructions and
|
||||
\uops{}: macro-ops. Although it serves a designing and semantic
|
||||
|
|
|
@ -114,8 +114,8 @@
|
|||
|
||||
@INPROCEEDINGS{fugaku_arm,
|
||||
author={Matsuoka, Satoshi},
|
||||
booktitle={2021 Symposium on VLSI Circuits},
|
||||
title={Fugaku and A64FX: the First Exascale Supercomputer and its Innovative Arm CPU},
|
||||
booktitle={2021 Symposium on VLSI Circuits},
|
||||
title={Fugaku and A64FX: the First Exascale Supercomputer and its Innovative Arm CPU},
|
||||
year={2021},
|
||||
volume={},
|
||||
number={},
|
||||
|
@ -165,7 +165,7 @@
|
|||
|
||||
@misc{dgemm_finetune,
|
||||
title={High Performance Code Generation in MLIR: An Early Case Study
|
||||
with GEMM},
|
||||
with GEMM},
|
||||
author={Uday Bondhugula},
|
||||
year={2020},
|
||||
eprint={2003.00532},
|
||||
|
@ -206,8 +206,8 @@
|
|||
|
||||
@inproceedings{dead_uops,
|
||||
author={Ren, Xida and Moody, Logan and Taram, Mohammadkazem and Jordan, Matthew and Tullsen, Dean M. and Venkat, Ashish},
|
||||
booktitle={2021 ACM/IEEE 48th Annual International Symposium on Computer Architecture (ISCA)},
|
||||
title={I See Dead µops: Leaking Secrets via Intel/AMD Micro-Op Caches},
|
||||
booktitle={2021 ACM/IEEE 48th Annual International Symposium on Computer Architecture (ISCA)},
|
||||
title={I See Dead µops: Leaking Secrets via Intel/AMD Micro-Op Caches},
|
||||
year={2021},
|
||||
volume={},
|
||||
number={},
|
||||
|
@ -230,3 +230,21 @@
|
|||
abstract = {The article discusses the features of modern processor’s microarchitecture, the method of instruction’s and micro-operation’s accelerated execution. The research focuses on the organization of the decoding stage in the CPU core pipeline and Macro- and Micro-fusion algorithms. The Macro- and Micro-fusion mechanisms are defined. A computer simulator has been developed to explore these mechanisms. The developed software has a user-friendly interface, is easy to use, and combines training and research options. The computer simulator demonstrates the sequence of mechanism’ s implementation; the resulting macro-or microoperations set after Macro- and Micro-fusion, and also reflects each algorithm features for different processor’s families. The software allows you to use either a pre-prepared file with Assembler (x86) code fragments as source data, or enter/change the source code fragments at your request. The main combinations of machine instructions that can be fused into a single macro-operation are considered, as well as instructions that can be decoded into fused micro-operations. The simulator can be useful both for in Computer Science & Engineering students, especially for on-line education and for researchers and General-purpose CPU cores developers.}
|
||||
}
|
||||
|
||||
@inproceedings{branch_pred_penalty,
  author    = {Eyerman, S. and Smith, J. E. and Eeckhout, L.},
  title     = {Characterizing the branch misprediction penalty},
  booktitle = {2006 IEEE International Symposium on Performance Analysis of Systems and Software},
  year      = {2006},
  pages     = {48--58},
  keywords  = {Pipelines;Delay;Performance analysis;Impedance;Length measurement;Clocks;Analytical models;Time measurement;Data analysis},
  doi       = {10.1109/ISPASS.2006.1620789},
}
|
||||
|
||||
@misc{hardware_loops_patent,
  author = {Singh, Ravi P. and Roth, Charles P. and Overkamp, Gregory A.},
  title  = {Hardware loops},
  year   = {2004},
  month  = jun # "~8",
  note   = {US Patent 6,748,523},
}
|
||||
|
|
Loading…
Reference in a new issue