From 8c0e5e47101c719c8c6092b3787be115c0ba823e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?= Date: Tue, 18 Jun 2024 12:06:42 +0200 Subject: [PATCH] Parametric frontend: add Fabrice's suggestions --- manuscrit/40_A72-frontend/50_future_works.tex | 34 ++++++++++++++----- manuscrit/biblio/misc.bib | 28 ++++++++++++--- 2 files changed, 49 insertions(+), 13 deletions(-) diff --git a/manuscrit/40_A72-frontend/50_future_works.tex b/manuscrit/40_A72-frontend/50_future_works.tex index 0c36de1..6b59170 100644 --- a/manuscrit/40_A72-frontend/50_future_works.tex +++ b/manuscrit/40_A72-frontend/50_future_works.tex @@ -92,10 +92,10 @@ may prove to be a huge frontend slowdown, especially when such instructions cross an instruction cache line boundary~\cite{uica}. Processors implementing ISAs subject to decoding bottleneck typically also -feature a decoded \uop{} cache. The typical hit rate of this cache is about -80\%~\cites[Section -B.5.7.2]{ref:intel64_software_dev_reference_vol1}{dead_uops}. However, -code analyzers are concerned with loops and, more generally, hot code portions. +feature a decoded \uop{} cache, or \emph{decoded stream buffer} (DSB). The +typical hit rate of this cache is about 80\%~\cites[Section +B.5.7.2]{ref:intel64_software_dev_reference_vol1}{dead_uops}. However, code +analyzers are concerned with loops and, more generally, hot code portions. Under such conditions, we expect this cache, once hot in steady-state, to be very close to a 100\% hit rate. In this case, only the dispatch throughput will be limiting, and modeling the decoding bottlenecks becomes irrelevant. @@ -109,12 +109,30 @@ be investigated if the model does not reach the expected accuracy. \begin{itemize} - \item{} Intel CPUs use a Loop Stream Detector (LSD) to keep - in the decode queue a whole loop's body of \uops{} if the frontend detects that a + \item{} We introduced just above the DSB (\uop{} cache). 
This model + considers that the DSB will never be the cause of a bottleneck and + that, instead, the number of dispatched \uops{} per cycle will always + become the bottleneck first. This might not be true, as DSBs are complex in + themselves already~\cite{uica}. + + \item{} Intel CPUs use a Loop Stream Detector (LSD) to keep in the decode + queue a whole loop's body of \uops{} if the frontend detects that a small enough loop is repeated~\cite{uica, dead_uops}. In this case, \uops{} are repeatedly streamed from the decode queue, without even the - necessity to hit a cache. We are unaware of - other architectures with such a feature. + necessity to hit a cache. We are unaware of similar features in other + commercial processors. In embedded programming, however, \emph{hardware + loops} --~which are set up explicitly by the programmer~-- achieve, + among other things, the same goal~\cite{hardware_loops_patent}. + + \item{} The \emph{branch predictor} of a CPU is responsible for guessing, + before the actual logic is computed, whether a conditional jump will be + taken. A misprediction forces the frontend to re-populate its queues + with instructions from the branch actually taken and typically stalls + the pipeline for several cycles~\cite{branch_pred_penalty}. Our model, + however, does not include a branch predictor for much the same reason + that it does not include a complex decoder: in steady-state, in a hot + code portion, we expect the branch predictor to always predict + correctly. \item{} In reality, there is an intermediary step between instructions and \uops{}: macro-ops. 
Although it serves a designing and semantic diff --git a/manuscrit/biblio/misc.bib b/manuscrit/biblio/misc.bib index b5e8849..a37713a 100644 --- a/manuscrit/biblio/misc.bib +++ b/manuscrit/biblio/misc.bib @@ -114,8 +114,8 @@ @INPROCEEDINGS{fugaku_arm, author={Matsuoka, Satoshi}, - booktitle={2021 Symposium on VLSI Circuits}, - title={Fugaku and A64FX: the First Exascale Supercomputer and its Innovative Arm CPU}, + booktitle={2021 Symposium on VLSI Circuits}, + title={Fugaku and A64FX: the First Exascale Supercomputer and its Innovative Arm CPU}, year={2021}, volume={}, number={}, @@ -165,7 +165,7 @@ @misc{dgemm_finetune, title={High Performance Code Generation in MLIR: An Early Case Study - with GEMM}, + with GEMM}, author={Uday Bondhugula}, year={2020}, eprint={2003.00532}, @@ -206,8 +206,8 @@ @inproceedings{dead_uops, author={Ren, Xida and Moody, Logan and Taram, Mohammadkazem and Jordan, Matthew and Tullsen, Dean M. and Venkat, Ashish}, - booktitle={2021 ACM/IEEE 48th Annual International Symposium on Computer Architecture (ISCA)}, - title={I See Dead µops: Leaking Secrets via Intel/AMD Micro-Op Caches}, + booktitle={2021 ACM/IEEE 48th Annual International Symposium on Computer Architecture (ISCA)}, + title={I See Dead µops: Leaking Secrets via Intel/AMD Micro-Op Caches}, year={2021}, volume={}, number={}, @@ -230,3 +230,21 @@ abstract = {The article discusses the features of modern processor’s microarchitecture, the method of instruction’s and micro-operation’s accelerated execution. The research focuses on the organization of the decoding stage in the CPU core pipeline and Macro- and Micro-fusion algorithms. The Macro- and Micro-fusion mechanisms are defined. A computer simulator has been developed to explore these mechanisms. The developed software has a user-friendly interface, is easy to use, and combines training and research options. 
The computer simulator demonstrates the sequence of mechanism’ s implementation; the resulting macro-or microoperations set after Macro- and Micro-fusion, and also reflects each algorithm features for different processor’s families. The software allows you to use either a pre-prepared file with Assembler (x86) code fragments as source data, or enter/change the source code fragments at your request. The main combinations of machine instructions that can be fused into a single macro-operation are considered, as well as instructions that can be decoded into fused micro-operations. The simulator can be useful both for in Computer Science & Engineering students, especially for on-line education and for researchers and General-purpose CPU cores developers.} } +@inproceedings{branch_pred_penalty, + author={Eyerman, S. and Smith, J. E. and Eeckhout, L.}, + booktitle={2006 IEEE International Symposium on Performance Analysis of Systems and Software}, + title={Characterizing the branch misprediction penalty}, + year={2006}, + volume={}, + number={}, + pages={48--58}, + keywords={Pipelines;Delay;Performance analysis;Impedance;Length measurement;Clocks;Analytical models;Time measurement;Data analysis}, + doi={10.1109/ISPASS.2006.1620789}} + +@misc{hardware_loops_patent, + title={Hardware loops}, + author={Singh, Ravi P and Roth, Charles P and Overkamp, Gregory A}, + year={2004}, + month=jun # "~8", + note={US Patent 6,748,523} +}