% Bibliography database.
%
% Conventions used in this file:
%   - one field per line, values delimited by braces;
%   - months use the standard three-letter macros, unquoted (jan, feb, ...);
%   - page ranges use a double hyphen (12--34);
%   - doi fields hold the bare DOI, without any resolver prefix;
%   - corporate authors are wrapped in an extra pair of braces;
%   - acronyms and proper nouns in titles are brace-protected.
%
% Text outside of entries (such as this header) is ignored by BibTeX.

@phdthesis{phd:gruber,
  author      = {Gruber, Fabian},
  title       = {Performance Debugging Toolbox for Binaries: Sensitivity Analysis and Dependence Profiling},
  school      = {Université Grenoble Alpes},
  year        = {2019},
  note        = {2019GREAM071},
  url         = {http://www.theses.fr/2019GREAM071},
  documenturl = {http://www.theses.fr/2019GREAM071/document},
}

@article{kendalltau,
  author    = {Kendall, Maurice G.},
  title     = {A new measure of rank correlation},
  journal   = {Biometrika},
  volume    = {30},
  number    = {1/2},
  pages     = {81--93},
  year      = {1938},
  publisher = {JSTOR},
}

@manual{ref:amd_zen4_optim_manual,
  title        = {Software Optimization Guide for the {AMD} {Zen4} Microarchitecture},
  organization = {Advanced Micro Devices (AMD)},
  year         = {2023},
  month        = jan,
  note         = {Publication number 57647},
}

@manual{ref:intel64_architectures_optim_reference_vol1,
  title        = {{Intel®} 64 and {IA-32} Architectures Optimization Reference Manual Volume 1},
  organization = {Intel Corporation},
  year         = {2023},
  month        = sep,
}

@manual{ref:intel64_software_dev_reference_vol1,
  title        = {{Intel®} 64 and {IA-32} Architectures Software Developer’s Manual, volume 1},
  organization = {Intel Corporation},
  year         = {2023},
  month        = jun,
}

@manual{ref:amd64_architecture_dev_reference_vol2,
  title        = {{AMD64} Architecture Programmer’s Manual, volume 2},
  organization = {AMD},
  year         = {2023},
  month        = jun,
}

@incollection{grid5000,
  author    = {Balouek, Daniel and Carpen Amarie, Alexandra and Charrier, Ghislain and Desprez, Fr{\'e}d{\'e}ric and Jeannot, Emmanuel and Jeanvoine, Emmanuel and L{\`e}bre, Adrien and Margery, David and Niclausse, Nicolas and Nussbaum, Lucas and Richard, Olivier and P{\'e}rez, Christian and Quesnel, Flavien and Rohr, Cyril and Sarzyniec, Luc},
  title     = {Adding Virtualization Capabilities to the {Grid'5000} Testbed},
  booktitle = {Cloud Computing and Services Science},
  editor    = {Ivanov, Ivan I. and van Sinderen, Marten and Leymann, Frank and Shan, Tony},
  series    = {Communications in Computer and Information Science},
  volume    = {367},
  pages     = {3--20},
  publisher = {Springer International Publishing},
  year      = {2013},
  isbn      = {978-3-319-04518-4},
  doi       = {10.1007/978-3-319-04519-1_1},
}

@article{hcluster_ward,
  author    = {Ward, Joe H.},
  title     = {Hierarchical Grouping to Optimize an Objective Function},
  journal   = {Journal of the American Statistical Association},
  volume    = {58},
  number    = {301},
  pages     = {236--244},
  year      = {1963},
  publisher = {[American Statistical Association, Taylor \& Francis, Ltd.]},
  issn      = {0162-1459},
  url       = {http://www.jstor.org/stable/2282967},
  urldate   = {2023-09-15},
  abstract  = {A procedure for forming hierarchical groups of mutually exclusive subsets, each of which has members that are maximally similar with respect to specified characteristics, is suggested for use in large-scale ($n > 100$) studies when a precise optimal solution for a specified number of groups is not practical. Given n sets, this procedure permits their reduction to n - 1 mutually exclusive sets by considering the union of all possible n(n - 1)/2 pairs and selecting a union having a maximal value for the functional relation, or objective function, that reflects the criterion chosen by the investigator. By repeating this process until only one group remains, the complete hierarchical structure and a quantitative estimate of the loss associated with each stage in the grouping can be obtained. A general flowchart helpful in computer programming and a numerical example are included.},
}

@article{hcluster_silhouette,
  author   = {Rousseeuw, Peter J.},
  title    = {Silhouettes: A graphical aid to the interpretation and validation of cluster analysis},
  journal  = {Journal of Computational and Applied Mathematics},
  volume   = {20},
  pages    = {53--65},
  year     = {1987},
  issn     = {0377-0427},
  doi      = {10.1016/0377-0427(87)90125-7},
  url      = {https://www.sciencedirect.com/science/article/pii/0377042787901257},
  keywords = {Graphical display, cluster analysis, clustering validity, classification},
  abstract = {A new graphical display is proposed for partitioning techniques. Each cluster is represented by a so-called silhouette, which is based on the comparison of its tightness and separation. This silhouette shows which objects lie well within their cluster, and which ones are merely somewhere in between clusters. The entire clustering is displayed by combining the silhouettes into a single plot, allowing an appreciation of the relative quality of the clusters and an overview of the data configuration. The average silhouette width provides an evaluation of clustering validity, and might be used to select an ‘appropriate’ number of clusters.},
}

@misc{a72_doc,
  author       = {{ARM}},
  title        = {{Cortex-A72}},
  howpublished = {\url{https://developer.arm.com/Processors/Cortex-A72}},
}

@manual{ref:a72_optim,
  title        = {{Cortex-A72} Software Optimization Guide},
  organization = {ARM},
  year         = {2015},
  month        = mar,
}

@misc{agnerfog_skl_front4,
  author       = {Fog, Agner},
  title        = {Discussion on blogpost},
  year         = {2016},
  howpublished = {\url{https://www.agner.org/optimize/blog/read.php?i=581}},
}

@inproceedings{fugaku_arm,
  author    = {Matsuoka, Satoshi},
  title     = {{Fugaku} and {A64FX}: the First Exascale Supercomputer and its Innovative {Arm} {CPU}},
  booktitle = {2021 Symposium on VLSI Circuits},
  pages     = {1--3},
  year      = {2021},
  doi       = {10.23919/VLSICircuits52068.2021.9492415},
}

@misc{fugaku_top500,
  author       = {{Fujitsu Limited}},
  title        = {Supercomputer {Fugaku} retains first place worldwide in {HPCG} and {Graph500} rankings},
  year         = {2023},
  month        = may,
  howpublished = {\url{https://www.fujitsu.com/global/about/resources/news/press-releases/2022/1115-01.html}},
}

@misc{marenostrum4_arm,
  author       = {{Barcelona Supercomputing Center}},
  title        = {Technical information on the {MareNostrum} 4 supercomputer's {ARM} cluster},
  year         = {2020},
  howpublished = {\url{https://www.bsc.es/innovation-and-services/technical-information-cte-arm}},
}

@misc{arm_mobile,
  author       = {Haas, Rene},
  title        = {Together, we are building the future of computing, on {Arm}},
  organization = {ARM},
  year         = {2023},
  month        = sep,
  howpublished = {\url{https://www.arm.com/company/news/2023/09/building-the-future-of-computing-on-arm}},
}

@misc{wikichip_intel_rob_size,
  author       = {{WikiChip}},
  title        = {{Intel} Details {Golden Cove}: Next-Generation Big Core For Client and Server {SoCs}},
  year         = {2021},
  month        = aug,
  howpublished = {\url{https://fuse.wikichip.org/news/6111/intel-details-golden-cove-next-generation-big-core-for-client-and-server-socs/}},
}

@misc{rowmajor_repo,
  author       = {Bastian, Théophile},
  title        = {Rowmajor vs. colmajor experiments},
  year         = {2023},
  month        = oct,
  howpublished = {\url{https://gitlab.inria.fr/tbastian/rowmajor-measure}},
}

@misc{dgemm_finetune,
  author        = {Bondhugula, Uday},
  title         = {High Performance Code Generation in {MLIR}: An Early Case Study with {GEMM}},
  year          = {2020},
  eprint        = {2003.00532},
  archivePrefix = {arXiv},
  primaryClass  = {cs.PF},
}

@misc{elf_tis,
  author = {{TIS Committee}},
  title  = {Tool interface standard ({TIS}) executable and linking format ({ELF}) specification version 1.2},
  year   = {1995},
  month  = may,
}

@techreport{riscv_isa,
  author      = {Waterman, Andrew and Lee, Yunsup and Patterson, David A. and Asanovic, Krste},
  title       = {The {RISC-V} Instruction Set Manual, Volume {I}: Base User-Level {ISA}},
  institution = {EECS Department, UC Berkeley},
  number      = {UCB/EECS-2011-62},
  year        = {2011},
}

@inproceedings{filippo_riscv_vector,
  author    = {Mantovani, Filippo and Vizcaino, Pablo and Banchelli, Fabio and Garcia-Gasulla, Marta and Ferrer, Roger and Ieronymakis, Georgios and Dimou, Nikolaos and Papaefstathiou, Vassilis and Labarta, Jesus},
  title     = {Software Development Vehicles to enable extended and early co-design: a {RISC-V} and {HPC} case of study},
  booktitle = {International Conference on High Performance Computing},
  pages     = {526--537},
  year      = {2023},
  publisher = {Springer},
}

@misc{filippo_acaces23,
  author       = {Mantovani, Filippo},
  year         = {2023},
  month        = jul,
  howpublished = {Private communication during the ACACES summer school},
}

@inproceedings{dead_uops,
  author    = {Ren, Xida and Moody, Logan and Taram, Mohammadkazem and Jordan, Matthew and Tullsen, Dean M. and Venkat, Ashish},
  title     = {{I} See Dead {µops}: Leaking Secrets via {Intel/AMD} Micro-Op Caches},
  booktitle = {2021 ACM/IEEE 48th Annual International Symposium on Computer Architecture (ISCA)},
  pages     = {361--374},
  year      = {2021},
  keywords  = {Program processors;Microarchitecture;Computer architecture;Timing;System-on-chip;Transient analysis},
  doi       = {10.1109/ISCA52012.2021.00036},
}

@article{Vishnekov_2021,
  author    = {Vishnekov, A. V. and Ivanova, E. M. and Stepanov, N. A. and Shaimov, N. D.},
  title     = {A Simulation Model for Macro- and Micro-Fusion Algorithms in the {CPU} Core},
  journal   = {Journal of Physics: Conference Series},
  volume    = {1740},
  number    = {1},
  pages     = {012053},
  year      = {2021},
  month     = jan,
  publisher = {IOP Publishing},
  doi       = {10.1088/1742-6596/1740/1/012053},
  url       = {https://dx.doi.org/10.1088/1742-6596/1740/1/012053},
  abstract  = {The article discusses the features of modern processor’s microarchitecture, the method of instruction’s and micro-operation’s accelerated execution. The research focuses on the organization of the decoding stage in the CPU core pipeline and Macro- and Micro-fusion algorithms. The Macro- and Micro-fusion mechanisms are defined. A computer simulator has been developed to explore these mechanisms. The developed software has a user-friendly interface, is easy to use, and combines training and research options. The computer simulator demonstrates the sequence of mechanism’s implementation; the resulting macro-or microoperations set after Macro- and Micro-fusion, and also reflects each algorithm features for different processor’s families. The software allows you to use either a pre-prepared file with Assembler (x86) code fragments as source data, or enter/change the source code fragments at your request. The main combinations of machine instructions that can be fused into a single macro-operation are considered, as well as instructions that can be decoded into fused micro-operations. The simulator can be useful both for in Computer Science \& Engineering students, especially for on-line education and for researchers and General-purpose CPU cores developers.},
}

@inproceedings{branch_pred_penalty,
  author    = {Eyerman, S. and Smith, J. E. and Eeckhout, L.},
  title     = {Characterizing the branch misprediction penalty},
  booktitle = {2006 IEEE International Symposium on Performance Analysis of Systems and Software},
  pages     = {48--58},
  year      = {2006},
  keywords  = {Pipelines;Delay;Performance analysis;Impedance;Length measurement;Clocks;Analytical models;Time measurement;Data analysis},
  doi       = {10.1109/ISPASS.2006.1620789},
}

@misc{hardware_loops_patent,
  author = {Singh, Ravi P. and Roth, Charles P. and Overkamp, Gregory A.},
  title  = {Hardware loops},
  year   = {2004},
  month  = jun # "~8",
  note   = {US Patent 6,748,523},
}

@misc{kavvadias2007hardware,
  author = {Kavvadias, Nikolaos},
  title  = {Hardware looping unit},
  year   = {2007},
}

@inproceedings{talla2001hwloops,
  author    = {Talla, D. and John, L. K.},
  title     = {Cost-effective hardware acceleration of multimedia applications},
  booktitle = {Proceedings 2001 IEEE International Conference on Computer Design: VLSI in Computers and Processors. ICCD 2001},
  pages     = {415--424},
  year      = {2001},
  keywords  = {Hardware;Acceleration;Computer aided instruction;Streaming media;Parallel processing;Throughput;Concurrent computing;Application software;Microprocessors;Feeds},
  doi       = {10.1109/ICCD.2001.955060},
}

@article{points_to,
  author     = {Emami, Maryam and Ghiya, Rakesh and Hendren, Laurie J.},
  title      = {Context-sensitive interprocedural points-to analysis in the presence of function pointers},
  journal    = {SIGPLAN Not.},
  volume     = {29},
  number     = {6},
  pages      = {242--256},
  numpages   = {15},
  year       = {1994},
  month      = jun,
  issue_date = {June 1994},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0362-1340},
  doi        = {10.1145/773473.178264},
  url        = {https://doi.org/10.1145/773473.178264},
  abstract   = {This paper reports on the design, implementation, and empirical results of a new method for dealing with the aliasing problem in C. The method is based on approximating the points-to relationships between accessible stack locations, and can be used to generate alias pairs, or used directly for other analyses and transformations. Our method provides context-sensitive interprocedural information based on analysis over invocation graphs that capture all calling contexts including recursive and mutually-recursive calling contexts. Furthermore, the method allows the smooth integration for handling general function pointers in C. We illustrate the effectiveness of the method with empirical results from an implementation in the McCAT optimizing/parallelizing C compiler.},
}