author = {Wang, Qian and Zhang, Xianyi and Zhang, Yunquan and Yi, Qing},
title = {AUGEM: Automatically Generate High Performance Dense Linear Algebra Kernels on X86 CPUs},
year = {2013},
isbn = {9781450323789},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/2503210.2503219},
doi = {10.1145/2503210.2503219},
abstract = {Basic Liner algebra subprograms (BLAS) is a fundamental library in scientific computing. In this paper, we present a template-based optimization framework, AUGEM, which can automatically generate fully optimized assembly code for several dense linear algebra (DLA) kernels, such as GEMM, GEMV, AXPY and DOT, on varying multi-core CPUs without requiring any manual interference from developers. In particular, based on domain-specific knowledge about algorithms of the DLA kernels, we use a collection of parameterized code templates to formulate a number of commonly occurring instruction sequences within the optimized low-level C code of these DLA kernels. Then, our framework uses a specialized low-level C optimizer to identify instruction sequences that match the pre-defined code templates and thereby translates them into extremely efficient SSE/AVX instructions. The DLA kernels generated by our template-based approach surpass the implementations of Intel MKL and AMD ACML BLAS libraries, on both Intel Sandy Bridge and AMD Piledriver processors.},
booktitle = {Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis},
articleno = {25},
numpages = {12},
keywords = {auto-tuning, code generation, DLA code optimization},