diff --git a/manuscrit/10_introduction/main.tex b/manuscrit/10_introduction/main.tex
index 1a5c205..9c927c5 100644
--- a/manuscrit/10_introduction/main.tex
+++ b/manuscrit/10_introduction/main.tex
@@ -56,8 +56,12 @@ slower than the former~\cite{rowmajor_repo}.
 
 This, however, is still an optimization that holds for the vast majority of
 CPUs. In many cases, transformations targeting a specific microarchitecture can
-be very beneficial. \qtodo{Insert number/ref \wrt{} matmult or some kernel of
-the like.} This kind of optimizations, however, requires manual effort, and a
+be very beneficial.
+For instance, Uday Bondhugula found that manually tuning a general matrix
+multiplication, using a variety of techniques and tools, could yield code
+roughly 13.5 times faster than \texttt{gcc~-O3}, and even 130 times faster
+than \texttt{clang~-O3}~\cite{dgemm_finetune}.
+This kind of optimization, however, requires manual effort and
 deep expert knowledge both in optimization techniques and on the specific
 architecture targeted.
 These techniques are only worth applying on the parts of a program that are
diff --git a/manuscrit/biblio/misc.bib b/manuscrit/biblio/misc.bib
index 35ccd2e..1139896 100644
--- a/manuscrit/biblio/misc.bib
+++ b/manuscrit/biblio/misc.bib
@@ -148,3 +148,13 @@
     month=10,
     howpublished={\url{https://gitlab.inria.fr/tbastian/rowmajor-measure}},
 }
+
+@misc{dgemm_finetune,
+    title={High Performance Code Generation in MLIR: An Early Case Study
+           with GEMM},
+    author={Uday Bondhugula},
+    year={2020},
+    eprint={2003.00532},
+    archivePrefix={arXiv},
+    primaryClass={cs.PF},
+}