Initial commit -- w4 status report
commit e02b750ff0
15 changed files with 1650 additions and 0 deletions
BIN tex/misc/w4_slices_resources/DIP.Fig10.png (new file, 29 KiB; binary file not shown)
BIN tex/misc/w4_slices_resources/RLR.Fig3.png (new file, 476 KiB; binary file not shown)
131 tex/misc/w4_slides.bib (new file)
@@ -0,0 +1,131 @@
@article{JTSE.2010.RRIP,
  author = {Jaleel, Aamer and Theobald, Kevin B. and Steely, Simon C. and Emer, Joel},
  title = {High Performance Cache Replacement Using Re-Reference Interval Prediction (RRIP)},
  year = {2010},
  issue_date = {June 2010},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  volume = {38},
  number = {3},
  issn = {0163-5964},
  url = {https://doi.org/10.1145/1816038.1815971},
  doi = {10.1145/1816038.1815971},
  abstract = {Practical cache replacement policies attempt to emulate optimal replacement by predicting the re-reference interval of a cache block. The commonly used LRU replacement policy always predicts a near-immediate re-reference interval on cache hits and misses. Applications that exhibit a distant re-reference interval perform badly under LRU. Such applications usually have a working set larger than the cache or have frequent bursts of references to non-temporal data (called scans). To improve the performance of such workloads, this paper proposes cache replacement using Re-reference Interval Prediction (RRIP). We propose Static RRIP (SRRIP) that is scan-resistant and Dynamic RRIP (DRRIP) that is both scan-resistant and thrash-resistant. Both RRIP policies require only 2 bits per cache block and easily integrate into existing LRU approximations found in modern processors. Our evaluations using PC games, multimedia, server and SPEC CPU2006 workloads on a single-core processor with a 2MB last-level cache (LLC) show that both SRRIP and DRRIP outperform LRU replacement on the throughput metric by an average of 4\% and 10\% respectively. Our evaluations with over 1000 multi-programmed workloads on a 4-core CMP with an 8MB shared LLC show that SRRIP and DRRIP outperform LRU replacement on the throughput metric by an average of 7\% and 9\% respectively. We also show that RRIP outperforms LFU, the state-of-the-art scan-resistant replacement algorithm to date. For the cache configurations under study, RRIP requires 2X less hardware than LRU and 2.5X less hardware than LFU.},
  journal = {SIGARCH Comput. Archit. News},
  month = jun,
  pages = {60--71},
  numpages = {12},
  keywords = {thrashing, shared cache, replacement, scan resistance}
}

@inproceedings{SYS.2021.RLR,
  author = {Sethumurugan, Subhash and Yin, Jieming and Sartori, John},
  booktitle = {2021 IEEE International Symposium on High-Performance Computer Architecture (HPCA)},
  title = {Designing a Cost-Effective Cache Replacement Policy using Machine Learning},
  year = {2021},
  pages = {291--303},
  doi = {10.1109/HPCA51647.2021.00033}
}

@article{MM.2004.ARC,
  author = {Megiddo, N. and Modha, D. S.},
  journal = {Computer},
  title = {Outperforming LRU with an adaptive replacement cache algorithm},
  year = {2004},
  volume = {37},
  number = {4},
  pages = {58--65},
  doi = {10.1109/MC.2004.1297303}
}

@article{QJPSE.2007.DIP,
  author = {Qureshi, Moinuddin K. and Jaleel, Aamer and Patt, Yale N. and Steely, Simon C. and Emer, Joel},
  title = {Adaptive Insertion Policies for High Performance Caching},
  year = {2007},
  issue_date = {May 2007},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  volume = {35},
  number = {2},
  issn = {0163-5964},
  url = {https://doi.org/10.1145/1273440.1250709},
  doi = {10.1145/1273440.1250709},
  abstract = {The commonly used LRU replacement policy is susceptible to thrashing for memory-intensive workloads that have a working set greater than the available cache size. For such applications, the majority of lines traverse from the MRU position to the LRU position without receiving any cache hits, resulting in inefficient use of cache space. Cache performance can be improved if some fraction of the working set is retained in the cache so that at least that fraction of the working set can contribute to cache hits. We show that simple changes to the insertion policy can significantly reduce cache misses for memory-intensive workloads. We propose the LRU Insertion Policy (LIP) which places the incoming line in the LRU position instead of the MRU position. LIP protects the cache from thrashing and results in close to optimal hit rate for applications that have a cyclic reference pattern. We also propose the Bimodal Insertion Policy (BIP) as an enhancement of LIP that adapts to changes in the working set while maintaining the thrashing protection of LIP. We finally propose a Dynamic Insertion Policy (DIP) to choose between BIP and the traditional LRU policy depending on which policy incurs fewer misses. The proposed insertion policies do not require any change to the existing cache structure, are trivial to implement, and have a storage requirement of less than two bytes. We show that DIP reduces the average MPKI of the baseline 1MB 16-way L2 cache by 21\%, bridging two-thirds of the gap between LRU and OPT.},
  journal = {SIGARCH Comput. Archit. News},
  month = jun,
  pages = {381--391},
  numpages = {11},
  keywords = {thrashing, set sampling, set dueling, replacement}
}

@inproceedings{GWHSZ.2014.CacheReplAsMDP-QLearning,
  author = {Gu, Jingxiong and Wang, Wei and Huang, Aiping and Shan, Hangguan and Zhang, Zhaoyang},
  booktitle = {2014 IEEE International Conference on Communications (ICC)},
  title = {Distributed cache replacement for caching-enable base stations in cellular networks},
  year = {2014},
  pages = {2648--2653},
  doi = {10.1109/ICC.2014.6883723}
}

@inproceedings{EHOFK.2020.IBM-LRUvsFIFO,
  author = {Ohad Eytan and Danny Harnik and Effi Ofer and Roy Friedman and Ronen Kat},
  title = {It{\textquoteright}s Time to Revisit {LRU} vs. {FIFO}},
  booktitle = {12th USENIX Workshop on Hot Topics in Storage and File Systems (HotStorage 20)},
  year = {2020},
  url = {https://www.usenix.org/conference/hotstorage20/presentation/eytan},
  publisher = {USENIX Association},
  month = jul
}

@inproceedings{YQZYR.2023.FIFOwithTwist,
  author = {Yang, Juncheng and Qiu, Ziyue and Zhang, Yazhuo and Yue, Yao and Rashmi, K. V.},
  title = {FIFO Can Be Better than LRU: The Power of Lazy Promotion and Quick Demotion},
  year = {2023},
  isbn = {9798400701955},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3593856.3595887},
  doi = {10.1145/3593856.3595887},
  abstract = {LRU has been the basis of cache eviction algorithms for decades, with a plethora of innovations on improving LRU's miss ratio and throughput. While it is well-known that FIFO-based eviction algorithms provide significantly better throughput and scalability, they lag behind LRU on miss ratio, thus, cache efficiency. We performed a large-scale simulation study using 5307 block and web cache workloads collected in the past two decades. We find that contrary to what common wisdom suggests, some FIFO-based algorithms, such as FIFO-Reinsertion (or CLOCK), are, in fact, more efficient (have a lower miss ratio) than LRU. Moreover, we find that quick demotion --- evicting most new objects very quickly --- is critical for cache efficiency. We show that when enhanced by quick demotion, not only can state-of-the-art algorithms be more efficient, a simple FIFO-based algorithm can outperform five complex state-of-the-art algorithms in terms of miss ratio.},
  booktitle = {Proceedings of the 19th Workshop on Hot Topics in Operating Systems},
  pages = {70--79},
  numpages = {10},
  location = {Providence, RI, USA},
  series = {HOTOS '23}
}

@inproceedings{CDKP.1994.TreadMarks,
  title = {TreadMarks: Distributed Shared Memory on Standard Workstations and Operating Systems},
  author = {Cox, A. L. and Dwarkadas, S. and Keleher, P. and Zwaenepoel, Willy},
  year = {1994},
  abstract = {TreadMarks is a distributed shared memory (DSM) system for standard Unix systems such as SunOS and Ultrix. This paper presents a performance evaluation of TreadMarks running on Ultrix using DECstation-5000/240's that are connected by a 100-Mbps switch-based ATM LAN and a 10-Mbps Ethernet. Our objective is to determine the efficiency of a user-level DSM implementation on commercially available workstations and operating systems. We achieved good speedups on the 8-processor ATM network for Jacobi (7.4), TSP (7.2), Quicksort (6.3), and ILINK (5.7). For a slightly modified version of Water from the SPLASH benchmark suite, we achieved only moderate speedups (4.0) due to the high communication and synchronization rate. Speedups decline on the 10-Mbps Ethernet (5.5 for Jacobi, 6.5 for TSP, 4.2 for Quicksort, 5.1 for ILINK, and 2.1 for Water), reflecting the bandwidth limitations of the Ethernet. These results support the contention that, with suitable networking technology, DSM is a viable technique for parallel computation on clusters of workstations. To achieve these speedups, TreadMarks goes to great lengths to reduce the amount of communication performed to maintain memory consistency. It uses a lazy implementation of release consistency, and it allows multiple concurrent writers to modify a page, reducing the impact of false sharing. Great care was taken to minimize communication overhead. In particular, on the ATM network, we used a standard low-level protocol, AAL3/4, bypassing the TCP/IP protocol stack. Unix communication overhead, however, remains the main obstacle in the way of better performance for programs like Water. Compared to the Unix communication overhead, memory management cost (both kernel and user level) is small and wire time is negligible.},
  url = {http://infoscience.epfl.ch/record/55805}
}

@article{ISS.1998.Millipede,
  title = {Thread migration and its applications in distributed shared memory systems},
  note = {Also published as Technion CS/LPCR Technical Report \#9603, July 1996},
  journal = {Journal of Systems and Software},
  volume = {42},
  number = {1},
  pages = {71--87},
  year = {1998},
  issn = {0164-1212},
  doi = {10.1016/S0164-1212(98)00008-9},
  url = {https://www.sciencedirect.com/science/article/pii/S0164121298000089},
  author = {Ayal Itzkovitz and Assaf Schuster and Lea Shalev},
  keywords = {Thread migration, Distributed shared memory, Load sharing, Virtual parallel machine},
  abstract = {In this paper we describe the way thread migration can be carried out in distributed shared memory (DSM) systems. We discuss the advantages of multi-threading in DSM systems and the importance of preemptive dynamic thread migration. The proposed solution is implemented in MILLIPEDE: an environment for parallel programming over a network of (personal) computers. MILLIPEDE implements a transparent computation migration mechanism: a mobile computation thread in a MILLIPEDE application can be suspended at almost any point during its lifetime and be resumed on another host. This mechanism can be used to better utilize system resources and improve performance by balancing the load and solving ping-pong situations of memory objects, and to provide the user ownership of his workstation. We describe how some of these are implemented in the MILLIPEDE system. MILLIPEDE, including its thread migration module, is fully implemented in user mode (currently on Windows-NT) using the standard operating system APIs.}
}

@inproceedings{de2000effect,
  title = {The effect of contention on the scalability of page-based software shared memory systems},
  author = {de Lara, Eyal and Lu, Honghui and Hu, Y. Charlie and Cox, Alan L. and Zwaenepoel, Willy},
  booktitle = {Languages, Compilers, and Run-Time Systems for Scalable Computers: 5th International Workshop, LCR 2000, Rochester, NY, USA, May 25--27, 2000, Selected Papers},
  pages = {155--169},
  year = {2000},
  organization = {Springer}
}
BIN tex/misc/w4_slides.pdf (new file; binary file not shown)
159 tex/misc/w4_slides.tex (new file)
@@ -0,0 +1,159 @@
\documentclass{beamer}
\usepackage[style=authortitle-comp]{biblatex}
\usepackage[export]{adjustbox}

\title{Progress Report: Cache Replacement, Application Performance, and Relations to DSM}
\author{Zhengyi Chen} % Amir?
\date{\today}

\addbibresource{w4_slides.bib}

\begin{document}
% Title page
\frame{\titlepage}

% Page 1
\begin{frame}
\frametitle{(Cache) Replacement Strategies}
\begin{itemize}
\item There has been significant development in (CPU) cache replacement strategies over the last few decades.
\item e.g., RRIP(++)\footcite{JTSE.2010.RRIP} and, more recently, various ML-\textit{derived} heuristics.
\item Studying adequate cache replacement strategies for distributed systems is also popular (though that line of work is more stagnant).
% There is a lack of translation between advancements in hardware and their efficacy in software.
% That said, this might be because they can afford machine learning techniques in dynamic replacement strategies at edge nodes...
\item There are many variables within each cached system (whether CPU cache, distributed FS, etc.) that affect which strategy is more \textit{efficient} in operation.
% A cached/distributed FS or CDN, for example, primarily captures frequency rather than recency.
% An operating system might juggle between both depending on the type of access -- Linux's LRU_GEN attempts to capture this difference between file descriptor
% accesses and plain stack/heap/text section accesses.
% The replacement problem for our kernel DSM is similar -- we want to capture the working set of all in-scope processes for each node in the system. The existence of
% swap space only complicates the matter:
% - Swap locally to a swap file?
% - Swap remotely to another node's memory, which our DSM might be able to do?
% - Swap remotely to another node's swap file?
% As Amir mentioned, there is also the question of speed -- the replacement algorithm needs to be fast enough for the system not to stall, though the problem
% of selecting a replacement algorithm may not (need to) be as time-sensitive?
\item Moreover, different applications (e.g., threads) exhibit different access patterns, which may be better served by one strategy than another.\footcite{SYS.2021.RLR}
\end{itemize}
\end{frame}

% Page 2
\begin{frame}
\frametitle{Notable (i.e., Encountered) Strategies}
\begin{itemize}
\item LRU family
\item FIFO family
\item Adaptive Replacement Cache
\item Intended for CPU LLCs: Dynamic Insertion Policy, Re-Reference Interval Prediction, Signature-based Hit Predictor, \dots
% RRIP is basically an M-bit LFU; see the sketch after this frame.
\item ML-derived: Reinforcement Learned Replacement, LeCaR, cache replacement as a Markov decision process\footcite{GWHSZ.2014.CacheReplAsMDP-QLearning},
\dots
\end{itemize}
\end{frame}
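
% A rough SRRIP sketch in Python for my own notes (illustrative only, not the
% paper's hardware; the class name and interface are mine). Each block keeps a
% 2-bit re-reference prediction value (RRPV): a hit predicts near-immediate
% re-reference (RRPV = 0), an insertion predicts a long interval (RRPV = 2),
% and the victim is any block that has aged to RRPV = 3.
%   RRPV_MAX = 3  # 2-bit counter saturates here
%
%   class SRRIPSet:
%       def __init__(self, ways):
%           self.ways = ways
%           self.rrpv = {}  # tag -> RRPV
%
%       def access(self, tag):
%           if tag in self.rrpv:               # hit: near-immediate re-reference
%               self.rrpv[tag] = 0
%               return True
%           if len(self.rrpv) == self.ways:
%               # age all blocks until one reaches RRPV_MAX, then evict it
%               while RRPV_MAX not in self.rrpv.values():
%                   for t in self.rrpv:
%                       self.rrpv[t] += 1
%               victim = next(t for t, v in self.rrpv.items() if v == RRPV_MAX)
%               del self.rrpv[victim]
%           self.rrpv[tag] = RRPV_MAX - 1      # insert with "long" prediction
%           return False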

% Page 3
\begin{frame}
\frametitle{Notable (i.e., Encountered) Strategies}
\begin{itemize}
\item The performance of replacement strategies correlates strongly with the context of their operation.
\item For example, LRU theoretically performs better than FIFO \textit{in their most textbook implementations}, but recent studies
\footcites{EHOFK.2020.IBM-LRUvsFIFO}{YQZYR.2023.FIFOwithTwist} have shown that FIFO can outperform LRU in practice (in CDNs, for example, where even cache
bookkeeping structures can be costly).
% Now it's probable that these papers unfairly pit a more state-of-the-art FIFO-esque algorithm against a less state-of-the-art LRU-esque one...
% In general:
\item To summarize, \textbf{the (dynamic) choice of replacement algorithm in any system is of practical concern!}
\end{itemize}
\end{frame}

% Page 4
\begin{frame}
\frametitle{LRU \& FIFO Family -- Patches and Applications}
\begin{itemize}
\item State-of-the-art implementations of LRU and FIFO are a far cry from their textbook counterparts.
% Also there are a LOT of them -- I can't find enough time to gather all of them right now.
\item This is so that they can capture both \emph{recency} and \emph{frequency}:
we want to use both to predict the \emph{re-reference interval} of a given entry.
\item e.g., Linux uses \texttt{LRU\_GEN}, a multi-queue LRU strategy wherein each queue (generation) represents a ``similar'' level of access recency and is evicted in batch.
\item The kernel developers wanted a \emph{fast and reasonably good} replacer as opposed to an optimal one.
% optimality and performance should both be considered when selecting replacement strategies.
\item Likewise, Yang et al.\footcite{YQZYR.2023.FIFOwithTwist} show that FIFO with \textit{lazy promotion} and \textit{quick demotion} outperforms textbook LRU (see the sketch after this frame).
% recall that FIFO can exploit spatial locality better than LRU, particularly in systems with slow data access!
% i.e., algorithm performance can be constrained by system topology.
\end{itemize}
% The documentation of LRU_GEN really shows that the developers wanted the strategy itself to decide fast (as opposed to merely deciding well): the strategy itself "[tries] to profit from spatial
% locality"
\end{frame}
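
% My reading of lazy promotion + quick demotion as a Python sketch (hedged: the
% probationary-queue split and all names here are my own simplification, not
% the paper's exact algorithm). New keys enter a small probationary FIFO and
% are demoted quickly unless re-used; hits in the main queue only set a bit
% (lazy promotion), and eviction grants each visited block one reinsertion
% (FIFO-Reinsertion, i.e., CLOCK-like).
%   from collections import OrderedDict
%
%   class LpQdFifo:
%       def __init__(self, cap, probation_frac=0.1):   # assumes cap >= 2
%           self.p_cap = max(1, int(cap * probation_frac))
%           self.m_cap = cap - self.p_cap
%           self.probation = OrderedDict()  # quick-demotion queue
%           self.main = OrderedDict()       # key -> visited bit
%
%       def access(self, key):
%           if key in self.main:
%               self.main[key] = True       # lazy promotion: just set a bit
%               return True
%           if key in self.probation:       # re-used while on probation:
%               del self.probation[key]     # promote into the main queue
%               self._insert_main(key)
%               return True
%           self.probation[key] = None
%           if len(self.probation) > self.p_cap:
%               self.probation.popitem(last=False)  # quick demotion
%           return False
%
%       def _insert_main(self, key):
%           while len(self.main) >= self.m_cap:
%               k, visited = self.main.popitem(last=False)
%               if visited:
%                   self.main[k] = False    # one reinsertion, bit cleared
%               else:
%                   break                   # k is evicted
%           self.main[key] = False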

% Page 5
\begin{frame}
\frametitle{\texttt{LRU\_GEN} and Access Patterns}
The \texttt{LRU\_GEN} algorithm specifically protects pages accessed through page tables (PT) more strongly than pages accessed through file descriptors (FD):
\begin{itemize}
\item Heap/stack/text access misses have higher cost -- the executable performs blocking I/O on a memory access, which is less likely for file access.
\item They are also more likely to miss, as their in-kernel dirty bits are approximated.
\item Finally, they can reasonably be assumed to be more likely to exhibit temporal locality.
\end{itemize}
Nevertheless, the algorithm is capable of dynamic adjustment on refaults -- \textbf{the data model of a program can be file-based or object-based}.
% Though as we know, files (i.e., blocks in which file data reside) are loaded into (kernel) memory and heap allocations can always be swapped out,
% so I guess object-based storage wins with fewer intermediate steps (e.g., filesystem calls), sans data protection.
The same algorithm can also deviate in fault rate across different programs on the same node.
% i.e., any good algorithm must be able to dynamically adapt to fault-rate feedback.
% However, we don't want to run them through any complex learner...
% A toy generational sketch follows this frame.
\end{frame}
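
% A minimal multi-generational sketch in Python (illustrative; the kernel's
% multi-generational LRU is far more involved, with tiers, PT walks, and
% refault feedback -- the class and method names here are mine). Pages are
% bucketed into generations; a touch moves a page to the youngest generation,
% and eviction drains the oldest generation in one batch.
%   from collections import deque
%
%   class GenLru:
%       def __init__(self, n_gens=4):
%           self.gens = deque(set() for _ in range(n_gens))  # gens[0] = oldest
%           self.where = {}                                  # page -> its gen set
%
%       def touch(self, page):
%           if page in self.where:
%               self.where[page].discard(page)   # leave the old generation
%           self.gens[-1].add(page)              # join the youngest generation
%           self.where[page] = self.gens[-1]
%
%       def evict_batch(self):
%           victims = self.gens.popleft()        # evict the whole oldest generation
%           for p in victims:
%               del self.where[p]
%           self.gens.append(set())              # open a fresh youngest generation
%           return victims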

% Page 6
\begin{frame}
\frametitle{Machine Learning as an Analytic Tool: RLR, etc.}
\begin{itemize}
\item Large distributed systems (e.g., CDNs) can afford to perform machine learning for cache replacement tasks
\footcite{GWHSZ.2014.CacheReplAsMDP-QLearning}: computation is much faster than I/O, so some cycles can be spared.
\item For page replacement in the kernel, we can't really afford to run anything costly (per Amir).
\item ML analysis\footcite{SYS.2021.RLR} shows how different (computation-intensive) programs exhibit distinct
access patterns.
\end{itemize}
\end{frame}

% Page 7
\begin{frame}
\frametitle{Machine Learning as an Analytic Tool: RLR, etc.}
\includegraphics[height=0.6\textheight, center]{w4_slices_resources/RLR.Fig3.png}
\footcite{SYS.2021.RLR}
P.S. \textit{preuse}: the number of accesses to a set since the last access to a given address/line.
\end{frame}

% Page 8
\begin{frame}
\frametitle{DSM, Applications, and Memory (Contention)}
The speedup of applications on DSM systems is negatively correlated with shared-memory contention.

Take TreadMarks\footcite{CDKP.1994.TreadMarks} for example:
\begin{itemize}
\item \textit{Jacobi} is a solver for linear systems of equations via the \textit{successive over-relaxation} method.
The memory access pattern should be map-reduce-like: the problem is parallelized with partial matrices for each node and immutable storage of the relevant matrices?
TreadMarks achieves a $\sim 7\times$ speedup on an 8-node system over a single node.
\item \textit{Water} is a parallel $N$-body molecular dynamics simulator that requires at least $O(\frac{N}{2})$ communications per processor.
TreadMarks only achieves a $\sim 4\times$ speedup, with around $47\%$ of the time spent on blocking communication.
\end{itemize}
\end{frame}

% Page 9
\begin{frame}
\frametitle{DSM, Applications, and Memory (Contention)}
\begin{itemize}
\item It is rather difficult to compare statistics across different DSM systems.
\item Even when the same programs are run, different parameters make for different program behavior w.r.t.\ contention, etc.
\item Logically speaking, the more contention there is on the same address, the less speedup is possible for the system\footcite{de2000effect}.
\item Should cache replacement strategies be aware of how contended a page may be, to prevent it from, e.g., being swapped out?
\end{itemize}
\end{frame}

% Page 10
\begin{frame}
\frametitle{Hardware-based Dynamic Strategy Selection: DIP}
Hardware-based replacement strategies can provide low-cost inspiration for software replacement strategies.

\includegraphics[height=0.6\textheight, center]{w4_slices_resources/DIP.Fig10.png}\footcite{QJPSE.2007.DIP}

Problem: how can this be scaled to more than two strategies?
% A sketch of the set-dueling mechanism follows this frame.
\end{frame}
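
% A hedged Python sketch of the set-dueling mechanism behind DIP (my reading of
% the figure, not the paper's hardware; names and parameters are mine). A few
% leader sets always run LRU, a few always run BIP, and a saturating counter
% PSEL tracks which leader group misses less; all follower sets adopt the
% current winner.
%   class DuelingSelector:
%       def __init__(self, n_leaders=32, bits=10):
%           self.lru_leaders = set(range(n_leaders))
%           self.bip_leaders = set(range(n_leaders, 2 * n_leaders))
%           self.psel_max = (1 << bits) - 1
%           self.psel = self.psel_max // 2       # start undecided
%
%       def on_miss(self, set_idx):
%           if set_idx in self.lru_leaders:      # LRU leader missed: favor BIP
%               self.psel = min(self.psel + 1, self.psel_max)
%           elif set_idx in self.bip_leaders:    # BIP leader missed: favor LRU
%               self.psel = max(self.psel - 1, 0)
%
%       def policy(self, set_idx):
%           if set_idx in self.lru_leaders:
%               return "LRU"
%           if set_idx in self.bip_leaders:
%               return "BIP"
%           return "BIP" if self.psel > self.psel_max // 2 else "LRU"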
\end{document}