I THOUGHT BACKGROUND WRITING WOULD BE EASY

Zhengyi Chen 2023-10-11 22:34:21 +01:00
parent e02b750ff0
commit 44805929f8
7 changed files with 103 additions and 2 deletions

tex/main.bib (new file, 131 lines)

@@ -0,0 +1,131 @@
@article{JTSE.2010.RRIP,
author = {Jaleel, Aamer and Theobald, Kevin B. and Steely, Simon C. and Emer, Joel},
title = {High Performance Cache Replacement Using Re-Reference Interval Prediction (RRIP)},
year = {2010},
issue_date = {June 2010},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {38},
number = {3},
issn = {0163-5964},
url = {https://doi.org/10.1145/1816038.1815971},
doi = {10.1145/1816038.1815971},
abstract = {Practical cache replacement policies attempt to emulate optimal replacement by predicting the re-reference interval of a cache block. The commonly used LRU replacement policy always predicts a near-immediate re-reference interval on cache hits and misses. Applications that exhibit a distant re-reference interval perform badly under LRU. Such applications usually have a working set larger than the cache or have frequent bursts of references to non-temporal data (called scans). To improve the performance of such workloads, this paper proposes cache replacement using Re-reference Interval Prediction (RRIP). We propose Static RRIP (SRRIP) that is scan-resistant and Dynamic RRIP (DRRIP) that is both scan-resistant and thrash-resistant. Both RRIP policies require only 2 bits per cache block and easily integrate into existing LRU approximations found in modern processors. Our evaluations using PC games, multimedia, server and SPEC CPU2006 workloads on a single-core processor with a 2MB last-level cache (LLC) show that both SRRIP and DRRIP outperform LRU replacement on the throughput metric by an average of 4\% and 10\% respectively. Our evaluations with over 1000 multi-programmed workloads on a 4-core CMP with an 8MB shared LLC show that SRRIP and DRRIP outperform LRU replacement on the throughput metric by an average of 7\% and 9\% respectively. We also show that RRIP outperforms LFU, the state-of-the-art scan-resistant replacement algorithm to date. For the cache configurations under study, RRIP requires 2X less hardware than LRU and 2.5X less hardware than LFU.},
journal = {SIGARCH Comput. Archit. News},
month = jun,
pages = {60--71},
numpages = {12},
keywords = {thrashing, shared cache, replacement, scan resistance}
}
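
The mechanism the abstract above describes is compact enough to sketch: each block carries a 2-bit re-reference prediction value (RRPV), hits predict near-immediate re-reference (RRPV 0), insertions predict a long interval, and eviction picks a block predicted for the distant future, ageing everyone if none exists. A minimal single-set SRRIP simulation in Python follows; the class and constant names are ours for illustration, not from the paper or its artifacts.

RRPV_MAX = 3          # 2 bits -> RRPVs 0..3
INSERT_RRPV = 2       # predict a "long" re-reference interval on insertion

class SRRIPSet:
    def __init__(self, ways):
        self.ways = ways
        self.blocks = {}                  # tag -> RRPV

    def access(self, tag):
        if tag in self.blocks:
            self.blocks[tag] = 0          # hit: predict near-immediate re-reference
            return True
        if len(self.blocks) >= self.ways:
            self._evict()
        self.blocks[tag] = INSERT_RRPV    # miss: insert with a long prediction
        return False

    def _evict(self):
        # Evict a block predicted for the distant future (RRPV == max);
        # if none exists, age every block and retry.
        while True:
            for tag, rrpv in self.blocks.items():
                if rrpv == RRPV_MAX:
                    del self.blocks[tag]
                    return
            for tag in self.blocks:
                self.blocks[tag] += 1

The scan resistance falls out of INSERT_RRPV: a burst of never-reused tags enters near the eviction threshold and is drained before it can displace blocks that have actually hit.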
@inproceedings{SYS.2021.RLR,
author={Sethumurugan, Subhash and Yin, Jieming and Sartori, John},
booktitle={2021 IEEE International Symposium on High-Performance Computer Architecture (HPCA)},
title={Designing a Cost-Effective Cache Replacement Policy using Machine Learning},
year={2021},
pages={291-303},
doi={10.1109/HPCA51647.2021.00033}
}
@article{MM.2004.ARC,
author={Megiddo, N. and Modha, D.S.},
journal={Computer},
title={Outperforming LRU with an adaptive replacement cache algorithm},
year={2004},
volume={37},
number={4},
pages={58-65},
doi={10.1109/MC.2004.1297303}
}
@article{QJPSE.2007.DIP,
author = {Qureshi, Moinuddin K. and Jaleel, Aamer and Patt, Yale N. and Steely, Simon C. and Emer, Joel},
title = {Adaptive Insertion Policies for High Performance Caching},
year = {2007},
issue_date = {May 2007},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {35},
number = {2},
issn = {0163-5964},
url = {https://doi.org/10.1145/1273440.1250709},
doi = {10.1145/1273440.1250709},
abstract = {The commonly used LRU replacement policy is susceptible to thrashing for memory-intensive workloads that have a working set greater than the available cache size. For such applications, the majority of lines traverse from the MRU position to the LRU position without receiving any cache hits, resulting in inefficient use of cache space. Cache performance can be improved if some fraction of the working set is retained in the cache so that at least that fraction of the working set can contribute to cache hits. We show that simple changes to the insertion policy can significantly reduce cache misses for memory-intensive workloads. We propose the LRU Insertion Policy (LIP), which places the incoming line in the LRU position instead of the MRU position. LIP protects the cache from thrashing and results in a close-to-optimal hit rate for applications that have a cyclic reference pattern. We also propose the Bimodal Insertion Policy (BIP) as an enhancement of LIP that adapts to changes in the working set while maintaining the thrashing protection of LIP. We finally propose a Dynamic Insertion Policy (DIP) to choose between BIP and the traditional LRU policy depending on which policy incurs fewer misses. The proposed insertion policies do not require any change to the existing cache structure, are trivial to implement, and have a storage requirement of less than two bytes. We show that DIP reduces the average MPKI of the baseline 1MB 16-way L2 cache by 21\%, bridging two-thirds of the gap between LRU and OPT.},
journal = {SIGARCH Comput. Archit. News},
month = jun,
pages = {381--391},
numpages = {11},
keywords = {thrashing, set sampling, set dueling, replacement}
}
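
The insertion policies this abstract names reduce to a few lines on top of an ordinary LRU list: LIP inserts new lines at the LRU position, and BIP inserts at the MRU position only with a small probability. A rough Python sketch follows; the class name and the epsilon value are ours for illustration (the paper's bimodal throttle parameter is its own).

import random
from collections import OrderedDict

class BIPCache:
    # Bimodal insertion on top of a plain LRU list; epsilon is illustrative.
    def __init__(self, ways, epsilon=1/32):
        self.ways = ways
        self.epsilon = epsilon
        self.lru = OrderedDict()          # front = LRU end, back = MRU end

    def access(self, tag):
        if tag in self.lru:
            self.lru.move_to_end(tag)     # hit: promote to MRU as usual
            return True
        if len(self.lru) >= self.ways:
            self.lru.popitem(last=False)  # evict from the LRU position
        self.lru[tag] = None              # new line arrives at the MRU end...
        if random.random() >= self.epsilon:
            # ...but BIP usually demotes it straight to the LRU position;
            # with epsilon == 0 this degenerates to LIP
            self.lru.move_to_end(tag, last=False)
        return False

DIP then set-duels the traditional LRU policy against BIP using a single saturating counter updated by misses in a few dedicated sample sets, and applies the winner to the remaining sets; that machinery is omitted here.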
@inproceedings{GWHSZ.2014.CacheReplAsMDP-QLearning,
author={Gu, Jingxiong and Wang, Wei and Huang, Aiping and Shan, Hangguan and Zhang, Zhaoyang},
booktitle={2014 IEEE International Conference on Communications (ICC)},
title={Distributed cache replacement for caching-enable base stations in cellular networks},
year={2014},
pages={2648-2653},
doi={10.1109/ICC.2014.6883723}
}
@inproceedings{EHOFK.2020.IBM-LRUvsFIFO,
author = {Ohad Eytan and Danny Harnik and Effi Ofer and Roy Friedman and Ronen Kat},
title = {It{\textquoteright}s Time to Revisit {LRU} vs. {FIFO}},
booktitle = {12th USENIX Workshop on Hot Topics in Storage and File Systems (HotStorage 20)},
year = {2020},
url = {https://www.usenix.org/conference/hotstorage20/presentation/eytan},
publisher = {USENIX Association},
month = jul
}
@inproceedings{YQZYR.2023.FIFOwithTwist,
author = {Yang, Juncheng and Qiu, Ziyue and Zhang, Yazhuo and Yue, Yao and Rashmi, K. V.},
title = {FIFO Can Be Better than LRU: The Power of Lazy Promotion and Quick Demotion},
year = {2023},
isbn = {9798400701955},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3593856.3595887},
doi = {10.1145/3593856.3595887},
abstract = {LRU has been the basis of cache eviction algorithms for decades, with a plethora of innovations on improving LRU's miss ratio and throughput. While it is well-known that FIFO-based eviction algorithms provide significantly better throughput and scalability, they lag behind LRU on miss ratio and thus cache efficiency. We performed a large-scale simulation study using 5307 block and web cache workloads collected in the past two decades. We find that, contrary to what common wisdom suggests, some FIFO-based algorithms, such as FIFO-Reinsertion (or CLOCK), are in fact more efficient (have a lower miss ratio) than LRU. Moreover, we find that quick demotion --- evicting most new objects very quickly --- is critical for cache efficiency. We show that when enhanced by quick demotion, not only can state-of-the-art algorithms be more efficient, but a simple FIFO-based algorithm can also outperform five complex state-of-the-art algorithms in terms of miss ratio.},
booktitle = {Proceedings of the 19th Workshop on Hot Topics in Operating Systems},
pages = {70--79},
numpages = {10},
location = {Providence, RI, USA},
series = {HOTOS '23}
}
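
The lazy-promotion idea in this abstract is easiest to see in FIFO-Reinsertion (CLOCK): a hit only sets a bit, and all reordering is deferred to eviction time, which is what makes the policy cheap and scalable compared with moving a list node on every LRU hit. A small Python sketch under our own naming, not the paper's released code:

from collections import deque

class FIFOReinsertion:
    def __init__(self, capacity):
        self.capacity = capacity
        self.fifo = deque()               # left = oldest
        self.visited = {}                 # tag -> hit bit

    def access(self, tag):
        if tag in self.visited:
            self.visited[tag] = True      # lazy promotion: just set a bit
            return True
        while len(self.fifo) >= self.capacity:
            old = self.fifo.popleft()
            if self.visited[old]:         # was hit: one more trip, bit cleared
                self.visited[old] = False
                self.fifo.append(old)
            else:
                del self.visited[old]     # never re-referenced: evict
        self.fifo.append(tag)
        self.visited[tag] = False
        return False

Quick demotion, the abstract's other ingredient, would sit in front of this queue as a small probationary FIFO that discards most objects before they ever enter the main queue; it is left out of the sketch.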
@inproceedings{CDKP.1994.TreadMarks,
title = {TreadMarks: Distributed Shared Memory on Standard Workstations and Operating Systems},
author = {Cox, A.L. and Dwarkadas, S. and Keleher, P. and Zwaenepoel, Willy},
year = {1994},
booktitle = {Proceedings of the Winter 1994 USENIX Conference},
abstract = {TreadMarks is a distributed shared memory (DSM) system for standard Unix systems such as SunOS and Ultrix. This paper presents a performance evaluation of TreadMarks running on Ultrix using DECstation-5000/240's that are connected by a 100-Mbps switch-based ATM LAN and a 10-Mbps Ethernet. Our objective is to determine the efficiency of a user-level DSM implementation on commercially available workstations and operating systems. We achieved good speedups on the 8-processor ATM network for Jacobi (7.4), TSP (7.2), Quicksort (6.3), and ILINK (5.7). For a slightly modified version of Water from the SPLASH benchmark suite, we achieved only moderate speedups (4.0) due to the high communication and synchronization rate. Speedups decline on the 10-Mbps Ethernet (5.5 for Jacobi, 6.5 for TSP, 4.2 for Quicksort, 5.1 for ILINK, and 2.1 for Water), reflecting the bandwidth limitations of the Ethernet. These results support the contention that, with suitable networking technology, DSM is a viable technique for parallel computation on clusters of workstations. To achieve these speedups, TreadMarks goes to great lengths to reduce the amount of communication performed to maintain memory consistency. It uses a lazy implementation of release consistency, and it allows multiple concurrent writers to modify a page, reducing the impact of false sharing. Great care was taken to minimize communication overhead. In particular, on the ATM network, we used a standard low-level protocol, AAL3/4, bypassing the TCP/IP protocol stack. Unix communication overhead, however, remains the main obstacle in the way of better performance for programs like Water. Compared to the Unix communication overhead, memory management cost (both kernel and user level) is small and wire time is negligible.},
url = {http://infoscience.epfl.ch/record/55805},
}
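
The multiple-writer scheme this abstract mentions can be illustrated with a twin-and-diff toy: before the first write, a pristine copy (twin) of the page is saved; at synchronization only the changed locations are exchanged, so concurrent writers to one page merge rather than conflict. The Python below is our own illustration under hypothetical function names, not TreadMarks code.

def make_twin(page: bytes) -> bytes:
    return bytes(page)                    # pristine copy taken at first write

def make_diff(twin: bytes, page: bytes):
    # (offset, new byte) pairs for every modified location
    return [(i, page[i]) for i in range(len(page)) if page[i] != twin[i]]

def apply_diff(page: bytearray, diff):
    for i, b in diff:
        page[i] = b

# Two writers touch disjoint parts of one page; merging the diffs avoids
# shipping (or invalidating) whole pages when sharing is false.
twin = make_twin(bytes(8))
p1, p2 = bytearray(twin), bytearray(twin)
p1[0], p2[5] = 7, 9
merged = bytearray(twin)
apply_diff(merged, make_diff(twin, bytes(p1)))
apply_diff(merged, make_diff(twin, bytes(p2)))
assert bytes(merged) == b"\x07\x00\x00\x00\x00\x09\x00\x00"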
@article{ISS.1998.Millipede,
title = {Thread migration and its applications in distributed shared memory systems},
note = {Technion CS/LPCR Technical Report \#9603, July 1996},
journal = {Journal of Systems and Software},
volume = {42},
number = {1},
pages = {71-87},
year = {1998},
issn = {0164-1212},
doi = {10.1016/S0164-1212(98)00008-9},
url = {https://www.sciencedirect.com/science/article/pii/S0164121298000089},
author = {Ayal Itzkovitz and Assaf Schuster and Lea Shalev},
keywords = {Thread migration, Distributed shared memory, Load sharing, Virtual parallel machine},
abstract = {In this paper we describe the way thread migration can be carried out in distributed shared memory (DSM) systems. We discuss the advantages of multi-threading in DSM systems and the importance of preemptive dynamic thread migration. The proposed solution is implemented in MILLIPEDE: an environment for parallel programming over a network of (personal) computers. MILLIPEDE implements a transparent computation-migration mechanism: a mobile computation thread in a MILLIPEDE application can be suspended at almost any point during its lifetime and resumed on another host. This mechanism can be used to better utilize system resources and improve performance by balancing the load and resolving ping-pong situations of memory objects, and to preserve users' ownership of their workstations. We describe how some of these mechanisms are implemented in the MILLIPEDE system. MILLIPEDE, including its thread migration module, is fully implemented in user mode (currently on Windows NT) using the standard operating system APIs.}
}
@inproceedings{de2000effect,
title={The effect of contention on the scalability of page-based software shared memory systems},
author={de Lara, Eyal and Lu, Honghui and Hu, Y. Charlie and Cox, Alan L. and Zwaenepoel, Willy},
booktitle={Languages, Compilers, and Run-Time Systems for Scalable Computers: 5th International Workshop, LCR 2000 Rochester, NY, USA, May 25--27, 2000 Selected Papers 5},
pages={155--169},
year={2000},
organization={Springer}
}