I THOUGHT BACKGROUND WRITING WOULD BE EASY

Zhengyi Chen 2023-10-11 22:34:21 +01:00
parent e02b750ff0
commit 44805929f8
7 changed files with 103 additions and 2 deletions

tex/main.bib (new file, 131 lines)

@@ -0,0 +1,131 @@
@article{JTSE.2010.RRIP,
author = {Jaleel, Aamer and Theobald, Kevin B. and Steely, Simon C. and Emer, Joel},
title = {High Performance Cache Replacement Using Re-Reference Interval Prediction (RRIP)},
year = {2010},
issue_date = {June 2010},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {38},
number = {3},
issn = {0163-5964},
url = {https://doi.org/10.1145/1816038.1815971},
doi = {10.1145/1816038.1815971},
abstract = {Practical cache replacement policies attempt to emulate optimal replacement by predicting the re-reference interval of a cache block. The commonly used LRU replacement policy always predicts a near-immediate re-reference interval on cache hits and misses. Applications that exhibit a distant re-reference interval perform badly under LRU. Such applications usually have a working set larger than the cache or have frequent bursts of references to non-temporal data (called scans). To improve the performance of such workloads, this paper proposes cache replacement using Re-reference Interval Prediction (RRIP). We propose Static RRIP (SRRIP) that is scan-resistant and Dynamic RRIP (DRRIP) that is both scan-resistant and thrash-resistant. Both RRIP policies require only 2 bits per cache block and easily integrate into existing LRU approximations found in modern processors. Our evaluations using PC games, multimedia, server and SPEC CPU2006 workloads on a single-core processor with a 2MB last-level cache (LLC) show that both SRRIP and DRRIP outperform LRU replacement on the throughput metric by an average of 4\% and 10\% respectively. Our evaluations with over 1000 multi-programmed workloads on a 4-core CMP with an 8MB shared LLC show that SRRIP and DRRIP outperform LRU replacement on the throughput metric by an average of 7\% and 9\% respectively. We also show that RRIP outperforms LFU, the state-of-the-art scan-resistant replacement algorithm to date. For the cache configurations under study, RRIP requires 2X less hardware than LRU and 2.5X less hardware than LFU.},
journal = {SIGARCH Comput. Archit. News},
month = jun,
pages = {60--71},
numpages = {12},
keywords = {thrashing, shared cache, replacement, scan resistance}
}
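
The mechanism the abstract above describes is compact enough to sketch: each block carries a 2-bit re-reference prediction value (RRPV), hits predict near-immediate re-reference (RRPV 0), insertions predict a long interval, and eviction picks a block predicted for the distant future, ageing everyone if none exists. A minimal single-set SRRIP simulation in Python follows; the class and constant names are ours for illustration, not from the paper or its artifacts.

RRPV_MAX = 3          # 2 bits -> RRPVs 0..3
INSERT_RRPV = 2       # predict a "long" re-reference interval on insertion

class SRRIPSet:
    def __init__(self, ways):
        self.ways = ways
        self.blocks = {}                  # tag -> RRPV

    def access(self, tag):
        if tag in self.blocks:
            self.blocks[tag] = 0          # hit: predict near-immediate re-reference
            return True
        if len(self.blocks) >= self.ways:
            self._evict()
        self.blocks[tag] = INSERT_RRPV    # miss: insert with a long prediction
        return False

    def _evict(self):
        # Evict a block predicted for the distant future (RRPV == max);
        # if none exists, age every block and retry.
        while True:
            for tag, rrpv in self.blocks.items():
                if rrpv == RRPV_MAX:
                    del self.blocks[tag]
                    return
            for tag in self.blocks:
                self.blocks[tag] += 1

The scan resistance falls out of INSERT_RRPV: a burst of never-reused tags enters near the eviction threshold and is drained before it can displace blocks that have actually hit.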
@inproceedings{SYS.2021.RLR,
author={Sethumurugan, Subhash and Yin, Jieming and Sartori, John},
booktitle={2021 IEEE International Symposium on High-Performance Computer Architecture (HPCA)},
title={Designing a Cost-Effective Cache Replacement Policy using Machine Learning},
year={2021},
pages={291-303},
doi={10.1109/HPCA51647.2021.00033}
}
@article{MM.2004.ARC,
author={Megiddo, N. and Modha, D.S.},
journal={Computer},
title={Outperforming LRU with an adaptive replacement cache algorithm},
year={2004},
volume={37},
number={4},
pages={58-65},
doi={10.1109/MC.2004.1297303}
}
@article{QJPSE.2007.DIP,
author = {Qureshi, Moinuddin K. and Jaleel, Aamer and Patt, Yale N. and Steely, Simon C. and Emer, Joel},
title = {Adaptive Insertion Policies for High Performance Caching},
year = {2007},
issue_date = {May 2007},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {35},
number = {2},
issn = {0163-5964},
url = {https://doi.org/10.1145/1273440.1250709},
doi = {10.1145/1273440.1250709},
abstract = {The commonly used LRU replacement policy is susceptible to thrashing for memory-intensive workloads that have a working set greater than the available cache size. For such applications, the majority of lines traverse from the MRU position to the LRU position without receiving any cache hits, resulting in inefficient use of cache space. Cache performance can be improved if some fraction of the working set is retained in the cache so that at least that fraction of the working set can contribute to cache hits. We show that simple changes to the insertion policy can significantly reduce cache misses for memory-intensive workloads. We propose the LRU Insertion Policy (LIP), which places the incoming line in the LRU position instead of the MRU position. LIP protects the cache from thrashing and results in a close-to-optimal hit rate for applications that have a cyclic reference pattern. We also propose the Bimodal Insertion Policy (BIP) as an enhancement of LIP that adapts to changes in the working set while maintaining the thrashing protection of LIP. We finally propose a Dynamic Insertion Policy (DIP) to choose between BIP and the traditional LRU policy depending on which policy incurs fewer misses. The proposed insertion policies do not require any change to the existing cache structure, are trivial to implement, and have a storage requirement of less than two bytes. We show that DIP reduces the average MPKI of the baseline 1MB 16-way L2 cache by 21\%, bridging two-thirds of the gap between LRU and OPT.},
journal = {SIGARCH Comput. Archit. News},
month = jun,
pages = {381--391},
numpages = {11},
keywords = {thrashing, set sampling, set dueling, replacement}
}
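
The insertion policies this abstract names reduce to a few lines on top of an ordinary LRU list: LIP inserts new lines at the LRU position, and BIP inserts at the MRU position only with a small probability. A rough Python sketch follows; the class name and the epsilon value are ours for illustration (the paper's bimodal throttle parameter is its own).

import random
from collections import OrderedDict

class BIPCache:
    # Bimodal insertion on top of a plain LRU list; epsilon is illustrative.
    def __init__(self, ways, epsilon=1/32):
        self.ways = ways
        self.epsilon = epsilon
        self.lru = OrderedDict()          # front = LRU end, back = MRU end

    def access(self, tag):
        if tag in self.lru:
            self.lru.move_to_end(tag)     # hit: promote to MRU as usual
            return True
        if len(self.lru) >= self.ways:
            self.lru.popitem(last=False)  # evict from the LRU position
        self.lru[tag] = None              # new line arrives at the MRU end...
        if random.random() >= self.epsilon:
            # ...but BIP usually demotes it straight to the LRU position;
            # with epsilon == 0 this degenerates to LIP
            self.lru.move_to_end(tag, last=False)
        return False

DIP then set-duels the traditional LRU policy against BIP using a single saturating counter updated by misses in a few dedicated sample sets, and applies the winner to the remaining sets; that machinery is omitted here.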
@inproceedings{GWHSZ.2014.CacheReplAsMDP-QLearning,
author={Gu, Jingxiong and Wang, Wei and Huang, Aiping and Shan, Hangguan and Zhang, Zhaoyang},
booktitle={2014 IEEE International Conference on Communications (ICC)},
title={Distributed cache replacement for caching-enable base stations in cellular networks},
year={2014},
pages={2648-2653},
doi={10.1109/ICC.2014.6883723}
}
@inproceedings{EHOFK.2020.IBM-LRUvsFIFO,
author = {Ohad Eytan and Danny Harnik and Effi Ofer and Roy Friedman and Ronen Kat},
title = {It{\textquoteright}s Time to Revisit {LRU} vs. {FIFO}},
booktitle = {12th USENIX Workshop on Hot Topics in Storage and File Systems (HotStorage 20)},
year = {2020},
url = {https://www.usenix.org/conference/hotstorage20/presentation/eytan},
publisher = {USENIX Association},
month = jul
}
@inproceedings{YQZYR.2023.FIFOwithTwist,
author = {Yang, Juncheng and Qiu, Ziyue and Zhang, Yazhuo and Yue, Yao and Rashmi, K. V.},
title = {FIFO Can Be Better than LRU: The Power of Lazy Promotion and Quick Demotion},
year = {2023},
isbn = {9798400701955},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3593856.3595887},
doi = {10.1145/3593856.3595887},
abstract = {LRU has been the basis of cache eviction algorithms for decades, with a plethora of innovations on improving LRU's miss ratio and throughput. While it is well-known that FIFO-based eviction algorithms provide significantly better throughput and scalability, they lag behind LRU on miss ratio and thus cache efficiency. We performed a large-scale simulation study using 5307 block and web cache workloads collected in the past two decades. We find that, contrary to what common wisdom suggests, some FIFO-based algorithms, such as FIFO-Reinsertion (or CLOCK), are in fact more efficient (have a lower miss ratio) than LRU. Moreover, we find that quick demotion --- evicting most new objects very quickly --- is critical for cache efficiency. We show that when enhanced by quick demotion, not only can state-of-the-art algorithms be more efficient, but a simple FIFO-based algorithm can also outperform five complex state-of-the-art algorithms in terms of miss ratio.},
booktitle = {Proceedings of the 19th Workshop on Hot Topics in Operating Systems},
pages = {70--79},
numpages = {10},
location = {Providence, RI, USA},
series = {HOTOS '23}
}
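
The lazy-promotion idea in this abstract is easiest to see in FIFO-Reinsertion (CLOCK): a hit only sets a bit, and all reordering is deferred to eviction time, which is what makes the policy cheap and scalable compared with moving a list node on every LRU hit. A small Python sketch under our own naming, not the paper's released code:

from collections import deque

class FIFOReinsertion:
    def __init__(self, capacity):
        self.capacity = capacity
        self.fifo = deque()               # left = oldest
        self.visited = {}                 # tag -> hit bit

    def access(self, tag):
        if tag in self.visited:
            self.visited[tag] = True      # lazy promotion: just set a bit
            return True
        while len(self.fifo) >= self.capacity:
            old = self.fifo.popleft()
            if self.visited[old]:         # was hit: one more trip, bit cleared
                self.visited[old] = False
                self.fifo.append(old)
            else:
                del self.visited[old]     # never re-referenced: evict
        self.fifo.append(tag)
        self.visited[tag] = False
        return False

Quick demotion, the abstract's other ingredient, would sit in front of this queue as a small probationary FIFO that discards most objects before they ever enter the main queue; it is left out of the sketch.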
@inproceedings{CDKP.1994.TreadMarks,
title = {TreadMarks: Distributed Shared Memory on Standard Workstations and Operating Systems},
author = {Cox, A.L. and Dwarkadas, S. and Keleher, P. and Zwaenepoel, Willy},
year = {1994},
booktitle = {Proceedings of the Winter 1994 USENIX Conference},
abstract = {TreadMarks is a distributed shared memory (DSM) system for standard Unix systems such as SunOS and Ultrix. This paper presents a performance evaluation of TreadMarks running on Ultrix using DECstation-5000/240's that are connected by a 100-Mbps switch-based ATM LAN and a 10-Mbps Ethernet. Our objective is to determine the efficiency of a user-level DSM implementation on commercially available workstations and operating systems. We achieved good speedups on the 8-processor ATM network for Jacobi (7.4), TSP (7.2), Quicksort (6.3), and ILINK (5.7). For a slightly modified version of Water from the SPLASH benchmark suite, we achieved only moderate speedups (4.0) due to the high communication and synchronization rate. Speedups decline on the 10-Mbps Ethernet (5.5 for Jacobi, 6.5 for TSP, 4.2 for Quicksort, 5.1 for ILINK, and 2.1 for Water), reflecting the bandwidth limitations of the Ethernet. These results support the contention that, with suitable networking technology, DSM is a viable technique for parallel computation on clusters of workstations. To achieve these speedups, TreadMarks goes to great lengths to reduce the amount of communication performed to maintain memory consistency. It uses a lazy implementation of release consistency, and it allows multiple concurrent writers to modify a page, reducing the impact of false sharing. Great care was taken to minimize communication overhead. In particular, on the ATM network, we used a standard low-level protocol, AAL3/4, bypassing the TCP/IP protocol stack. Unix communication overhead, however, remains the main obstacle in the way of better performance for programs like Water. Compared to the Unix communication overhead, memory management cost (both kernel and user level) is small and wire time is negligible.},
url = {http://infoscience.epfl.ch/record/55805},
}
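
The multiple-writer scheme this abstract mentions can be illustrated with a twin-and-diff toy: before the first write, a pristine copy (twin) of the page is saved; at synchronization only the changed locations are exchanged, so concurrent writers to one page merge rather than conflict. The Python below is our own illustration under hypothetical function names, not TreadMarks code.

def make_twin(page: bytes) -> bytes:
    return bytes(page)                    # pristine copy taken at first write

def make_diff(twin: bytes, page: bytes):
    # (offset, new byte) pairs for every modified location
    return [(i, page[i]) for i in range(len(page)) if page[i] != twin[i]]

def apply_diff(page: bytearray, diff):
    for i, b in diff:
        page[i] = b

# Two writers touch disjoint parts of one page; merging the diffs avoids
# shipping (or invalidating) whole pages when sharing is false.
twin = make_twin(bytes(8))
p1, p2 = bytearray(twin), bytearray(twin)
p1[0], p2[5] = 7, 9
merged = bytearray(twin)
apply_diff(merged, make_diff(twin, bytes(p1)))
apply_diff(merged, make_diff(twin, bytes(p2)))
assert bytes(merged) == b"\x07\x00\x00\x00\x00\x09\x00\x00"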
@article{ISS.1998.Millipede,
title = {Thread migration and its applications in distributed shared memory systems},
note = {Technion CS/LPCR Technical Report \#9603, July 1996},
journal = {Journal of Systems and Software},
volume = {42},
number = {1},
pages = {71-87},
year = {1998},
issn = {0164-1212},
doi = {10.1016/S0164-1212(98)00008-9},
url = {https://www.sciencedirect.com/science/article/pii/S0164121298000089},
author = {Ayal Itzkovitz and Assaf Schuster and Lea Shalev},
keywords = {Thread migration, Distributed shared memory, Load sharing, Virtual parallel machine},
abstract = {In this paper we describe the way thread migration can be carried out in distributed shared memory (DSM) systems. We discuss the advantages of multi-threading in DSM systems and the importance of preemptive dynamic thread migration. The proposed solution is implemented in MILLIPEDE: an environment for parallel programming over a network of (personal) computers. MILLIPEDE implements a transparent computation-migration mechanism: a mobile computation thread in a MILLIPEDE application can be suspended at almost any point during its lifetime and resumed on another host. This mechanism can be used to better utilize system resources and improve performance by balancing the load and resolving ping-pong situations of memory objects, and to preserve users' ownership of their workstations. We describe how some of these mechanisms are implemented in the MILLIPEDE system. MILLIPEDE, including its thread migration module, is fully implemented in user mode (currently on Windows NT) using the standard operating system APIs.}
}
@inproceedings{de2000effect,
title={The effect of contention on the scalability of page-based software shared memory systems},
author={de Lara, Eyal and Lu, Honghui and Hu, Y. Charlie and Cox, Alan L. and Zwaenepoel, Willy},
booktitle={Languages, Compilers, and Run-Time Systems for Scalable Computers: 5th International Workshop, LCR 2000 Rochester, NY, USA, May 25--27, 2000 Selected Papers 5},
pages={155--169},
year={2000},
organization={Springer}
}