I'm fighting for my life over here!
This commit is contained in:
parent
44805929f8
commit
9ce717a313
4 changed files with 236 additions and 0 deletions
89
tex/misc/background_draft.bib
Normal file
89
tex/misc/background_draft.bib
Normal file
|
|
@ -0,0 +1,89 @@
|
||||||
|
@article{Jaleel_etal.RRIP.2010,
|
||||||
|
title={High performance cache replacement using re-reference interval prediction (RRIP)},
|
||||||
|
author={Jaleel, Aamer and Theobald, Kevin B and Steely Jr, Simon C and Emer, Joel},
|
||||||
|
journal={ACM SIGARCH computer architecture news},
|
||||||
|
volume={38},
|
||||||
|
number={3},
|
||||||
|
pages={60--71},
|
||||||
|
year={2010},
|
||||||
|
publisher={ACM New York, NY, USA}
|
||||||
|
}
|
||||||
|
|
||||||
|
@inproceedings{Yang_etal.FIFO-LPQD.2023,
|
||||||
|
title={FIFO can be Better than LRU: the Power of Lazy Promotion and Quick Demotion},
|
||||||
|
author={Yang, Juncheng and Qiu, Ziyue and Zhang, Yazhuo and Yue, Yao and Rashmi, KV},
|
||||||
|
booktitle={Proceedings of the 19th Workshop on Hot Topics in Operating Systems},
|
||||||
|
pages={70--79},
|
||||||
|
year={2023}
|
||||||
|
}
|
||||||
|
|
||||||
|
@inproceedings{Shan_Tsai_Zhang.DSPM.2017,
|
||||||
|
author = {Shan, Yizhou and Tsai, Shin-Yeh and Zhang, Yiying},
|
||||||
|
title = {Distributed Shared Persistent Memory},
|
||||||
|
year = {2017},
|
||||||
|
isbn = {9781450350280},
|
||||||
|
publisher = {Association for Computing Machinery},
|
||||||
|
address = {New York, NY, USA},
|
||||||
|
url = {https://doi.org/10.1145/3127479.3128610},
|
||||||
|
doi = {10.1145/3127479.3128610},
|
||||||
|
abstract = {Next-generation non-volatile memories (NVMs) will provide byte addressability, persistence, high density, and DRAM-like performance. They have the potential to benefit many datacenter applications. However, most previous research on NVMs has focused on using them in a single machine environment. It is still unclear how to best utilize them in distributed, datacenter environments.We introduce Distributed Shared Persistent Memory (DSPM), a new framework for using persistent memories in distributed data-center environments. DSPM provides a new abstraction that allows applications to both perform traditional memory load and store instructions and to name, share, and persist their data.We built Hotpot, a kernel-level DSPM system that provides low-latency, transparent memory accesses, data persistence, data reliability, and high availability. The key ideas of Hotpot are to integrate distributed memory caching and data replication techniques and to exploit application hints. We implemented Hotpot in the Linux kernel and demonstrated its benefits by building a distributed graph engine on Hotpot and porting a NoSQL database to Hotpot. Our evaluation shows that Hotpot outperforms a recent distributed shared memory system by 1.3\texttimes{} to 3.2\texttimes{} and a recent distributed PM-based file system by 1.5\texttimes{} to 3.0\texttimes{}.},
|
||||||
|
booktitle = {Proceedings of the 2017 Symposium on Cloud Computing},
|
||||||
|
pages = {323–337},
|
||||||
|
numpages = {15},
|
||||||
|
keywords = {distributed shared memory, persistent memory},
|
||||||
|
location = {Santa Clara, California},
|
||||||
|
series = {SoCC '17}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{LaRowe_Ellis.Repl_NUMA.1991,
|
||||||
|
title = {Page placement policies for NUMA multiprocessors},
|
||||||
|
journal = {Journal of Parallel and Distributed Computing},
|
||||||
|
volume = {11},
|
||||||
|
number = {2},
|
||||||
|
pages = {112-129},
|
||||||
|
year = {1991},
|
||||||
|
issn = {0743-7315},
|
||||||
|
doi = {https://doi.org/10.1016/0743-7315(91)90117-R},
|
||||||
|
url = {https://www.sciencedirect.com/science/article/pii/074373159190117R},
|
||||||
|
author = {Richard P. LaRowe and Carla Schlatter Ellis},
|
||||||
|
abstract = {In many parallel applications, the size of the program's data exceeds even the very large amount of main memory available on large-scale multiprocessors. Virtual memory, in the sense of a transparent management of the main/secondary memory hierarchy, is a natural solution. The replacement, fetch, and placement policies used in uniprocessor paging systems need to be reexamined in light of the differences in the behavior of parallel computations and in the memory architectures of multiprocessors. In particular, we investigate the impact of page placement in nonuniform memory access time (NUMA) shared memory MIMD machines. We experimentally evaluate several paging algorithms that incorporate different approaches to the placement issue. Under certain workload assumptions, our results show that placement algorithms that are strongly biased toward local frame allocation but are able to borrow remote frames can reduce the number of page faults over strictly local allocation. The increased cost of memory operations due to the extra remote accesses is more than compensated for by the savings resulting from the reduction in demand fetches, effectively reducing the computation completion time for these programs without having adverse effects on the performance of “typical” NUMA programs. We also discuss some early results obtained from an actual kernel implementation of one of our page placement algorithms.}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{Aguilar_Leiss.Coherence-Replacement.2006,
|
||||||
|
author = {J. Aguilar and E.L. Leiss},
|
||||||
|
title = {A Coherence-Replacement Protocol For Web Proxy Cache Systems},
|
||||||
|
journal = {International Journal of Computers and Applications},
|
||||||
|
volume = {28},
|
||||||
|
number = {1},
|
||||||
|
pages = {12-18},
|
||||||
|
year = {2006},
|
||||||
|
publisher = {Taylor \& Francis},
|
||||||
|
doi = {10.1080/1206212X.2006.11441783},
|
||||||
|
|
||||||
|
|
||||||
|
URL = {https://doi.org/10.1080/1206212X.2006.11441783},
|
||||||
|
eprint = {https://doi.org/10.1080/1206212X.2006.11441783}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@inproceedings{Masouros_etal.Adrias.2023,
|
||||||
|
title={Adrias: Interference-Aware Memory Orchestration for Disaggregated Cloud Infrastructures},
|
||||||
|
author={Masouros, Dimosthenis and Pinto, Christian and Gazzetti, Michele and Xydis, Sotirios and Soudris, Dimitrios},
|
||||||
|
booktitle={2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)},
|
||||||
|
pages={855--869},
|
||||||
|
year={2023},
|
||||||
|
organization={IEEE}
|
||||||
|
}
|
||||||
|
|
||||||
BIN
tex/misc/background_draft.pdf
Normal file
BIN
tex/misc/background_draft.pdf
Normal file
Binary file not shown.
142
tex/misc/background_draft.tex
Normal file
142
tex/misc/background_draft.tex
Normal file
|
|
@ -0,0 +1,142 @@
|
||||||
|
\documentclass{article}
|
||||||
|
\usepackage{biblatex}
|
||||||
|
|
||||||
|
\addbibresource{background_draft.bib}
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
% \chapter{Backgrounds}
|
||||||
|
Recent studies have shown a reinvigorated interest in disaggregated/distributed
|
||||||
|
shared memory systems since the 1990s. While large-scale cluster systems
|
||||||
|
predominantly make up the mainstream
|
||||||
|
The interplay between (page) replacement policy and runtime performance of
|
||||||
|
distributed shared memory systems has not been properly explored.
|
||||||
|
|
||||||
|
\section{Overview of Distributed Shared Memory}
|
||||||
|
|
||||||
|
A striking feature in the study of distributed shared memory (DSM) systems is the
|
||||||
|
non-uniformity of the terminologies used to describe overlapping study interests.
|
||||||
|
The majority of contributions to DSM study come from the 1990s, for example
|
||||||
|
\textbf{[Treadmark, Millipede, Munin, Shiva, etc.]}. These DSM systems attempt to
|
||||||
|
leverage kernel system calls to allow for user-level DSM over Ethernet NICs. While
|
||||||
|
these systems provide a strong theoretical basis for today's majority-software
|
||||||
|
DSM systems and applications that expose a \emph{(partitioned) global address space},
|
||||||
|
they were nevertheless constrained by the limitations in NIC transfer rate and
|
||||||
|
bandwidth, and the concept of DSM failed to take off (relative to cluster computing).
|
||||||
|
|
||||||
|
Improvement in NIC bandwidth and transfer rate allows for applications that expose
|
||||||
|
global address space, as well as RDMA technologies that leverage single-writer
|
||||||
|
protocols over hierarchical memory nodes. \textbf{[GAS and PGAS (Partitioned GAS)
|
||||||
|
technologies for example Openshmem, OpenMPI, Cray Chapel, etc. that leverage
|
||||||
|
specially-linked memory sections and \texttt{/dev/shm} to abstract away RDMA access]}.
|
||||||
|
|
||||||
|
|
||||||
|
Contemporary works on DSM systems focus more on leveraging hardware advancements
|
||||||
|
to provide fast and/or seamless software support. Adrias \cite{Masouros_etal.Adrias.2023},
|
||||||
|
for example, implements a complex system for memory disaggregation over multiple
|
||||||
|
compute nodes connected via the \textit{ThymesisFlow}-based RDMA fabric, where
|
||||||
|
they observed significant performance improvements over existing data-intensive
|
||||||
|
processing frameworks, for example Apache Spark, Memcached, and Redis, over
|
||||||
|
no-disaggregation (i.e., using node-local memory only, similar to cluster computing)
|
||||||
|
systems.
|
||||||
|
|
||||||
|
\subsection{Move Data to Process, or Move Process to Data?}
|
||||||
|
(TBD -- The former is costly for data-intensive computation, but the latter may
|
||||||
|
be impossible for certain tasks, and greatly hardens the replacement problem.)
|
||||||
|
|
||||||
|
\section{Replacement Policy}
|
||||||
|
|
||||||
|
In general, three variants of replacement strategies have been proposed for either
|
||||||
|
generic cache block replacement problems, or specific use-cases where contextual
|
||||||
|
factors can facilitate more efficient cache resource allocation:
|
||||||
|
\begin{itemize}
|
||||||
|
\item General-Purpose Replacement Algorithms, for example LRU.
|
||||||
|
\item Cost-Model Analysis
|
||||||
|
\item Probabilistic and Learned Algorithms
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\subsection{General-Purpose Replacement Algorithms}
|
||||||
|
Practically speaking, in the general case of the cache replacement problem,
|
||||||
|
we desire to predict the re-reference interval of a cache block
|
||||||
|
\cite{Jaleel_etal.RRIP.2010}. This follows from Belady's algorithm -- the
|
||||||
|
optimal case for the \emph{ideal} replacement problem occurs when, at eviction
|
||||||
|
time, the entry with the highest re-reference interval is replaced. Under this
|
||||||
|
framework, therefore, the commonly-used LRU algorithm could be seen as a heuristic
|
||||||
|
where the re-reference interval for each entry is predicted to be immediate.
|
||||||
|
Fortunately, memory access traces of real computer systems agree with this
|
||||||
|
tendency due to temporal locality \textbf{[source]}. (Real systems are complex,
|
||||||
|
however, and there are other behaviors...) On the other hand, the hypothetical
|
||||||
|
LFU algorithm is a heuristic that captures frequency. \textbf{[\dots]} While the
|
||||||
|
textbook LFU algorithm suffers from needing to maintain a priority-queue for
|
||||||
|
frequency analysis, it was nevertheless useful for keeping recurrent (though
|
||||||
|
non-recent) blocks from being evicted from the cache \textbf{[source]}.
|
||||||
|
|
||||||
|
Derivatives of the LRU algorithm attempt to balance frequency and
|
||||||
|
recency. \textbf{[Talk about LRU-K, LRU-2Q, LRU-MQ, LIRS, ARC here \dots]}
|
||||||
|
|
||||||
|
Advancements in parallel/concurrent systems have led to a rediscovery of the benefits
|
||||||
|
of using FIFO-derived replacement policies over their LRU/LFU counterparts, as
|
||||||
|
book-keeping operations on the uniform LRU/LFU state prove to be (1) difficult
|
||||||
|
for synchronization and, relatedly, (2) cache-unfriendly \cite{Yang_etal.FIFO-LPQD.2023}.
|
||||||
|
\textbf{[Talk about FIFO, FIFO-CLOCK, FIFO-CAR, FIFO-QuickDemotion, and Dueling
|
||||||
|
CLOCK here \dots]}
|
||||||
|
|
||||||
|
Finally, real-life experiences have shown the need to reduce CPU time in practical
|
||||||
|
applications, owing to one simple observation -- during the fetch-execute
|
||||||
|
cycle, all processors perform blocking I/O on the memory. A cache-unfriendly
|
||||||
|
design, despite its hypothetical optimality, could nevertheless degrade the performance
|
||||||
|
of a system during low-memory situations. In fact, this proves to be the driving
|
||||||
|
motivation behind Linux's transition away from the old LRU-2Q page replacement
|
||||||
|
algorithm to the more coarse-grained Multi-Generational LRU (MGLRU) algorithm, which has
|
||||||
|
been mainlined since v6.1.
|
||||||
|
|
||||||
|
\subsection{Cost-Model Analysis}
|
||||||
|
The ideal case for the replacement problem fails to account for invalidation of
|
||||||
|
cache entries. It also assumes a uniform, two-level cache-store model
|
||||||
|
that is insufficient to capture the heterogeneity of today's massively-parallel,
|
||||||
|
distributed systems. High-speed network interfaces are capable of exposing RDMA
|
||||||
|
interfaces between compute nodes, enabling RDMA transfers almost twice as fast
|
||||||
|
when compared to swapping over the kernel I/O stack, while software that bypasses
|
||||||
|
the kernel I/O stack is capable of stretching the bandwidth advantage even more
|
||||||
|
(source). This creates an interesting network topology between RDMA-enabled nodes,
|
||||||
|
where, in addition to swapping at low-memory situations, the node may opt to ``swap''
|
||||||
|
or simply drop the physical page in order to lessen the cost of page misses.
|
||||||
|
|
||||||
|
\textbf{[Talk about GreedyDual, GDSF, BCL, Amortization]}
|
||||||
|
|
||||||
|
Traditionally, replacement policies based on cost-model analysis were utilized in
|
||||||
|
content-delivery networks, which had different consistency models compared to
|
||||||
|
finer-grained systems. HTTP servers need not pertain to strong consistency models,
|
||||||
|
as out-of-date information is considered permissible, and single-writer scenarios
|
||||||
|
are common. Consequently, most replacement policies for static content servers,
|
||||||
|
while making a strong distinction regarding network topology, fail to account for the
|
||||||
|
cases where an entry might become invalidated, let alone multi-writer protocols.
|
||||||
|
One early paper \cite{LaRowe_Ellis.Repl_NUMA.1991} examines the efficacy of using
|
||||||
|
page fault frequency as an indicator of preference towards working set inclusion
|
||||||
|
(which I personally think is highly flawed -- to be explained). Another paper
|
||||||
|
\cite{Aguilar_Leiss.Coherence-Replacement.2006} explores the possibility of taking
|
||||||
|
page fault into consideration for eviction, but fails to go beyond the obvious
|
||||||
|
implication that pages that have been faulted \emph{must} be evicted.
|
||||||
|
|
||||||
|
The concept of cost models for RDMA and NUMA systems is relatively underdeveloped,
|
||||||
|
too. (Expand)
|
||||||
|
|
||||||
|
\subsection{Probabilistic and Learned Algorithms for Cache Replacement}
|
||||||
|
Finally, machine learning techniques and low-cost probabilistic approaches have
|
||||||
|
been applied on the ideal cache replacement problem with some level of success.
|
||||||
|
\textbf{[Talk about LeCaR, CACHEUS here]}.
|
||||||
|
|
||||||
|
\section{Cache Coherence and Consistency in DSM Systems}
|
||||||
|
|
||||||
|
(I need to read more into this. Most of the contribution comes from CPU caches,
|
||||||
|
less so for DSM systems.) \textbf{[Talk about JIAJIA and Treadmark's coherence
|
||||||
|
protocol.]}
|
||||||
|
|
||||||
|
Consistency and communication protocols naturally affect the cost for each faulted
|
||||||
|
memory access \dots
|
||||||
|
|
||||||
|
\textbf{[Talk about directory, transactional, scope, and library cache coherence,
|
||||||
|
which allow for multi-casted communications at page fault but all with different
|
||||||
|
levels of book-keeping.]}
|
||||||
|
|
||||||
|
\printbibliography
|
||||||
|
\end{document}
|
||||||
|
|
@ -145,6 +145,11 @@ package or the newer \texttt{biblatex} system.
|
||||||
These examples use a numerical citation style. You may use any consistent
|
These examples use a numerical citation style. You may use any consistent
|
||||||
reference style that you prefer, including ``(Author, Year)'' citations.
|
reference style that you prefer, including ``(Author, Year)'' citations.
|
||||||
|
|
||||||
|
\chapter{Backgrounds}
|
||||||
|
|
||||||
|
This section provides an overview of
|
||||||
|
|
||||||
|
|
||||||
\chapter{Your next chapter}
|
\chapter{Your next chapter}
|
||||||
|
|
||||||
A dissertation usually contains several chapters.
|
A dissertation usually contains several chapters.
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue