I'm fighting for my life over here!
This commit is contained in:
parent
44805929f8
commit
9ce717a313
4 changed files with 236 additions and 0 deletions
89
tex/misc/background_draft.bib
Normal file
89
tex/misc/background_draft.bib
Normal file
|
|
@ -0,0 +1,89 @@
|
||||||
|
@article{Jaleel_etal.RRIP.2010,
|
||||||
|
title={High performance cache replacement using re-reference interval prediction (RRIP)},
|
||||||
|
author={Jaleel, Aamer and Theobald, Kevin B and Steely Jr, Simon C and Emer, Joel},
|
||||||
|
journal={ACM SIGARCH computer architecture news},
|
||||||
|
volume={38},
|
||||||
|
number={3},
|
||||||
|
pages={60--71},
|
||||||
|
year={2010},
|
||||||
|
publisher={ACM New York, NY, USA}
|
||||||
|
}
|
||||||
|
|
||||||
|
@inproceedings{Yang_etal.FIFO-LPQD.2023,
|
||||||
|
title={FIFO can be Better than LRU: the Power of Lazy Promotion and Quick Demotion},
|
||||||
|
author={Yang, Juncheng and Qiu, Ziyue and Zhang, Yazhuo and Yue, Yao and Rashmi, KV},
|
||||||
|
booktitle={Proceedings of the 19th Workshop on Hot Topics in Operating Systems},
|
||||||
|
pages={70--79},
|
||||||
|
year={2023}
|
||||||
|
}
|
||||||
|
|
||||||
|
@inproceedings{Shan_Tsai_Zhang.DSPM.2017,
|
||||||
|
author = {Shan, Yizhou and Tsai, Shin-Yeh and Zhang, Yiying},
|
||||||
|
title = {Distributed Shared Persistent Memory},
|
||||||
|
year = {2017},
|
||||||
|
isbn = {9781450350280},
|
||||||
|
publisher = {Association for Computing Machinery},
|
||||||
|
address = {New York, NY, USA},
|
||||||
|
url = {https://doi.org/10.1145/3127479.3128610},
|
||||||
|
doi = {10.1145/3127479.3128610},
|
||||||
|
abstract = {Next-generation non-volatile memories (NVMs) will provide byte addressability, persistence, high density, and DRAM-like performance. They have the potential to benefit many datacenter applications. However, most previous research on NVMs has focused on using them in a single machine environment. It is still unclear how to best utilize them in distributed, datacenter environments.We introduce Distributed Shared Persistent Memory (DSPM), a new framework for using persistent memories in distributed data-center environments. DSPM provides a new abstraction that allows applications to both perform traditional memory load and store instructions and to name, share, and persist their data.We built Hotpot, a kernel-level DSPM system that provides low-latency, transparent memory accesses, data persistence, data reliability, and high availability. The key ideas of Hotpot are to integrate distributed memory caching and data replication techniques and to exploit application hints. We implemented Hotpot in the Linux kernel and demonstrated its benefits by building a distributed graph engine on Hotpot and porting a NoSQL database to Hotpot. Our evaluation shows that Hotpot outperforms a recent distributed shared memory system by 1.3\texttimes{} to 3.2\texttimes{} and a recent distributed PM-based file system by 1.5\texttimes{} to 3.0\texttimes{}.},
|
||||||
|
booktitle = {Proceedings of the 2017 Symposium on Cloud Computing},
|
||||||
|
pages = {323–337},
|
||||||
|
numpages = {15},
|
||||||
|
keywords = {distributed shared memory, persistent memory},
|
||||||
|
location = {Santa Clara, California},
|
||||||
|
series = {SoCC '17}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{LaRowe_Ellis.Repl_NUMA.1991,
|
||||||
|
title = {Page placement policies for NUMA multiprocessors},
|
||||||
|
journal = {Journal of Parallel and Distributed Computing},
|
||||||
|
volume = {11},
|
||||||
|
number = {2},
|
||||||
|
pages = {112-129},
|
||||||
|
year = {1991},
|
||||||
|
issn = {0743-7315},
|
||||||
|
doi = {https://doi.org/10.1016/0743-7315(91)90117-R},
|
||||||
|
url = {https://www.sciencedirect.com/science/article/pii/074373159190117R},
|
||||||
|
author = {Richard P. LaRowe and Carla Schlatter Ellis},
|
||||||
|
abstract = {In many parallel applications, the size of the program's data exceeds even the very large amount of main memory available on large-scale multiprocessors. Virtual memory, in the sense of a transparent management of the main/secondary memory hierarchy, is a natural solution. The replacement, fetch, and placement policies used in uniprocessor paging systems need to be reexamined in light of the differences in the behavior of parallel computations and in the memory architectures of multiprocessors. In particular, we investigate the impact of page placement in nonuniform memory access time (NUMA) shared memory MIMD machines. We experimentally evaluate several paging algorithms that incorporate different approaches to the placement issue. Under certain workload assumptions, our results show that placement algorithms that are strongly biased toward local frame allocation but are able to borrow remote frames can reduce the number of page faults over strictly local allocation. The increased cost of memory operations due to the extra remote accesses is more than compensated for by the savings resulting from the reduction in demand fetches, effectively reducing the computation completion time for these programs without having adverse effects on the performance of “typical” NUMA programs. We also discuss some early results obtained from an actual kernel implementation of one of our page placement algorithms.}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{Aguilar_Leiss.Coherence-Replacement.2006,
|
||||||
|
author = {J. Aguilar and E.L. Leiss},
|
||||||
|
title = {A Coherence-Replacement Protocol For Web Proxy Cache Systems},
|
||||||
|
journal = {International Journal of Computers and Applications},
|
||||||
|
volume = {28},
|
||||||
|
number = {1},
|
||||||
|
pages = {12-18},
|
||||||
|
year = {2006},
|
||||||
|
publisher = {Taylor \& Francis},
|
||||||
|
doi = {10.1080/1206212X.2006.11441783},
|
||||||
|
|
||||||
|
|
||||||
|
URL = {https://doi.org/10.1080/1206212X.2006.11441783},
|
||||||
|
eprint = {https://doi.org/10.1080/1206212X.2006.11441783}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@inproceedings{Masouros_etal.Adrias.2023,
|
||||||
|
title={Adrias: Interference-Aware Memory Orchestration for Disaggregated Cloud Infrastructures},
|
||||||
|
author={Masouros, Dimosthenis and Pinto, Christian and Gazzetti, Michele and Xydis, Sotirios and Soudris, Dimitrios},
|
||||||
|
booktitle={2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)},
|
||||||
|
pages={855--869},
|
||||||
|
year={2023},
|
||||||
|
organization={IEEE}
|
||||||
|
}
|
||||||
|
|
||||||
BIN
tex/misc/background_draft.pdf
Normal file
BIN
tex/misc/background_draft.pdf
Normal file
Binary file not shown.
142
tex/misc/background_draft.tex
Normal file
142
tex/misc/background_draft.tex
Normal file
|
|
@ -0,0 +1,142 @@
|
||||||
|
\documentclass{article}
|
||||||
|
\usepackage{biblatex}
|
||||||
|
|
||||||
|
\addbibresource{background_draft.bib}
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
% \chapter{Backgrounds}
|
||||||
|
Recent studies have shown a reinvigorated interest in disaggregated/distributed
|
||||||
|
shared memory systems since the 1990s. While large-scale cluster systems
|
||||||
|
predominantly make up the mainstream
|
||||||
|
The interplay between (page) replacement policy and runtime performance of
|
||||||
|
distributed shared memory systems has not been properly explored.
|
||||||
|
|
||||||
|
\section{Overview of Distributed Shared Memory}
|
||||||
|
|
||||||
|
A striking feature in the study of distributed shared memory (DSM) systems is the
|
||||||
|
non-uniformity of the terminologies used to describe overlapping study interests.
|
||||||
|
The majority of contributions to DSM study come from the 1990s, for example
|
||||||
|
\textbf{[Treadmark, Millipede, Munin, Shiva, etc.]}. These DSM systems attempt to
|
||||||
|
leverage kernel system calls to allow for user-level DSM over Ethernet NICs. While
|
||||||
|
these systems provide a strong theoretical basis for today's majority-software
|
||||||
|
DSM systems and applications that expose a \emph{(partitioned) global address space},
|
||||||
|
they were nevertheless constrained by the limitations in NIC transfer rate and
|
||||||
|
bandwidth, and the concept of DSM failed to take off (relative to cluster computing).
|
||||||
|
|
||||||
|
Improvement in NIC bandwidth and transfer rate allows for applications that expose
|
||||||
|
global address space, as well as RDMA technologies that leverage single-writer
|
||||||
|
protocols over hierarchical memory nodes. \textbf{[GAS and PGAS (Partitioned GAS)
|
||||||
|
technologies for example Openshmem, OpenMPI, Cray Chapel, etc. that leverage
|
||||||
|
specially-linked memory sections and \texttt{/dev/shm} to abstract away RDMA access]}.
|
||||||
|
|
||||||
|
|
||||||
|
Contemporary works on DSM systems focus more on leveraging hardware advancements
|
||||||
|
to provide fast and/or seamless software support. Adrias \cite{Masouros_etal.Adrias.2023},
|
||||||
|
for example, implements a complex system for memory disaggregation over multiple
|
||||||
|
compute nodes connected via the \textit{ThymesisFlow}-based RDMA fabric, where
|
||||||
|
they observed significant performance improvements over existing data-intensive
|
||||||
|
processing frameworks, for example Apache Spark, Memcached, and Redis, over
|
||||||
|
no-disaggregation (i.e., using node-local memory only, similar to cluster computing)
|
||||||
|
systems.
|
||||||
|
|
||||||
|
\subsection{Move Data to Process, or Move Process to Data?}
|
||||||
|
(TBD -- The former is costly for data-intensive computation, but the latter may
|
||||||
|
be impossible for certain tasks, and greatly hardens the replacement problem.)
|
||||||
|
|
||||||
|
\section{Replacement Policy}
|
||||||
|
|
||||||
|
In general, three variants of replacement strategies have been proposed for either
|
||||||
|
generic cache block replacement problems, or specific use-cases where contextual
|
||||||
|
factors can facilitate more efficient cache resource allocation:
|
||||||
|
\begin{itemize}
|
||||||
|
\item General-Purpose Replacement Algorithms, for example LRU.
|
||||||
|
\item Cost-Model Analysis
|
||||||
|
\item Probabilistic and Learned Algorithms
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\subsection{General-Purpose Replacement Algorithms}
|
||||||
|
Practically speaking, in the general case of the cache replacement problem,
|
||||||
|
we desire to predict the re-reference interval of a cache block
|
||||||
|
\cite{Jaleel_etal.RRIP.2010}. This follows from Belady's algorithm -- the
|
||||||
|
optimal case for the \emph{ideal} replacement problem occurs when, at eviction
|
||||||
|
time, the entry with the highest re-reference interval is replaced. Under this
|
||||||
|
framework, therefore, the commonly-used LRU algorithm could be seen as a heuristic
|
||||||
|
where the re-reference interval for each entry is predicted to be immediate.
|
||||||
|
Fortunately, memory access traces of real computer systems agree with this
|
||||||
|
tendency due to temporal locality \textbf{[source]}. (Real systems are complex,
|
||||||
|
however, and there are other behaviors...) On the other hand, the hypothetical
|
||||||
|
LFU algorithm is a heuristic that captures frequency. \textbf{[\dots]} While the
|
||||||
|
textbook LFU algorithm suffers from needing to maintain a priority-queue for
|
||||||
|
frequency analysis, it was nevertheless useful for keeping recurrent (though
|
||||||
|
non-recent) blocks from being evicted from the cache \textbf{[source]}.
|
||||||
|
|
||||||
|
Derivatives of the LRU algorithm attempt to balance frequency and
|
||||||
|
recency. \textbf{[Talk about LRU-K, LRU-2Q, LRU-MQ, LIRS, ARC here \dots]}
|
||||||
|
|
||||||
|
Advancements in parallel/concurrent systems have led to a rediscovery of the benefits
|
||||||
|
of using FIFO-derived replacement policies over their LRU/LFU counterparts, as
|
||||||
|
book-keeping operations on the uniform LRU/LFU state prove to be (1) difficult
|
||||||
|
for synchronization and, relatedly, (2) cache-unfriendly \cite{Yang_etal.FIFO-LPQD.2023}.
|
||||||
|
\textbf{[Talk about FIFO, FIFO-CLOCK, FIFO-CAR, FIFO-QuickDemotion, and Dueling
|
||||||
|
CLOCK here \dots]}
|
||||||
|
|
||||||
|
Finally, real-life experiences have shown the need to reduce CPU time in practical
|
||||||
|
applications, owing to one simple observation -- during the fetch-execute
|
||||||
|
cycle, all processors perform blocking I/O on the memory. A cache-unfriendly
|
||||||
|
design, despite its hypothetical optimality, could nevertheless degrade the performance
|
||||||
|
of a system during low-memory situations. In fact, this proves to be the driving
|
||||||
|
motivation behind Linux's transition away from the old LRU-2Q page replacement
|
||||||
|
algorithm to the more coarse-grained Multi-Generational LRU (MGLRU) algorithm, which has
|
||||||
|
been mainlined since v6.1.
|
||||||
|
|
||||||
|
\subsection{Cost-Model Analysis}
|
||||||
|
The ideal case for the replacement problem fails to account for invalidation of
|
||||||
|
cache entries. It also assumes a uniform, two-level cache-store model
|
||||||
|
that is insufficient to capture the heterogeneity of today's massively-parallel,
|
||||||
|
distributed systems. High-speed network interfaces are capable of exposing RDMA
|
||||||
|
interfaces between compute nodes, enabling RDMA transfers almost twice as fast
|
||||||
|
when compared to swapping over the kernel I/O stack, while software that bypasses
|
||||||
|
the kernel I/O stack is capable of stretching the bandwidth advantage even more
|
||||||
|
(source). This creates an interesting network topology between RDMA-enabled nodes,
|
||||||
|
where, in addition to swapping at low-memory situations, the node may opt to ``swap''
|
||||||
|
or simply drop the physical page in order to lessen the cost of page misses.
|
||||||
|
|
||||||
|
\textbf{[Talk about GreedyDual, GDSF, BCL, Amortization]}
|
||||||
|
|
||||||
|
Traditionally, replacement policies based on cost-model analysis were utilized in
|
||||||
|
content-delivery networks, which had different consistency models compared to
|
||||||
|
finer-grained systems. HTTP servers need not pertain to strong consistency models,
|
||||||
|
as out-of-date information is considered permissible, and single-writer scenarios
|
||||||
|
are common. Consequently, most replacement policies for static content servers,
|
||||||
|
while making a strong distinction regarding network topology, fail to account for the
|
||||||
|
cases where an entry might become invalidated, let alone multi-writer protocols.
|
||||||
|
One early paper \cite{LaRowe_Ellis.Repl_NUMA.1991} examines the efficacy of using
|
||||||
|
page fault frequency as an indicator of preference towards working set inclusion
|
||||||
|
(which I personally think is highly flawed -- to be explained). Another paper
|
||||||
|
\cite{Aguilar_Leiss.Coherence-Replacement.2006} explores the possibility of taking
|
||||||
|
page fault into consideration for eviction, but fails to go beyond the obvious
|
||||||
|
implication that pages that have been faulted \emph{must} be evicted.
|
||||||
|
|
||||||
|
The concept of cost models for RDMA and NUMA systems is relatively underdeveloped,
|
||||||
|
too. (Expand)
|
||||||
|
|
||||||
|
\subsection{Probabilistic and Learned Algorithms for Cache Replacement}
|
||||||
|
Finally, machine learning techniques and low-cost probabilistic approaches have
|
||||||
|
been applied on the ideal cache replacement problem with some level of success.
|
||||||
|
\textbf{[Talk about LeCaR, CACHEUS here]}.
|
||||||
|
|
||||||
|
\section{Cache Coherence and Consistency in DSM Systems}
|
||||||
|
|
||||||
|
(I need to read more into this. Most of the contribution comes from CPU caches,
|
||||||
|
less so for DSM systems.) \textbf{[Talk about JIAJIA and Treadmark's coherence
|
||||||
|
protocol.]}
|
||||||
|
|
||||||
|
Consistency and communication protocols naturally affect the cost for each faulted
|
||||||
|
memory access \dots
|
||||||
|
|
||||||
|
\textbf{[Talk about directory, transactional, scope, and library cache coherence,
|
||||||
|
which allow for multi-casted communications at page fault but all with different
|
||||||
|
levels of book-keeping.]}
|
||||||
|
|
||||||
|
\printbibliography
|
||||||
|
\end{document}
|
||||||
|
|
@ -145,6 +145,11 @@ package or the newer \texttt{biblatex} system.
|
||||||
These examples use a numerical citation style. You may use any consistent
|
These examples use a numerical citation style. You may use any consistent
|
||||||
reference style that you prefer, including ``(Author, Year)'' citations.
|
reference style that you prefer, including ``(Author, Year)'' citations.
|
||||||
|
|
||||||
|
\chapter{Backgrounds}
|
||||||
|
|
||||||
|
This section provides an overview of
|
||||||
|
|
||||||
|
|
||||||
\chapter{Your next chapter}
|
\chapter{Your next chapter}
|
||||||
|
|
||||||
A dissertation usually contains several chapters.
|
A dissertation usually contains several chapters.
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue