diff --git a/tex/misc/background_draft.bib b/tex/misc/background_draft.bib index e1869e9..e02225a 100644 --- a/tex/misc/background_draft.bib +++ b/tex/misc/background_draft.bib @@ -1,89 +1,278 @@ @article{Jaleel_etal.RRIP.2010, - title={High performance cache replacement using re-reference interval prediction (RRIP)}, - author={Jaleel, Aamer and Theobald, Kevin B and Steely Jr, Simon C and Emer, Joel}, - journal={ACM SIGARCH computer architecture news}, - volume={38}, - number={3}, - pages={60--71}, - year={2010}, + title = {High performance cache replacement using re-reference interval prediction (RRIP)}, + author = {Jaleel, Aamer and Theobald, Kevin B and Steely Jr, Simon C and Emer, Joel}, + year = 2010, + journal = {ACM SIGARCH computer architecture news}, + publisher = {ACM New York, NY, USA}, + volume = 38, + number = 3, + pages = {60--71} +} +@inproceedings{Yang_etal.FIFO-LPQD.2023, + title = {FIFO can be Better than LRU: the Power of Lazy Promotion and Quick Demotion}, + author = {Yang, Juncheng and Qiu, Ziyue and Zhang, Yazhuo and Yue, Yao and Rashmi, KV}, + year = 2023, + booktitle = {Proceedings of the 19th Workshop on Hot Topics in Operating Systems}, + pages = {70--79} +} +@inproceedings{Shan_Tsai_Zhang.DSPM.2017, + title = {Distributed Shared Persistent Memory}, + author = {Shan, Yizhou and Tsai, Shin-Yeh and Zhang, Yiying}, + year = 2017, + booktitle = {Proceedings of the 2017 Symposium on Cloud Computing}, + location = {Santa Clara, California}, + publisher = {Association for Computing Machinery}, + address = {New York, NY, USA}, + series = {SoCC '17}, + pages = {323–337}, + doi = {10.1145/3127479.3128610}, + isbn = 9781450350280, + url = {https://doi.org/10.1145/3127479.3128610}, + abstract = {Next-generation non-volatile memories (NVMs) will provide byte addressability, persistence, high density, and DRAM-like performance. They have the potential to benefit many datacenter applications. However, most previous research on NVMs has focused on using them in a single machine environment. It is still unclear how to best utilize them in distributed, datacenter environments.We introduce Distributed Shared Persistent Memory (DSPM), a new framework for using persistent memories in distributed data-center environments. DSPM provides a new abstraction that allows applications to both perform traditional memory load and store instructions and to name, share, and persist their data.We built Hotpot, a kernel-level DSPM system that provides low-latency, transparent memory accesses, data persistence, data reliability, and high availability. The key ideas of Hotpot are to integrate distributed memory caching and data replication techniques and to exploit application hints. We implemented Hotpot in the Linux kernel and demonstrated its benefits by building a distributed graph engine on Hotpot and porting a NoSQL database to Hotpot. Our evaluation shows that Hotpot outperforms a recent distributed shared memory system by 1.3\texttimes{} to 3.2\texttimes{} and a recent distributed PM-based file system by 1.5\texttimes{} to 3.0\texttimes{}.}, + numpages = 15, + keywords = {distributed shared memory, persistent memory} +} +@article{LaRowe_Ellis.Repl_NUMA.1991, + title = {Page placement policies for NUMA multiprocessors}, + author = {Richard P. 
LaRowe and Carla Schlatter Ellis}, + year = 1991, + journal = {Journal of Parallel and Distributed Computing}, + volume = 11, + number = 2, + pages = {112--129}, + doi = {https://doi.org/10.1016/0743-7315(91)90117-R}, + issn = {0743-7315}, + url = {https://www.sciencedirect.com/science/article/pii/074373159190117R}, + abstract = {In many parallel applications, the size of the program's data exceeds even the very large amount of main memory available on large-scale multiprocessors. Virtual memory, in the sense of a transparent management of the main/secondary memory hierarchy, is a natural solution. The replacement, fetch, and placement policies used in uniprocessor paging systems need to be reexamined in light of the differences in the behavior of parallel computations and in the memory architectures of multiprocessors. In particular, we investigate the impact of page placement in nonuniform memory access time (NUMA) shared memory MIMD machines. We experimentally evaluate several paging algorithms that incorporate different approaches to the placement issue. Under certain workload assumptions, our results show that placement algorithms that are strongly biased toward local frame allocation but are able to borrow remote frames can reduce the number of page faults over strictly local allocation. The increased cost of memory operations due to the extra remote accesses is more than compensated for by the savings resulting from the reduction in demand fetches, effectively reducing the computation completion time for these programs without having adverse effects on the performance of “typical” NUMA programs. We also discuss some early results obtained from an actual kernel implementation of one of our page placement algorithms.} +} +@article{Aguilar_Leiss.Coherence-Replacement.2006, + title = {A Coherence-Replacement Protocol For Web Proxy Cache Systems}, + author = {J. Aguilar and E.L. 
Leiss}, + year = 2006, + journal = {International Journal of Computers and Applications}, + publisher = {Taylor & Francis}, + volume = 28, + number = 1, + pages = {12--18}, + doi = {10.1080/1206212X.2006.11441783}, + url = {https://doi.org/10.1080/1206212X.2006.11441783}, + eprint = {https://doi.org/10.1080/1206212X.2006.11441783} +} +@inproceedings{Masouros_etal.Adrias.2023, + title = {Adrias: Interference-Aware Memory Orchestration for Disaggregated Cloud Infrastructures}, + author = {Masouros, Dimosthenis and Pinto, Christian and Gazzetti, Michele and Xydis, Sotirios and Soudris, Dimitrios}, + year = 2023, + booktitle = {2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)}, + pages = {855--869}, + organization = {IEEE} +} +@book{BOOK.Hennessy_Patterson.CArch.2011, + title = {Computer architecture: a quantitative approach}, + author = {Hennessy, John L and Patterson, David A}, + year = 2011, + publisher = {Elsevier} +} +@inproceedings{Cabezas_etal.GPU-SM.2015, + title = {GPU-SM: shared memory multi-GPU programming}, + author = {Cabezas, Javier and Jord{\`a}, Marc and Gelado, Isaac and Navarro, Nacho and Hwu, Wen-mei}, + year = 2015, + booktitle = {Proceedings of the 8th Workshop on General Purpose Processing using GPUs}, + pages = {13--24} +} +@misc{WEB.NVIDIA.Harris.Unified_Memory_CUDA.2017, + title = {Unified memory for cuda beginners}, + author = {Harris, Mark}, + year = 2017, + journal = {Unified Memory for CUDA Beginners}, + publisher = {NVIDIA}, + url = {https://developer.nvidia.com/blog/unified-memory-cuda-beginners/} +} +@article{Khokhar_etal.HetComputingVision.1993, + title = {Heterogeneous computing: Challenges and opportunities}, + author = {Khokhar, Ashfaq A. and Prasanna, Viktor K. and Shaaban, Muhammad E. 
and Wang, C-L}, + year = 1993, + journal = {Computer}, + publisher = {IEEE}, + volume = 26, + number = 6, + pages = {18--27} +} +@misc{WEB.LWN.Corbet.HMM_GPL_woes.2018, + title = {Heterogeneous memory management meets EXPORT\_SYMBOL\_GPL()}, + author = {Corbet, Jonathan}, + year = 2018, + journal = {LWN.net}, + publisher = {LWN.net}, + url = {https://lwn.net/Articles/757124/} +} +@misc{WEB.Phoronix..HMM_Search_Results.2023, + journal = {Heterogeneous Memory Management - Phoronix}, + publisher = {Phoronix}, + url = {https://www.phoronix.com/search/Heterogeneous%20Memory%20Management} +} +@inproceedings{narayanan2020heterogeneity, + title = {$\{$Heterogeneity-Aware$\}$ cluster scheduling policies for deep learning workloads}, + author = {Narayanan, Deepak and Santhanam, Keshav and Kazhamiaka, Fiodar and Phanishayee, Amar and Zaharia, Matei}, + year = 2020, + booktitle = {14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)}, + pages = {481--498} +} +@article{Rodriguez_etal.HPC_Cluster_Migration.2019, + title = {Job migration in hpc clusters by means of checkpoint/restart}, + author = {Rodr{\'\i}guez-Pascual, Manuel and Cao, Jiajun and Mor{\'\i}{\~n}igo, Jos{\'e} A and Cooperman, Gene and Mayo-Garc{\'\i}a, Rafael}, + year = 2019, + journal = {The Journal of Supercomputing}, + publisher = {Springer}, + volume = 75, + pages = {6517--6541} +} +@inproceedings{Oh_Kim.Container_Migration.2018, + title = {Stateful Container Migration employing Checkpoint-based Restoration for Orchestrated Container Clusters}, + author = {Oh, SeungYong and Kim, JongWon}, + year = 2018, + booktitle = {2018 International Conference on Information and Communication Technology Convergence (ICTC)}, + volume = {}, + number = {}, + pages = {25--30}, + doi = {10.1109/ICTC.2018.8539562} +} +@article{Amza_etal.Treadmarks.1996, + title={Treadmarks: Shared memory computing on networks of workstations}, + author={Amza, Cristiana and Cox, Alan L and Dwarkadas, Sandhya and Keleher, Pete and Lu, Honghui and Rajamony, Ramakrishnan and Yu, Weimin and Zwaenepoel, Willy}, + journal={Computer}, + volume={29}, + number={2}, + pages={18--28}, + year={1996}, + publisher={IEEE} +} +@article{Carter_Bennett_Zwaenepoel.Munin.1991, + title={Implementation and performance of Munin}, + author={Carter, John B and Bennett, John K and Zwaenepoel, Willy}, + journal={ACM SIGOPS Operating Systems Review}, + volume={25}, + number={5}, + pages={152--164}, + year={1991}, publisher={ACM New York, NY, USA} } - -@inproceedings{Yang_etal.FIFO-LPQD.2023, - title={FIFO can be Better than LRU: the Power of Lazy Promotion and Quick Demotion}, - author={Yang, Juncheng and Qiu, Ziyue and Zhang, Yazhuo and Yue, Yao and Rashmi, KV}, - booktitle={Proceedings of the 19th Workshop on Hot Topics in Operating Systems}, - pages={70--79}, - year={2023} +@article{Itzkovitz_Schuster_Shalev.Millipede.1998, + title={Thread migration and its applications in distributed shared memory systems}, + author={Itzkovitz, Ayal and Schuster, Assaf and Shalev, Lea}, + journal={Journal of Systems and Software}, + volume={42}, + number={1}, + pages={71--87}, + year={1998}, + publisher={Elsevier} } - -@inproceedings{Shan_Tsai_Zhang.DSPM.2017, -author = {Shan, Yizhou and Tsai, Shin-Yeh and Zhang, Yiying}, -title = {Distributed Shared Persistent Memory}, -year = {2017}, -isbn = {9781450350280}, -publisher = {Association for Computing Machinery}, -address = {New York, NY, USA}, -url = {https://doi.org/10.1145/3127479.3128610}, -doi = {10.1145/3127479.3128610}, -abstract 
= {Next-generation non-volatile memories (NVMs) will provide byte addressability, persistence, high density, and DRAM-like performance. They have the potential to benefit many datacenter applications. However, most previous research on NVMs has focused on using them in a single machine environment. It is still unclear how to best utilize them in distributed, datacenter environments.We introduce Distributed Shared Persistent Memory (DSPM), a new framework for using persistent memories in distributed data-center environments. DSPM provides a new abstraction that allows applications to both perform traditional memory load and store instructions and to name, share, and persist their data.We built Hotpot, a kernel-level DSPM system that provides low-latency, transparent memory accesses, data persistence, data reliability, and high availability. The key ideas of Hotpot are to integrate distributed memory caching and data replication techniques and to exploit application hints. We implemented Hotpot in the Linux kernel and demonstrated its benefits by building a distributed graph engine on Hotpot and porting a NoSQL database to Hotpot. Our evaluation shows that Hotpot outperforms a recent distributed shared memory system by 1.3\texttimes{} to 3.2\texttimes{} and a recent distributed PM-based file system by 1.5\texttimes{} to 3.0\texttimes{}.}, -booktitle = {Proceedings of the 2017 Symposium on Cloud Computing}, -pages = {323–337}, -numpages = {15}, -keywords = {distributed shared memory, persistent memory}, -location = {Santa Clara, California}, -series = {SoCC '17} +@inproceedings{Hu_Shi_Tang.JIAJIA.1999, + title={JIAJIA: A software DSM system based on a new cache coherence protocol}, + author={Hu, Weiwu and Shi, Weisong and Tang, Zhimin}, + booktitle={High-Performance Computing and Networking: 7th International Conference, HPCN Europe 1999 Amsterdam, The Netherlands, April 12--14, 1999 Proceedings 7}, + pages={461--472}, + year={1999}, + organization={Springer} } - -@article{LaRowe_Ellis.Repl_NUMA.1991, -title = {Page placement policies for NUMA multiprocessors}, -journal = {Journal of Parallel and Distributed Computing}, -volume = {11}, -number = {2}, -pages = {112-129}, -year = {1991}, -issn = {0743-7315}, -doi = {https://doi.org/10.1016/0743-7315(91)90117-R}, -url = {https://www.sciencedirect.com/science/article/pii/074373159190117R}, -author = {Richard P. LaRowe and Carla Schlatter Ellis}, -abstract = {In many parallel applications, the size of the program's data exceeds even the very large amount of main memory available on large-scale multiprocessors. Virtual memory, in the sense of a transparent management of the main/secondary memory hierarchy, is a natural solution. The replacement, fetch, and placement policies used in uniprocessor paging systems need to be reexamined in light of the differences in the behavior of parallel computations and in the memory architectures of multiprocessors. In particular, we investigate the impact of page placement in nonuniform memory access time (NUMA) shared memory MIMD machines. We experimentally evaluate several paging algorithms that incorporate different approaches to the placement issue. Under certain workload assumptions, our results show that placement algorithms that are strongly biased toward local frame allocation but are able to borrow remote frames can reduce the number of page faults over strictly local allocation. 
The increased cost of memory operations due to the extra remote accesses is more than compensated for by the savings resulting from the reduction in demand fetches, effectively reducing the computation completion time for these programs without having adverse effects on the performance of “typical” NUMA programs. We also discuss some early results obtained from an actual kernel implementation of one of our page placement algorithms.} +@inproceedings {Zaharia_etal.RDD.2012, + author = {Matei Zaharia and Mosharaf Chowdhury and Tathagata Das and Ankur Dave and Justin Ma and Murphy McCauly and Michael J. Franklin and Scott Shenker and Ion Stoica}, + title = {Resilient Distributed Datasets: A {Fault-Tolerant} Abstraction for {In-Memory} Cluster Computing}, + booktitle = {9th USENIX Symposium on Networked Systems Design and Implementation (NSDI 12)}, + year = {2012}, + isbn = {978-931971-92-8}, + address = {San Jose, CA}, + pages = {15--28}, + url = {https://www.usenix.org/conference/nsdi12/technical-sessions/presentation/zaharia}, + publisher = {USENIX Association}, + month = apr } - -@article{Aguilar_Leiss.Coherence-Replacement.2006, -author = {J. Aguilar and E.L. Leiss}, -title = {A Coherence-Replacement Protocol For Web Proxy Cache Systems}, -journal = {International Journal of Computers and Applications}, -volume = {28}, -number = {1}, -pages = {12-18}, -year = {2006}, -publisher = {Taylor & Francis}, -doi = {10.1080/1206212X.2006.11441783}, - - -URL = { - - https://doi.org/10.1080/1206212X.2006.11441783 - - - -}, -eprint = { - - https://doi.org/10.1080/1206212X.2006.11441783 - - - +@misc{WEB.APACHE..Apache_Hadoop.2023, + url={https://hadoop.apache.org/}, + journal={Apache Hadoop}, + publisher={The APACHE Software Foundation} } - +@misc{WEB.APACHE..Apache_Spark.2023, + url={https://spark.apache.org/}, + journal={Apache SparkTM - Unified Engine for large-scale data analytics}, + publisher={The APACHE Software Foundation} } - -@inproceedings{Masouros_etal.Adrias.2023, - title={Adrias: Interference-Aware Memory Orchestration for Disaggregated Cloud Infrastructures}, - author={Masouros, Dimosthenis and Pinto, Christian and Gazzetti, Michele and Xydis, Sotirios and Soudris, Dimitrios}, - booktitle={2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)}, - pages={855--869}, - year={2023}, +@article{Lenoski_etal.Stanford_DASH.1992, + title={The stanford dash multiprocessor}, + author={Lenoski, Daniel and Laudon, James and Gharachorloo, Kourosh and Weber, W-D and Gupta, Anoop and Hennessy, John and Horowitz, Mark and Lam, Monica S.}, + journal={Computer}, + volume={25}, + number={3}, + pages={63--79}, + year={1992}, + publisher={IEEE} +} +@misc{WEB.Ampere..Ampere_Altra_Datasheet.2023, + url={https://uawartifacts.blob.core.windows.net/upload-files/Altra_Max_Rev_A1_DS_v1_15_20230809_b7cdce449e_424d129849.pdf}, + journal={Ampere Altra Max Rev A1 64-Bit Multi-Core Processor Datasheet}, + publisher={Ampere Computing} +} +@article{Bell_Gray.HPC_is_Cluster.2002, + title={What's next in high-performance computing?}, + author={Bell, Gordon and Gray, Jim}, + journal={Communications of the ACM}, + volume={45}, + number={2}, + pages={91--95}, + year={2002}, + publisher={ACM New York, NY, USA} +} +@inproceedings{Werstein_Pethick_Huang.PerfAnalysis_DSM_MPI.2003, + title={A performance comparison of dsm, pvm, and mpi}, + author={Werstein, Paul and Pethick, Mark and Huang, Zhiyi}, + booktitle={Proceedings of the Fourth International Conference on Parallel and Distributed Computing, 
Applications and Technologies}, + pages={476--482}, + year={2003}, organization={IEEE} } +@inproceedings{Lu_etal.MPI_vs_DSM_over_cluster.1995, + title={Message passing versus distributed shared memory on networks of workstations}, + author={Lu, Honghui and Dwarkadas, Sandhya and Cox, Alan L and Zwaenepoel, Willy}, + booktitle={Supercomputing'95: Proceedings of the 1995 ACM/IEEE Conference on Supercomputing}, + pages={37--37}, + year={1995}, + organization={IEEE} +} +@article{Jia_etal.Tensorflow_over_RDMA.2018, + title={Improving the performance of distributed tensorflow with RDMA}, + author={Jia, Chengfan and Liu, Junnan and Jin, Xu and Lin, Han and An, Hong and Han, Wenting and Wu, Zheng and Chi, Mengxian}, + journal={International Journal of Parallel Programming}, + volume={46}, + pages={674--685}, + year={2018}, + publisher={Springer} +} +@inproceedings{Lu_etal.Spark_over_RDMA.2014, + title={Accelerating spark with RDMA for big data processing: Early experiences}, + author={Lu, Xiaoyi and Rahman, Md Wasi Ur and Islam, Nusrat and Shankar, Dipti and Panda, Dhabaleswar K}, + booktitle={2014 IEEE 22nd Annual Symposium on High-Performance Interconnects}, + pages={9--16}, + year={2014}, + organization={IEEE} +} +@article{Cai_etal.Distributed_Memory_RDMA_Cached.2018, + title={Efficient distributed memory management with RDMA and caching}, + author={Cai, Qingchao and Guo, Wentian and Zhang, Hao and Agrawal, Divyakant and Chen, Gang and Ooi, Beng Chin and Tan, Kian-Lee and Teo, Yong Meng and Wang, Sheng}, + journal={Proceedings of the VLDB Endowment}, + volume={11}, + number={11}, + pages={1604--1617}, + year={2018}, + publisher={VLDB Endowment} +} +@inproceedings{Nelson_etal.Grappa_DSM.2015, + title={$\{$Latency-Tolerant$\}$ software distributed shared memory}, + author={Nelson, Jacob and Holt, Brandon and Myers, Brandon and Briggs, Preston and Ceze, Luis and Kahan, Simon and Oskin, Mark}, + booktitle={2015 USENIX Annual Technical Conference (USENIX ATC 15)}, + pages={291--305}, + year={2015} +} + + + + diff --git a/tex/misc/background_draft.pdf b/tex/misc/background_draft.pdf index a8e6831..d7f6835 100644 Binary files a/tex/misc/background_draft.pdf and b/tex/misc/background_draft.pdf differ diff --git a/tex/misc/background_draft.tex b/tex/misc/background_draft.tex index 14437a9..fa46d48 100644 --- a/tex/misc/background_draft.tex +++ b/tex/misc/background_draft.tex @@ -1,29 +1,136 @@ \documentclass{article} +\usepackage[utf8]{inputenc} +\usepackage[dvipsnames]{xcolor} \usepackage{biblatex} \addbibresource{background_draft.bib} \begin{document} -% \chapter{Backgrounds} -Recent studies has shown a reinvigorated interest in disaggregated/distributed -shared memory systems last seen in the 1990s. While large-scale cluster systems -remain predominantly the solution for massively parallel computation, it is known -to -The interplay between (page) replacement policy and runtime performance of -distributed shared memory systems has not been properly explored. 
+Though large-scale cluster systems remain the dominant solution for request-level
+and data-level parallelism \cite{BOOK.Hennessy_Patterson.CArch.2011}, there has
+been a resurgence of interest in applying HPC techniques (e.g., DSM) for more
+efficient heterogeneous computation, with tightly coupled heterogeneous nodes
+providing (hardware) acceleration for one another \cite{Cabezas_etal.GPU-SM.2015}
+\textcolor{red}{[ADD MORE CITATIONS]}. Within the scope of one node,
+\emph{heterogeneous memory management (HMM)} enables an OS-controlled, unified
+memory view of the entire memory landscape across attached devices
+\cite{WEB.NVIDIA.Harris.Unified_Memory_CUDA.2017}, all while using the same libc
+function calls as one would in SMP programming, with the underlying complexities
+of memory ownership and locality managed by the OS kernel.
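+
+As a minimal sketch of this programming model (the kernel, sizes, and launch
+parameters below are illustrative only), CUDA managed memory lets a single
+allocation be touched by both CPU and GPU through one pointer, with page migration
+handled transparently \cite{WEB.NVIDIA.Harris.Unified_Memory_CUDA.2017}; on systems
+with full HMM support the same idea extends even to ordinary malloc-ed memory.
+\begin{verbatim}
+// Sketch: one pointer shared by host and device (compile with nvcc).
+#include <stdio.h>
+#include <cuda_runtime.h>
+
+__global__ void scale(float *v, int n, float k) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n) v[i] *= k;                       // device writes via the same pointer
+}
+
+int main(void) {
+  const int n = 1 << 20;
+  float *v = NULL;
+  cudaMallocManaged(&v, n * sizeof(float));   // visible to CPU and GPU alike
+  for (int i = 0; i < n; i++) v[i] = 1.0f;    // CPU touches the pages first
+  scale<<<(n + 255) / 256, 256>>>(v, n, 2.0f);
+  cudaDeviceSynchronize();                    // pages migrate back on demand
+  printf("v[0] = %f\n", v[0]);                // CPU reads the GPU's result directly
+  cudaFree(v);
+  return 0;
+}
+\end{verbatim}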
-\section{Overview of Distributed Shared Memory}
+Nevertheless, while HMM promises a distributed shared memory approach to exposing
+CPU and peripheral memory, the applications (drivers and front-ends) that exploit
+HMM to provide ergonomic programming models remain fragmented and narrowly focused.
+Existing efforts to exploit HMM in Linux predominantly focus on exposing a global
+address space abstraction over GPU memory -- a largely uncoordinated effort
+spanning both \textit{in-tree} and proprietary code
+\cites{WEB.LWN.Corbet.HMM_GPL_woes.2018}{WEB.Phoronix..HMM_Search_Results.2023}.
+Little effort has gone into incorporating HMM for other kinds of accelerators or
+other system topologies.
-A striking feature in the study of distributed shared memory (DSM) systems is the
-non-uniformity of the terminologies used to describe overlapping study interests.
-The majority of contributions to DSM study come from the 1990s, for example
-\textbf{[Treadmark, Millipede, Munin, Shiva, etc.]}. These DSM systems attempt to
-leverage kernel system calls to allow for user-level DSM over ethernet NICs. While
-these systems provide a strong theoretical basis for today's majority-software
-DSM systems and applications that expose a \emph{(partitioned) global address space},
-they were nevertheless constrained by the limitations in NIC transfer rate and
-bandwidth, and the concept of DSM failed to take off (relative to cluster computing).
+Orthogonally, allocating hardware accelerator resources in a cluster computing
+environment becomes difficult when a workload's acceleration requirements cannot
+be easily determined and/or isolated. Within a cluster there may be a large number
+of general-purpose worker nodes and a limited number of hardware-accelerated nodes.
+Further, it is possible that every workload on the cluster wants hardware
+acceleration from time to time, but never for very long. Many job scheduling
+mechanisms within a cluster \emph{move data near computation} by migrating the
+entire job/container between general-purpose and accelerator nodes
+\cites{Rodriguez_etal.HPC_Cluster_Migration.2019}{Oh_Kim.Container_Migration.2018}.
+This form of migration naturally incurs large overheads -- for starters, accelerator
+nodes that strictly perform in-memory computing, without ever touching the
+container's filesystem, should not have to install the entire filesystem locally.
+Moreover, must \emph{all} computations be near data?
+\cite{Masouros_etal.Adrias.2023}, for example, shows that RDMA over fast network
+interfaces ($25 \times 8$ Gbps) has a negligible impact on tail latencies but a
+significant impact on throughput once bandwidth is saturated.
+This thesis builds upon an ongoing research effort to implement a tightly coupled
+cluster in which HMM abstractions allow transparent RDMA access from accelerator
+nodes to local data, as well as data migration near computation, focusing on the
+effect of replacement policies in balancing the cost of near-data and far-data
+computation between the home node and the accelerator node. \textcolor{red}{
+Specifically, this paper explores the possibility of implementing shared page
+movement between home and accelerator nodes to enable efficient memory over-commit
+without the I/O-intensive swapping overhead.}
+
+\textcolor{red}{The rest of the chapter is structured as follows\dots}
+
+\section{Experiences from Software DSM}
+The majority of contributions to the study of software DSM systems come from the
+1990s \cites{Amza_etal.Treadmarks.1996}{Carter_Bennett_Zwaenepoel.Munin.1991}
+{Itzkovitz_Schuster_Shalev.Millipede.1998}{Hu_Shi_Tang.JIAJIA.1999}. These
+developments follow from the success of the Stanford DASH project in the late
+1980s -- a hardware distributed shared memory (i.e., NUMA) multiprocessor that
+pioneered a scalable \textit{directory-based protocol} for cache coherence, which
+tracks the ownership of each cache line to avoid the broadcast coherence traffic
+that prevented bus-based SMPs from scaling out \cite{Lenoski_etal.Stanford_DASH.1992}.
+
+While developments in hardware DSM have matured into a universal approach to cache
+coherence in contemporary many-core processors (e.g., \textit{Ampere
+Altra} \cite{WEB.Ampere..Ampere_Altra_Datasheet.2023}), software DSM in clustered
+computing languished in favor of loosely coupled nodes performing data-parallel
+computation and communicating via message passing. The network interfaces of the
+late 1990s lacked the bandwidth to support the heavy traffic incurred by DSM and
+its programming model \cites{Werstein_Pethick_Huang.PerfAnalysis_DSM_MPI.2003}
+{Lu_etal.MPI_vs_DSM_over_cluster.1995}.
+
+Newer network interfaces provide much improved bandwidth and latency compared to
+the Ethernet of the 1990s. RDMA-capable NICs have been shown to improve training
+efficiency sixfold compared to distributed TensorFlow over RPC, while scaling
+positively relative to non-distributed training
+\cite{Jia_etal.Tensorflow_over_RDMA.2018}. Similar results have been observed for
+Spark \cite{Lu_etal.Spark_over_RDMA.2014} \textcolor{red}{and what?}. Consequently,
+there has been a resurgence of interest in software DSM systems and their
+corresponding programming models
+\cites{Nelson_etal.Grappa_DSM.2015}{Cai_etal.Distributed_Memory_RDMA_Cached.2018}.
+
+% Unlike DSM-over-RDMA, we try to expose RDMA as a device with HMM capability,
+% i.e., we do it in the kernel as opposed to in userspace. An accelerator node can
+% then access the local node's shared pages the way a DMA device does, via HMM.
+
+\subsection{Munin: Multiple Consistency Protocols}
+\textit{Munin} \cite{Carter_Bennett_Zwaenepoel.Munin.1991} is one of the earlier
+software DSM systems. Its authors identify \textit{false sharing} -- multiple
+processors writing to different offsets of the same page and thereby triggering
+invalidations of each other's copies -- as strongly detrimental to the performance
+of shared-memory systems. To combat this, Munin exposes annotations as part of its
+programming model to facilitate multiple consistency protocols on top of release
+consistency. A shared memory object that is immutable across readers, for example,
+can be safely replicated without concern for coherence between processors. On the
+other hand, the \textit{write-shared} annotation declares that a memory object is
+written by multiple processors without synchronization -- i.e., the programmer
+guarantees that only false sharing occurs at this granularity. Annotations such as
+these explicitly disable subsets of the consistency machinery to reduce
+communication over the network fabric, thereby improving the performance of the
+DSM system.
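+
+To make the write-shared case concrete, the following sketch shows the diff-based
+mechanism that Munin-style multi-writer protocols rely on (the structure and
+function names here are illustrative, not Munin's actual interface): on the first
+local write the runtime snapshots a pristine ``twin'' of the page, and at release
+time only the bytes that differ are shipped, so writers to disjoint offsets never
+invalidate one another.
+\begin{verbatim}
+// Sketch of twin/diff handling for a write-shared page (names illustrative).
+#include <stdio.h>
+#include <string.h>
+
+#define PAGE_SIZE 4096
+
+struct ws_page {
+  unsigned char data[PAGE_SIZE];
+  unsigned char twin[PAGE_SIZE];   /* snapshot taken on the first local write */
+};
+
+static void on_first_write(struct ws_page *p) {
+  memcpy(p->twin, p->data, PAGE_SIZE);           /* keep a pristine copy */
+}
+
+/* At release: collect (offset, value) pairs for every modified byte.  Peers merge
+ * these diffs into their own copies instead of invalidating the whole page.     */
+static size_t make_diff(const struct ws_page *p,
+                        size_t offsets[], unsigned char bytes[]) {
+  size_t n = 0;
+  for (size_t i = 0; i < PAGE_SIZE; i++)
+    if (p->data[i] != p->twin[i]) { offsets[n] = i; bytes[n] = p->data[i]; n++; }
+  return n;
+}
+
+int main(void) {
+  static struct ws_page page;                    /* zero-initialized */
+  static size_t off[PAGE_SIZE];
+  static unsigned char val[PAGE_SIZE];
+  on_first_write(&page);                         /* write fault: make the twin */
+  page.data[42] = 7;                             /* this node writes offset 42 only */
+  printf("diff entries at release: %zu\n", make_diff(&page, off, val));  /* -> 1 */
+  return 0;
+}
+\end{verbatim}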
+
+Perhaps most importantly, experience from Munin shows that \emph{restricting the
+flexibility of the programming model can lead to more performant coherence
+protocols}, as \textcolor{teal}{corroborated} by the now-foundational
+\textit{Resilient Distributed Datasets} paper \cite{Zaharia_etal.RDD.2012} -- the
+abstraction underpinning the now-popular \textit{Apache
+Spark}\cite{WEB.APACHE..Apache_Spark.2023} and sharing its restricted-model
+philosophy with \textit{Hadoop MapReduce}\cite{WEB.APACHE..Apache_Hadoop.2023}.
+``To achieve fault tolerance efficiently, RDDs provide a restricted form of shared
+memory [based on]\dots transformations rather than\dots updates to shared state''
+\cite{Zaharia_etal.RDD.2012}. This allows transformation (lineage) logs to cheaply
+synchronize state between unshared address spaces -- a much desired property for
+highly scalable, loosely coupled clustered systems.
+
+\subsection{Treadmarks: Multi-Writer Protocol}
+\textit{Treadmarks} \cite{Amza_etal.Treadmarks.1996} is a software DSM system
+presented in 1996. It combines lazy release consistency with a
+\emph{multi-writer protocol}: concurrent writers to the same page each record
+their modifications as diffs against a pristine twin, and the diffs are merged at
+synchronization points, so false sharing does not force whole pages to bounce
+between nodes.
+
+% The majority of contributions to DSM study come from the 1990s, for example
+% \textbf{[Treadmark, Millipede, Munin, Shiva, etc.]}. These DSM systems attempt to
+% leverage kernel system calls to allow for user-level DSM over ethernet NICs. While
+% these systems provide a strong theoretical basis for today's majority-software
+% DSM systems and applications that expose a \emph{(partitioned) global address space},
+% they were nevertheless constrained by the limitations in NIC transfer rate and
+% bandwidth, and the concept of DSM failed to take off (relative to cluster computing).
+
+\section{HPC and Partitioned Global Address Space}
 Improvement in NIC bandwidth and transfer rate allows for applications that expose
 global address space, as well as RDMA technologies that leverage single-writer
 protocols over hierarchical memory nodes. \textbf{[GAS and PGAS (Partitioned GAS)
@@ -40,6 +147,8 @@ processing frameworks, for example APACHE Spark, Memcached, and Redis,
 over no-disaggregation (i.e., using node-local memory only, similar to cluster
 computing) systems.
+\subsection{Programming Model}
+
 \subsection{Move Data to Process, or Move Process to Data?}
 (TBD -- The former is costly for data-intensive computation, but the latter may be
 impossible for certain tasks, and greatly hardens the replacement problem.)
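+
+As a rough, purely illustrative framing of that trade-off (every number below is an
+assumption, not a measurement; the link rate reuses the aggregate $25 \times 8$ Gbps
+figure quoted earlier), migrating a process has a roughly fixed cost, while moving
+data scales with the working set, so there is a break-even working-set size that
+separates the two regimes:
+\begin{verbatim}
+// Back-of-envelope sketch: move data to the process vs. move the process to data.
+#include <stdio.h>
+
+int main(void) {
+  double link_gbps    = 200.0;  /* aggregate RDMA bandwidth (25 x 8 Gbps)      */
+  double container_gb = 32.0;   /* assumed: image + filesystem + address space */
+  double restore_s    = 10.0;   /* assumed checkpoint/restore overhead         */
+
+  /* Below the break-even working set, shipping pages to the accelerator wins;
+   * above it (data-intensive jobs), migrating the process wins.               */
+  double move_process_s  = container_gb * 8.0 / link_gbps + restore_s;
+  double breakeven_ws_gb = move_process_s * link_gbps / 8.0;
+  printf("migrating the container: %.2f s\n", move_process_s);   /* ~11.28 s */
+  printf("break-even working set:  %.0f GB\n", breakeven_ws_gb); /* ~282 GB  */
+  return 0;
+}
+\end{verbatim}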