668 lines
No EOL
32 KiB
BibTeX
668 lines
No EOL
32 KiB
BibTeX
@article{Aguilar_Leiss.Coherence-Replacement.2006,
  author = {Aguilar, J. and Leiss, E. L.},
  title = {A Coherence-Replacement Protocol For Web Proxy Cache Systems},
  journal = {International Journal of Computers and Applications},
  volume = {28},
  number = {1},
  pages = {12--18},
  year = {2006},
  publisher = {Taylor \& Francis},
  doi = {10.1080/1206212X.2006.11441783},
  url = {https://doi.org/10.1080/1206212X.2006.11441783},
}
|
||
@article{Amza_etal.Treadmarks.1996,
  author = {Amza, Cristiana and Cox, Alan L. and Dwarkadas, Sandhya and Keleher, Pete and Lu, Honghui and Rajamony, Ramakrishnan and Yu, Weimin and Zwaenepoel, Willy},
  title = {{TreadMarks}: Shared memory computing on networks of workstations},
  journal = {Computer},
  volume = {29},
  number = {2},
  pages = {18--28},
  year = {1996},
  publisher = {IEEE},
}
|
||
@misc{ARM.ARMv8-A.v1.0.2015,
  author = {{ARM}},
  title = {{ARM\textregistered} {Cortex\textregistered-A} Series Programmer's Guide for {ARMv8-A}},
  howpublished = {Arm Developer Documentation},
  publisher = {ARM},
  year = {2015},
  url = {https://developer.arm.com/documentation/den0024/a},
}
|
||
@book{AST_Steen.Distributed_Systems-3ed.2017,
  author = {van Steen, Maarten and Tanenbaum, Andrew S.},
  title = {Distributed systems},
  edition = {Third},
  year = {2017},
  publisher = {Maarten van Steen},
  address = {Leiden, The Netherlands},
}
|
||
@article{Bell_Gray.HPC_is_Cluster.2002,
  author = {Bell, Gordon and Gray, Jim},
  title = {What's next in high-performance computing?},
  journal = {Communications of the ACM},
  volume = {45},
  number = {2},
  pages = {91--95},
  year = {2002},
  publisher = {ACM},
  address = {New York, NY, USA},
}
|
||
@book{BOOK.Hennessy_Patterson.CArch.2011,
  author = {Hennessy, John L. and Patterson, David A.},
  title = {Computer architecture: a quantitative approach},
  year = {2011},
  publisher = {Elsevier},
}
|
||
@inproceedings{Cabezas_etal.GPU-SM.2015,
  author = {Cabezas, Javier and Jord{\`a}, Marc and Gelado, Isaac and Navarro, Nacho and Hwu, Wen-mei},
  title = {{GPU-SM}: shared memory multi-{GPU} programming},
  booktitle = {Proceedings of the 8th Workshop on General Purpose Processing using {GPUs}},
  pages = {13--24},
  year = {2015},
}
|
||
@article{Cai_etal.Distributed_Memory_RDMA_Cached.2018,
  author = {Cai, Qingchao and Guo, Wentian and Zhang, Hao and Agrawal, Divyakant and Chen, Gang and Ooi, Beng Chin and Tan, Kian-Lee and Teo, Yong Meng and Wang, Sheng},
  title = {Efficient distributed memory management with {RDMA} and caching},
  journal = {Proceedings of the VLDB Endowment},
  volume = {11},
  number = {11},
  pages = {1604--1617},
  year = {2018},
  publisher = {VLDB Endowment},
}
|
||
@article{Carter_Bennett_Zwaenepoel.Munin.1991,
  author = {Carter, John B. and Bennett, John K. and Zwaenepoel, Willy},
  title = {Implementation and performance of {Munin}},
  journal = {ACM SIGOPS Operating Systems Review},
  volume = {25},
  number = {5},
  pages = {152--164},
  year = {1991},
  publisher = {ACM},
  address = {New York, NY, USA},
}
|
||
@inproceedings{Chaiken_Kubiatowicz_Agarwal.LimitLESS-with-Alewife.1991,
  author = {Chaiken, David and Kubiatowicz, John and Agarwal, Anant},
  title = {{LimitLESS} directories: A scalable cache coherence scheme},
  booktitle = {Proceedings of the Fourth International Conference on Architectural Support for Programming Languages and Operating Systems},
  series = {ASPLOS IV},
  pages = {224--234},
  numpages = {11},
  year = {1991},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  location = {Santa Clara, California, USA},
  isbn = {0897913809},
  doi = {10.1145/106972.106995},
  url = {https://doi.org/10.1145/106972.106995},
}
|
||
@misc{Corbet.LWN-NC-DMA.2021,
  author = {Corbet, Jonathan},
  title = {Noncoherent {DMA} mappings},
  publisher = {LWN.net},
  year = {2021},
  url = {https://lwn.net/Articles/855328/},
}
|
||
@inproceedings{Couceiro_etal.D2STM.2009,
  author = {Couceiro, Maria and Romano, Paolo and Carvalho, Nuno and Rodrigues, Lu{\'\i}s},
  title = {{D2STM}: Dependable distributed software transactional memory},
  booktitle = {2009 15th IEEE Pacific Rim International Symposium on Dependable Computing},
  pages = {307--313},
  year = {2009},
  organization = {IEEE},
}
|
||
@article{De_Wael_etal.PGAS_Survey.2015,
  author = {De Wael, Mattias and Marr, Stefan and De Fraine, Bruno and Van Cutsem, Tom and De Meuter, Wolfgang},
  title = {Partitioned global address space languages},
  journal = {ACM Computing Surveys (CSUR)},
  volume = {47},
  number = {4},
  pages = {1--27},
  year = {2015},
  publisher = {ACM},
  address = {New York, NY, USA},
}
|
||
@inproceedings{Ding.vDSM.2018,
  author = {Ding, Zhuocheng},
  title = {{vDSM}: Distributed Shared Memory in Virtualized Environments},
  booktitle = {2018 IEEE 9th International Conference on Software Engineering and Service Science (ICSESS)},
  pages = {1112--1115},
  year = {2018},
  doi = {10.1109/ICSESS.2018.8663720},
  keywords = {Virtual machine monitors;Optimization;Protocols;Virtualization;Operating systems;Stress;Analytical models;component;distributed shared memory;virtuali-zation;low-latency network},
}
|
||
@inproceedings{Eisley_Peh_Shang.In-net-coherence.2006,
  author = {Eisley, Noel and Peh, Li-Shiuan and Shang, Li},
  title = {In-network cache coherence},
  booktitle = {2006 39th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO'06)},
  pages = {321--332},
  year = {2006},
  organization = {IEEE},
}
|
||
@inproceedings{Endo_Sato_Taura.MENPS_DSM.2020,
  author = {Endo, Wataru and Sato, Shigeyuki and Taura, Kenjiro},
  title = {{MENPS}: a decentralized distributed shared memory exploiting {RDMA}},
  booktitle = {2020 IEEE/ACM Fourth Annual Workshop on Emerging Parallel and Distributed Runtime Systems and Middleware (IPDRM)},
  pages = {9--16},
  year = {2020},
  organization = {IEEE},
}
|
||
@article{Fleisch_Popek.Mirage.1989,
  author = {Fleisch, Brett and Popek, Gerald},
  title = {{Mirage}: A coherent distributed shared memory design},
  journal = {ACM SIGOPS Operating Systems Review},
  volume = {23},
  number = {5},
  pages = {211--223},
  year = {1989},
  publisher = {ACM},
  address = {New York, NY, USA},
}
|
||
@misc{FreeBSD.man-BPF-4.2021,
  author = {{The FreeBSD Project}},
  title = {{FreeBSD} manual pages},
  howpublished = {BPF(4) Kernel Interfaces Manual},
  publisher = {The FreeBSD Project},
  year = {2021},
  url = {https://man.freebsd.org/cgi/man.cgi?query=bpf&manpath=FreeBSD+14.0-RELEASE+and+Ports},
}
|
||
@inproceedings{Giri_Mantovani_Carloni.NoC-CC-over-SoC.2018,
  author = {Giri, Davide and Mantovani, Paolo and Carloni, Luca P.},
  title = {{NoC}-based support of heterogeneous cache-coherence models for accelerators},
  booktitle = {2018 Twelfth IEEE/ACM International Symposium on Networks-on-Chip (NOCS)},
  pages = {1--8},
  year = {2018},
  organization = {IEEE},
}
|
||
@mastersthesis{Holsapple.DSM64.2012,
  author = {Holsapple, Stephen Alan},
  title = {{DSM64}: A Distributed Shared Memory System in User-Space},
  school = {California Polytechnic State University},
  year = {2012},
}
|
||
@article{Hong_etal.NUMA-to-RDMA-DSM.2019,
  author = {Hong, Yang and Zheng, Yang and Yang, Fan and Zang, Bin-Yu and Guan, Hai-Bing and Chen, Hai-Bo},
  title = {Scaling out {NUMA}-aware applications with {RDMA}-based distributed shared memory},
  journal = {Journal of Computer Science and Technology},
  volume = {34},
  pages = {94--112},
  year = {2019},
  publisher = {Springer},
}
|
||
@inproceedings{Hu_Shi_Tang.JIAJIA.1999,
  author = {Hu, Weiwu and Shi, Weisong and Tang, Zhimin},
  title = {{JIAJIA}: A software {DSM} system based on a new cache coherence protocol},
  booktitle = {High-Performance Computing and Networking: 7th International Conference, HPCN Europe 1999, Amsterdam, The Netherlands, April 12--14, 1999, Proceedings},
  pages = {461--472},
  year = {1999},
  publisher = {Springer},
}
|
||
@misc{ISO/IEC_9899:2011.C11,
  title = {{BS ISO/IEC} 9899:2011: Information technology. Programming languages. {C}},
  publisher = {British Standards Institute},
  year = {2013},
  isbn = {9780580801655},
  language = {eng},
  keywords = {Data processing ; Data representation ; Languages used in information technology ; Programming ; Programming languages ; Semantics ; Syntax},
  abstract = {Edition Status: Withdrawn on 2018-07-13},
}
|
||
@misc{ISO/IEC_JTC1_SC22_WG21_N2427.C++11.2007,
  author = {Boehm, Hans J. and Crowl, Lawrence},
  title = {{C++} Atomic Types and Operations},
  howpublished = {ISO/IEC JTC1/SC22/WG21 document N2427},
  publisher = {ISO/IEC JTC 1},
  year = {2007},
  url = {https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2427.html},
}
|
||
@article{Itzkovitz_Schuster_Shalev.Millipede.1998,
  author = {Itzkovitz, Ayal and Schuster, Assaf and Shalev, Lea},
  title = {Thread migration and its applications in distributed shared memory systems},
  journal = {Journal of Systems and Software},
  volume = {42},
  number = {1},
  pages = {71--87},
  year = {1998},
  publisher = {Elsevier},
}
|
||
@article{Jaleel_etal.RRIP.2010,
  author = {Jaleel, Aamer and Theobald, Kevin B. and Steely, Jr., Simon C. and Emer, Joel},
  title = {High performance cache replacement using re-reference interval prediction ({RRIP})},
  journal = {ACM SIGARCH Computer Architecture News},
  volume = {38},
  number = {3},
  pages = {60--71},
  year = {2010},
  publisher = {ACM},
  address = {New York, NY, USA},
}
|
||
@article{Jia_etal.Tensorflow_over_RDMA.2018,
  author = {Jia, Chengfan and Liu, Junnan and Jin, Xu and Lin, Han and An, Hong and Han, Wenting and Wu, Zheng and Chi, Mengxian},
  title = {Improving the performance of distributed {TensorFlow} with {RDMA}},
  journal = {International Journal of Parallel Programming},
  volume = {46},
  pages = {674--685},
  year = {2018},
  publisher = {Springer},
}
|
||
@inproceedings{Kaxiras_etal.DSM-Argos.2015,
  author = {Kaxiras, Stefanos and Klaftenegger, David and Norgren, Magnus and Ros, Alberto and Sagonas, Konstantinos},
  title = {Turning Centralized Coherence and Distributed Critical-Section Execution on their Head: A New Approach for Scalable Distributed Shared Memory},
  booktitle = {Proceedings of the 24th International Symposium on High-Performance Parallel and Distributed Computing},
  series = {HPDC '15},
  pages = {3--14},
  numpages = {12},
  year = {2015},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  location = {Portland, Oregon, USA},
  isbn = {9781450335508},
  doi = {10.1145/2749246.2749250},
  url = {https://doi.org/10.1145/2749246.2749250},
  abstract = {A coherent global address space in a distributed system enables shared memory programming in a much larger scale than a single multicore or a single SMP. Without dedicated hardware support at this scale, the solution is a software distributed shared memory (DSM) system. However, traditional approaches to coherence (centralized via "active" home-node directories) and critical-section execution (distributed across nodes and cores) are inherently unfit for such a scenario. Instead, it is crucial to make decisions locally and avoid the long latencies imposed by both network and software message handlers. Likewise, synchronization is fast if it rarely involves communication with distant nodes (or even other sockets). To minimize the amount of long-latency communication required in both coherence and critical section execution, we propose a DSM system with a novel coherence protocol, and a novel hierarchical queue delegation locking approach. More specifically, we propose an approach, suitable for Data-Race-Free programs, based on self-invalidation, self-downgrade, and passive data classification directories that require no message handlers, thereby incurring no extra latency. For fast synchronization we extend Queue Delegation Locking to execute critical sections in large batches on a single core before passing execution along to other cores, sockets, or nodes, in that hierarchical order. The result is a software DSM system called Argo which localizes as many decisions as possible and allows high parallel performance with little overhead on synchronization when compared to prior DSM implementations.},
}
|
||
@inproceedings{Khawaja_etal.AmorphOS.2018,
  author = {Khawaja, Ahmed and Landgraf, Joshua and Prakash, Rohith and Wei, Michael and Schkufza, Eric and Rossbach, Christopher J.},
  title = {Sharing, Protection, and Compatibility for Reconfigurable Fabric with {AmorphOS}},
  booktitle = {13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)},
  pages = {107--127},
  year = {2018},
}
|
||
@article{Khokhar_etal.HetComputingVision.1993,
  author = {Khokhar, Ashfaq A. and Prasanna, Viktor K. and Shaaban, Muhammad E. and Wang, C.-L.},
  title = {Heterogeneous computing: Challenges and opportunities},
  journal = {Computer},
  volume = {26},
  number = {6},
  pages = {18--27},
  year = {1993},
  publisher = {IEEE},
}
|
||
@inproceedings{Kim_etal.DeX-upon-Linux.2020,
  author = {Kim, Sang-Hoon and Chuang, Ho-Ren and Lyerly, Robert and Olivier, Pierre and Min, Changwoo and Ravindran, Binoy},
  title = {{DeX}: Scaling Applications Beyond Machine Boundaries},
  booktitle = {2020 IEEE 40th International Conference on Distributed Computing Systems (ICDCS)},
  pages = {864--876},
  year = {2020},
  doi = {10.1109/ICDCS47774.2020.00021},
  keywords = {Protocols;Instruction sets;Linux;Prototypes;Distributed databases;Programming;Kernel;Thread migration;distributed execution;distributed memory;RDMA},
}
|
||
|
||
@article{Kjos_etal.HP-HW-CC-IO.1996,
  author = {Kjos, Todd J. and Nusbaum, Helen and Traynor, Michael K. and Voge, Brendan A.},
  title = {Hardware cache coherent input/output},
  journal = {Hewlett-Packard Journal},
  volume = {47},
  number = {1},
  pages = {52--59},
  year = {1996},
  publisher = {Hewlett-Packard Co},
  address = {Palo Alto},
  issn = {0018-1153},
  language = {eng},
  keywords = {Computer Science ; Computer Science, Hardware \& Architecture ; Engineering ; Engineering, Electrical \& Electronic ; Instruments \& Instrumentation ; Science \& Technology ; Technology},
  copyright = {Copyright 2006 Elsevier B.V., All rights reserved.},
  abstract = {Hardware cache coherent I/O is a new feature of the PA-RISC architecture that involves the I/O hardware in ensuring cache coherence, thereby reducing CPU and memory overhead and increasing performance.},
}
|
||
|
||
@article{LaRowe_Ellis.Repl_NUMA.1991,
  author = {LaRowe, Richard P. and Ellis, Carla Schlatter},
  title = {Page placement policies for {NUMA} multiprocessors},
  journal = {Journal of Parallel and Distributed Computing},
  volume = {11},
  number = {2},
  pages = {112--129},
  year = {1991},
  issn = {0743-7315},
  doi = {10.1016/0743-7315(91)90117-R},
  url = {https://www.sciencedirect.com/science/article/pii/074373159190117R},
  abstract = {In many parallel applications, the size of the program's data exceeds even the very large amount of main memory available on large-scale multiprocessors. Virtual memory, in the sense of a transparent management of the main/secondary memory hierarchy, is a natural solution. The replacement, fetch, and placement policies used in uniprocessor paging systems need to be reexamined in light of the differences in the behavior of parallel computations and in the memory architectures of multiprocessors. In particular, we investigate the impact of page placement in nonuniform memory access time (NUMA) shared memory MIMD machines. We experimentally evaluate several paging algorithms that incorporate different approaches to the placement issue. Under certain workload assumptions, our results show that placement algorithms that are strongly biased toward local frame allocation but are able to borrow remote frames can reduce the number of page faults over strictly local allocation. The increased cost of memory operations due to the extra remote accesses is more than compensated for by the savings resulting from the reduction in demand fetches, effectively reducing the computation completion time for these programs without having adverse effects on the performance of ``typical'' NUMA programs. We also discuss some early results obtained from an actual kernel implementation of one of our page placement algorithms.},
}
|
||
|
||
@article{Lenoski_etal.Stanford_DASH.1992,
  author = {Lenoski, Daniel and Laudon, James and Gharachorloo, Kourosh and Weber, W.-D. and Gupta, Anoop and Hennessy, John and Horowitz, Mark and Lam, Monica S.},
  title = {The {Stanford} {Dash} multiprocessor},
  journal = {Computer},
  volume = {25},
  number = {3},
  pages = {63--79},
  year = {1992},
  publisher = {IEEE},
}
|
||
|
||
@inproceedings{Li_etal.RelDB_RDMA.2016,
  author = {Li, Feng and Das, Sudipto and Syamala, Manoj and Narasayya, Vivek R.},
  title = {Accelerating relational databases by leveraging remote memory and {RDMA}},
  booktitle = {Proceedings of the 2016 International Conference on Management of Data},
  pages = {355--370},
  year = {2016},
}
|
||
|
||
@inproceedings{Lu_etal.MPI_vs_DSM_over_cluster.1995,
  author = {Lu, Honghui and Dwarkadas, Sandhya and Cox, Alan L. and Zwaenepoel, Willy},
  title = {Message passing versus distributed shared memory on networks of workstations},
  booktitle = {Supercomputing'95: Proceedings of the 1995 ACM/IEEE Conference on Supercomputing},
  pages = {37},
  year = {1995},
  organization = {IEEE},
}
|
||
|
||
@inproceedings{Lu_etal.Spark_over_RDMA.2014,
  author = {Lu, Xiaoyi and Rahman, Md Wasi Ur and Islam, Nusrat and Shankar, Dipti and Panda, Dhabaleswar K.},
  title = {Accelerating {Spark} with {RDMA} for big data processing: Early experiences},
  booktitle = {2014 IEEE 22nd Annual Symposium on High-Performance Interconnects},
  pages = {9--16},
  year = {2014},
  organization = {IEEE},
}
|
||
|
||
@inproceedings{Ma_etal.SHM_FPGA.2020,
  author = {Ma, Jiacheng and Zuo, Gefei and Loughlin, Kevin and Cheng, Xiaohe and Liu, Yanqiang and Eneyew, Abel Mulugeta and Qi, Zhengwei and Kasikci, Baris},
  title = {A hypervisor for shared-memory {FPGA} platforms},
  booktitle = {Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems},
  pages = {827--844},
  year = {2020},
}
|
||
|
||
@misc{Manson_Goetz.JSR_133.Java_5.2004,
  author = {Manson, Jeremy and Goetz, Brian},
  title = {{JSR} 133 ({Java} Memory Model) {FAQ}},
  publisher = {Department of Computer Science, University of Maryland},
  year = {2004},
  url = {https://www.cs.umd.edu/~pugh/java/memoryModel/jsr-133-faq.html},
}
|
||
|
||
@misc{many.MSFTLearn-SMBDirect.2024,
  author = {Xelu86 and ManikaDhiman and dknappettmsft and v-alje and nedpyle and eross-msft and SubodhBhargava and JasonGerend and lizap and Heidilohr},
  title = {{SMB} {Direct}},
  howpublished = {Microsoft Learn},
  publisher = {Microsoft},
  year = {2024},
  url = {https://learn.microsoft.com/en-us/windows-server/storage/file-server/smb-direct},
}
|
||
|
||
@inproceedings{Masouros_etal.Adrias.2023,
  author = {Masouros, Dimosthenis and Pinto, Christian and Gazzetti, Michele and Xydis, Sotirios and Soudris, Dimitrios},
  title = {{Adrias}: Interference-Aware Memory Orchestration for Disaggregated Cloud Infrastructures},
  booktitle = {2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)},
  pages = {855--869},
  year = {2023},
  organization = {IEEE},
}
|
||
|
||
@misc{Miller_Henderson_Jelinek.Kernelv6.7-DMA_guide.2024,
  author = {Miller, David S. and Henderson, Richard and Jelinek, Jakub},
  title = {Dynamic {DMA} mapping Guide},
  howpublished = {The Linux Kernel},
  year = {2024},
  url = {https://www.kernel.org/doc/html/v6.7/core-api/dma-api-howto.html},
}
|
||
|
||
@book{Nagarajan_etal.Primer_consistency_coherence_arch.2ed.2020,
  author = {Nagarajan, Vijay and Sorin, Daniel J. and Hill, Mark D. and Wood, David A.},
  title = {A primer on memory consistency and cache coherence},
  edition = {Second},
  year = {2020},
  publisher = {Springer Nature},
}
|
||
|
||
@inproceedings{narayanan2020heterogeneity,
  author = {Narayanan, Deepak and Santhanam, Keshav and Kazhamiaka, Fiodar and Phanishayee, Amar and Zaharia, Matei},
  title = {{Heterogeneity-Aware} cluster scheduling policies for deep learning workloads},
  booktitle = {14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)},
  pages = {481--498},
  year = {2020},
}
|
||
|
||
@inproceedings{Nelson_etal.Grappa_DSM.2015,
  author = {Nelson, Jacob and Holt, Brandon and Myers, Brandon and Briggs, Preston and Ceze, Luis and Kahan, Simon and Oskin, Mark},
  title = {{Latency-Tolerant} software distributed shared memory},
  booktitle = {2015 USENIX Annual Technical Conference (USENIX ATC 15)},
  pages = {291--305},
  year = {2015},
}
|
||
|
||
@inproceedings{Oh_Kim.Container_Migration.2018,
  author = {Oh, SeungYong and Kim, JongWon},
  title = {Stateful Container Migration employing Checkpoint-based Restoration for Orchestrated Container Clusters},
  booktitle = {2018 International Conference on Information and Communication Technology Convergence (ICTC)},
  pages = {25--30},
  year = {2018},
  doi = {10.1109/ICTC.2018.8539562},
}
|
||
|
||
@misc{Parris.AMBA_4_ACE-Lite.2013,
  author = {Parris, Neil},
  title = {Extended system coherency: Cache Coherency Fundamentals},
  howpublished = {Architectures and Processors blog, Arm Community},
  publisher = {ARM Community Blogs},
  year = {2013},
  url = {https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/extended-system-coherency---part-1---cache-coherency-fundamentals},
}
|
||
|
||
@inproceedings{Pinto_etal.Thymesisflow.2020,
  author = {Pinto, Christian and Syrivelis, Dimitris and Gazzetti, Michele and Koutsovasilis, Panos and Reale, Andrea and Katrinis, Kostas and Hofstee, H. Peter},
  title = {{ThymesisFlow}: A software-defined, {HW/SW} co-designed interconnect stack for rack-scale memory disaggregation},
  booktitle = {2020 53rd Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
  pages = {868--880},
  year = {2020},
  organization = {IEEE},
}
|
||
|
||
@article{Rodriguez_etal.HPC_Cluster_Migration.2019,
  author = {Rodr{\'\i}guez-Pascual, Manuel and Cao, Jiajun and Mor{\'\i}{\~n}igo, Jos{\'e} A. and Cooperman, Gene and Mayo-Garc{\'\i}a, Rafael},
  title = {Job migration in {HPC} clusters by means of checkpoint/restart},
  journal = {The Journal of Supercomputing},
  volume = {75},
  pages = {6517--6541},
  year = {2019},
  publisher = {Springer},
}
|
||
|
||
@misc{Rust.core::sync::atomic::Ordering.2024,
  title = {Ordering in {core::sync::atomic} - {Rust}},
  howpublished = {The Rust Core Library},
  publisher = {the Rust Team},
  year = {2024},
  url = {https://doc.rust-lang.org/core/sync/atomic/enum.Ordering.html},
}
|
||
|
||
@techreport{Schaefer_Li.Shiva.1989,
  author = {Li, Kai and Schaefer, Richard},
  title = {{Shiva}: An operating system transforming a hypercube into a shared-memory machine},
  institution = {Princeton University, Department of Computer Science},
  year = {1989},
}
|
||
|
||
@inproceedings{Schoinas_etal.Sirocco.1998,
  author = {Schoinas, Ioannis and Falsafi, Babak and Hill, Mark D. and Larus, James R. and Wood, David A.},
  title = {{Sirocco}: Cost-effective fine-grain distributed shared memory},
  booktitle = {Proceedings. 1998 International Conference on Parallel Architectures and Compilation Techniques (Cat. No. 98EX192)},
  pages = {40--49},
  year = {1998},
  organization = {IEEE},
}
|
||
|
||
@inproceedings{Shan_Tsai_Zhang.DSPM.2017,
  author = {Shan, Yizhou and Tsai, Shin-Yeh and Zhang, Yiying},
  title = {Distributed Shared Persistent Memory},
  booktitle = {Proceedings of the 2017 Symposium on Cloud Computing},
  series = {SoCC '17},
  pages = {323--337},
  numpages = {15},
  year = {2017},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  location = {Santa Clara, California},
  isbn = {9781450350280},
  doi = {10.1145/3127479.3128610},
  url = {https://doi.org/10.1145/3127479.3128610},
  keywords = {distributed shared memory, persistent memory},
  abstract = {Next-generation non-volatile memories (NVMs) will provide byte addressability, persistence, high density, and DRAM-like performance. They have the potential to benefit many datacenter applications. However, most previous research on NVMs has focused on using them in a single machine environment. It is still unclear how to best utilize them in distributed, datacenter environments. We introduce Distributed Shared Persistent Memory (DSPM), a new framework for using persistent memories in distributed data-center environments. DSPM provides a new abstraction that allows applications to both perform traditional memory load and store instructions and to name, share, and persist their data. We built Hotpot, a kernel-level DSPM system that provides low-latency, transparent memory accesses, data persistence, data reliability, and high availability. The key ideas of Hotpot are to integrate distributed memory caching and data replication techniques and to exploit application hints. We implemented Hotpot in the Linux kernel and demonstrated its benefits by building a distributed graph engine on Hotpot and porting a NoSQL database to Hotpot. Our evaluation shows that Hotpot outperforms a recent distributed shared memory system by 1.3\texttimes{} to 3.2\texttimes{} and a recent distributed PM-based file system by 1.5\texttimes{} to 3.0\texttimes{}.},
}
|
||
|
||
@misc{Ven.LKML_x86_DMA.2008,
  author = {van de Ven, Arjan},
  title = {Background on ioremap, cacheing, cache coherency on x86},
  howpublished = {lkml.org},
  year = {2008},
  url = {https://lkml.org/lkml/2008/4/29/480},
}
|
||
|
||
@inproceedings{Wang_etal.Concordia.2021,
  author = {Wang, Qing and Lu, Youyou and Xu, Erci and Li, Junru and Chen, Youmin and Shu, Jiwu},
  title = {{Concordia}: Distributed Shared Memory with {In-Network} Cache Coherence},
  booktitle = {19th USENIX Conference on File and Storage Technologies (FAST 21)},
  pages = {277--292},
  year = {2021},
  month = feb,
  publisher = {USENIX Association},
  isbn = {978-1-939133-20-5},
  url = {https://www.usenix.org/conference/fast21/presentation/wang},
}
|
||
|
||
@misc{WEB.Ampere..Ampere_Altra_Datasheet.2023,
  title = {{Ampere} {Altra} {Max} {Rev} {A1} 64-Bit Multi-Core Processor Datasheet},
  publisher = {Ampere Computing},
  url = {https://uawartifacts.blob.core.windows.net/upload-files/Altra_Max_Rev_A1_DS_v1_15_20230809_b7cdce449e_424d129849.pdf},
}
|
||
|
||
@misc{WEB.APACHE..Apache_Hadoop.2023,
  title = {{Apache} {Hadoop}},
  publisher = {The APACHE Software Foundation},
  url = {https://hadoop.apache.org/},
}
|
||
|
||
@misc{WEB.APACHE..Apache_Spark.2023,
  title = {{Apache} {Spark\texttrademark} - Unified Engine for large-scale data analytics},
  publisher = {The APACHE Software Foundation},
  url = {https://spark.apache.org/},
}
|
||
|
||
@misc{WEB.HPE.Chapel_Platforms-v1.33.2023,
  title = {Platform-Specific Notes},
  howpublished = {Chapel Documentation 1.33},
  publisher = {Hewlett Packard Enterprise Development LP.},
  year = {2023},
  url = {https://chapel-lang.org/docs/platforms/index.html},
}
|
||
|
||
@misc{WEB.LBNL.UPC_man_1_upcc.2022,
  title = {upcc.1},
  howpublished = {Manual Reference Pages - UPCC (1)},
  publisher = {Lawrence Berkeley National Laboratory},
  year = {2022},
  url = {https://upc.lbl.gov/docs/user/upcc.html},
}
|
||
|
||
@misc{WEB.LWN.Corbet.HMM_GPL_woes.2018,
  author = {Corbet, Jonathan},
  title = {Heterogeneous memory management meets {EXPORT\_SYMBOL\_GPL()}},
  howpublished = {LWN.net},
  publisher = {LWN.net},
  year = {2018},
  url = {https://lwn.net/Articles/757124/},
}
@comment{or was the order of authors other way around?}
|
||
|
||
@misc{WEB.NVIDIA.Harris.Unified_Memory_CUDA.2017,
  author = {Harris, Mark},
  title = {Unified memory for {CUDA} beginners},
  howpublished = {NVIDIA Developer Blog},
  publisher = {NVIDIA},
  year = {2017},
  url = {https://developer.nvidia.com/blog/unified-memory-cuda-beginners/},
}
|
||
|
||
@misc{WEB.Phoronix..HMM_Search_Results.2023,
  title = {Heterogeneous Memory Management - {Phoronix}},
  publisher = {Phoronix},
  url = {https://www.phoronix.com/search/Heterogeneous%20Memory%20Management},
}
|
||
|
||
@inproceedings{Werstein_Pethick_Huang.PerfAnalysis_DSM_MPI.2003,
  author = {Werstein, Paul and Pethick, Mark and Huang, Zhiyi},
  title = {A performance comparison of {DSM}, {PVM}, and {MPI}},
  booktitle = {Proceedings of the Fourth International Conference on Parallel and Distributed Computing, Applications and Technologies},
  pages = {476--482},
  year = {2003},
  organization = {IEEE},
}
|
||
|
||
@inproceedings{Yang_etal.FIFO-LPQD.2023,
  author = {Yang, Juncheng and Qiu, Ziyue and Zhang, Yazhuo and Yue, Yao and Rashmi, K. V.},
  title = {{FIFO} can be Better than {LRU}: the Power of Lazy Promotion and Quick Demotion},
  booktitle = {Proceedings of the 19th Workshop on Hot Topics in Operating Systems},
  pages = {70--79},
  year = {2023},
}
|
||
|
||
@inproceedings{Zaharia_etal.RDD.2012,
  author = {Zaharia, Matei and Chowdhury, Mosharaf and Das, Tathagata and Dave, Ankur and Ma, Justin and McCauley, Murphy and Franklin, Michael J. and Shenker, Scott and Stoica, Ion},
  title = {Resilient Distributed Datasets: A {Fault-Tolerant} Abstraction for {In-Memory} Cluster Computing},
  booktitle = {9th USENIX Symposium on Networked Systems Design and Implementation (NSDI 12)},
  pages = {15--28},
  year = {2012},
  month = apr,
  publisher = {USENIX Association},
  address = {San Jose, CA},
  isbn = {978-1-931971-92-8},
  url = {https://www.usenix.org/conference/nsdi12/technical-sessions/presentation/zaharia},
}
|
||
|
||
@inproceedings{Zhang_etal.GiantVM.2020,
  author = {Zhang, Jin and Ding, Zhuocheng and Chen, Yubin and Jia, Xingguo and Yu, Boshi and Qi, Zhengwei and Guan, Haibing},
  title = {{GiantVM}: A type-{II} hypervisor implementing many-to-one virtualization},
  booktitle = {Proceedings of the 16th ACM SIGPLAN/SIGOPS International Conference on Virtual Execution Environments},
  pages = {30--44},
  year = {2020},
}
|
||
|
||
@inproceedings{Zhou_etal.DART-MPI.2014,
  author = {Zhou, Huan and Mhedheb, Yousri and Idrees, Kamran and Glass, Colin W. and Gracia, Jos{\'e} and F{\"u}rlinger, Karl},
  title = {{DART-MPI}: An {MPI}-based implementation of a {PGAS} runtime system},
  booktitle = {Proceedings of the 8th International Conference on Partitioned Global Address Space Programming Models},
  pages = {1--11},
  year = {2014},
}
|
||
|
||
@book{Corbet_Rubini_K-Hartman.LDD3.2005,
  author = {Corbet, Jonathan and Rubini, Alessandro and Kroah-Hartman, Greg},
  title = {{Linux} device drivers},
  edition = {Third},
  year = {2005},
  publisher = {O'Reilly Media, Inc.},
}
|
||
|
||
@misc{Rostedt.Kernelv6.7-ftrace.2023,
  author = {Rostedt, Steven},
  editor = {Du, Changbin},
  title = {ftrace - Function Tracer},
  howpublished = {The Linux Kernel documentation},
  year = {2023},
  url = {https://www.kernel.org/doc/html/v6.7/trace/ftrace.html#dynamic-ftrace},
}
|
||
|
||
@misc{N/A.Kernelv6.7-libbpf.2023,
  title = {libbpf Overview},
  howpublished = {The Linux Kernel documentation},
  year = {2023},
  url = {https://www.kernel.org/doc/html/v6.7/bpf/libbpf/libbpf_overview.html},
}
|
||
|
||
@inproceedings{Yang_Izraelevitz_Swanson.FileMR-RDMA.2020,
  author = {Yang, Jian and Izraelevitz, Joseph and Swanson, Steven},
  title = {{FileMR}: Rethinking {RDMA} Networking for Scalable Persistent Memory},
  booktitle = {17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20)},
  pages = {111--125},
  year = {2020},
}
|
||
|
||
@misc{N/A.Kernelv6.7-hugetlb.2023,
  title = {{HugeTLB} Pages},
  howpublished = {The Linux Kernel documentation},
  year = {2023},
  url = {https://www.kernel.org/doc/html/v6.7/admin-guide/mm/hugetlbpage.html},
}
|
||
|
||
@misc{N/A.Kernelv6.7-arm64-hugetlb.2023,
  title = {{HugeTLBpage} on {ARM64}},
  howpublished = {The Linux Kernel documentation},
  year = {2023},
  url = {https://www.kernel.org/doc/html/v6.7/arch/arm64/hugetlbpage.html},
}