\documentclass{beamer}
\usepackage{biblatex}
\usepackage[export]{adjustbox}
\usepackage{listings}

\title{Cache Coherency \& Memory Model in RDMA-Backed Software-Coherent DSM}
\author{Zhengyi Chen}
\date{\today}
\addbibresource{../main.bib}

\begin{document}

% Title Page
\frame{\titlepage}

% Table of Contents
\begin{frame}
  \frametitle{Table of Contents}
  \tableofcontents
\end{frame}

% Part 1: Overview
% =============================================================================
\section{1. Overview}

% Page 1
\begin{frame}
  \frametitle{1. Overview}
  \begin{itemize}
    \item { DSM was historically constrained by NIC bandwidth \& transfer rates (e.g., during the 1990s). }
    \item { The advent of higher-transfer-rate NICs allows the DSM idea to be revived. }
    \item { Orthogonally, hardware-acceleration resources are scarce and highly valuable.
      \begin{itemize}
        \item { Traditional cluster scheduling mechanisms cannot dynamically allocate hardware accelerators without high overhead. }
      \end{itemize} }
    \item { Ideally, via high-speed NICs, hardware accelerators could be statically allocated such that:
      \begin{itemize}
        \item { Every node has access to the accelerator-attached node in a time-shared fashion. }
        \item { The accelerator-attached node can access remote memory much as if the accelerator were attached locally over, say, PCIe. }
      \end{itemize} }
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Heterogeneous Memory Management}
  \begin{itemize}
    \item { \textbf{HMM} facilitates a shared address space and transparent data migration between the CPU and peripherals. Specifically:
      \begin{itemize}
        \item { HMM provides an interface for mirroring the CPU page table into the device's page table, keeping the two transparently synchronized. }
        \item { It also provides a corresponding \texttt{struct page} representation of device memory pages, which are faulted between the CPU and the device. }
      \end{itemize} }
    \item { In theory, this should allow devices on remote nodes to participate in HMM, using the DMA-capable NIC as a ``proxy HMM device''. }
    \item { The implementation details of DSM-over-HMM are beyond this thesis's scope.
      \begin{itemize}
        \item { This thesis focuses on studying and implementing cache coherency, and later a memory model, for the DSM subsystem of this wider, ongoing project. }
      \end{itemize} }
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Cache Coherency, and Why It Matters Here}
  \begin{itemize}
    \item { Cache-incoherent RDMA NICs (e.g., mlx) perform DMA without synchronizing with the CPU cache. }
    \item { We cannot assume that the MMU will automatically maintain coherence. }
    \item { At transfer time (sketched on the next slide):
      \begin{itemize}
        \item { Send to remote: flush (clean) the cache to memory before posting the send request. }
        \item { Receive from remote: invalidate the cached lines after the receive completion has been processed. }
      \end{itemize} }
    \item { Example: the \textit{smbdirect} implementation in the Linux kernel tree.
      \begin{itemize}
        \item { \textit{smbdirect} opportunistically establishes SMB sessions over RDMA-capable networks. }
        \item { \texttt{smbd\_post\_send} cleans the relevant cache lines prior to posting the send request. }
        \item { \texttt{recv\_done} invalidates the cached lines after the receive completion is handled in softirq (invoked as a callback from the RDMA driver). }
      \end{itemize} }
  \end{itemize}
\end{frame}
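\begin{frame}[fragile]
  \frametitle{Cache Maintenance Around an RDMA Transfer (Sketch)}
  A minimal sketch of the flush-before-send / invalidate-after-receive pattern,
  using the kernel's generic \texttt{ib\_dma\_sync\_*} helpers. The
  \texttt{dsm\_pre\_send} / \texttt{dsm\_post\_recv} wrappers are illustrative
  names, not the actual \textit{smbdirect} (or DSM-module) code.
\begin{lstlisting}[language=C, basicstyle=\ttfamily\scriptsize]
#include <rdma/ib_verbs.h>

/* Before posting a send: make CPU writes visible to the NIC. */
static void dsm_pre_send(struct ib_device *dev, u64 dma_addr, size_t len)
{
        ib_dma_sync_single_for_device(dev, dma_addr, len, DMA_TO_DEVICE);
        /* ... build the WR and call ib_post_send() ... */
}

/* After a receive completion: drop stale lines so the CPU re-reads memory. */
static void dsm_post_recv(struct ib_device *dev, u64 dma_addr, size_t len)
{
        ib_dma_sync_single_for_cpu(dev, dma_addr, len, DMA_FROM_DEVICE);
        /* ... hand the buffer to the DSM layer ... */
}
\end{lstlisting}
  On cache-coherent platforms these calls degrade to (almost) no-ops, so the
  same code path can serve both coherent and non-coherent NICs.
\end{frame}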
\begin{frame}
  \frametitle{Consistency Model and Protocol}
  \begin{itemize}
    \item { The majority of the DSM literature adopts \textbf{release consistency} as the system's memory model. }
    \item { With a \textbf{single-writer} protocol, however, the memory model can be strengthened with little increase in code complexity.
      \begin{itemize}
        \item { \textit{DSPM}~\cite{shan2017distributed}, for example, achieves a \textit{de facto} TSO consistency where its multi-writer counterpart offers only release consistency: assuming correct memory barriers within each node's CPU, distributed writes are never reordered, and distributed reads may overtake writes. }
        \item { Consequently, sequential consistency can easily be achieved by designating the entire write-access duration as a critical section. }
      \end{itemize} }
    \item { HMM's ``CPU-or-device'' data-migration model also strongly implies a single-writer consistency protocol. }
  \end{itemize}
\end{frame}

% Part 2: Design
% =============================================================================
\section{2. Design}

\begin{frame}
  \frametitle{2. Design}
  \begin{itemize}
    \item { Designing a DSM necessitates designing:
      \begin{itemize}
        \item Consistency Model.
        \item Coherence Protocol and State Machine.
        \item Access Control.
      \end{itemize} }
    \item { Care needs to be taken to ensure that the in-kernel implementation is:
      \begin{itemize}
        \item Correct,
        \item Performant,
        \item Able to exploit RDMA's traits.
      \end{itemize} }
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Protocol Excerpt: Write-Invalidate}
  \begin{figure}
    \centering
    \includegraphics[width=\linewidth]{ w12_slides_resources/Fig-RwlockProtocol 2023-12-06 19_05_06.pdf }
  \end{figure}
  The \textit{T}-state denotes a transitional state for a shared page
  (an illustrative state-machine sketch follows a few slides later).
\end{frame}

\begin{frame}
  \frametitle{Consistency Model: TSO}
  \begin{itemize}
    \item { Total Store Ordering (TSO) allows reads to bypass earlier stores. }
    \item { Assuming correct use of node-local synchronization on all nodes, applying TSO in a home-based DSM allows for:
      \begin{itemize}
        \item { When another node tries to read a T-page from the access-control node: a W$\rightarrow$R violation. }
        \item { When another node tries to read an S-page from a data-provider node: a W$\rightarrow$R violation (e.g., if the invalidation message from the access-control node arrives afterwards). }
        \item { Data-provider and access-control nodes work on one request at a time: no R$\rightarrow$W violation. }
        \item { Write accesses are serialized at the access-control node: no W$\rightarrow$W violation. }
      \end{itemize} }
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Consistency Model: Strengthen to Sequential}
  \begin{itemize}
    \item { As a corollary, the previous slide's statements can be reversed to strengthen the model to sequential consistency:
      \begin{itemize}
        \item { Disallow T-pages from being serviced until the new page content is installed: lengthens the critical section. }
        \item { Abolish data-provider nodes: the access-control node becomes a bottleneck. }
      \end{itemize} }
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Coherence Protocol: Possible Features}
  \begin{itemize}
    \item { Multi-data-provider protocol: instead of a single data provider, use multiple data-provider nodes that are written back automatically, preventing a network bottleneck.
      \begin{itemize}
        \item Data-provider nodes may be dynamically assigned.
        \item Extra metadata can limit scalability.
      \end{itemize} }
    \item { Auto-share: likewise, write pages back to non-data-provider nodes to take advantage of one-sided communication. }
    \item { Request aggregation: aggregate RDMA transfers for optimal transfer performance.
      \begin{itemize}
        \item Must remain coherent with program order!
        \item Enables write-request merging.
      \end{itemize} }
  \end{itemize}
\end{frame}
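\begin{frame}[fragile]
  \frametitle{Protocol Sketch: Per-Page State Machine (Illustrative)}
  A minimal sketch of how the write-invalidate states (including the
  \textit{T}-state) might be encoded on each node. The state, event, and
  function names are placeholders for illustration, not the finalized protocol.
\begin{lstlisting}[language=C, basicstyle=\ttfamily\scriptsize]
enum dsm_page_state {
        DSM_INVALID,    /* no valid local copy                 */
        DSM_SHARED,     /* read-only copy (S-page)             */
        DSM_EXCLUSIVE,  /* single-writer copy                  */
        DSM_TRANSIENT,  /* T-state: transition/transfer active */
};

enum dsm_page_event { EV_LOCAL_WRITE, EV_REMOTE_INVALIDATE, EV_DATA_INSTALLED };

static enum dsm_page_state
dsm_page_step(enum dsm_page_state s, enum dsm_page_event e)
{
        switch (s) {
        case DSM_SHARED:
                if (e == EV_LOCAL_WRITE)       return DSM_TRANSIENT; /* ask for write access */
                if (e == EV_REMOTE_INVALIDATE) return DSM_INVALID;
                break;
        case DSM_TRANSIENT:
                if (e == EV_DATA_INSTALLED)    return DSM_EXCLUSIVE; /* new content installed */
                break;
        case DSM_EXCLUSIVE:
                if (e == EV_REMOTE_INVALIDATE) return DSM_INVALID;   /* write back, then drop */
                break;
        default:
                break;
        }
        return s; /* all other combinations: no transition */
}
\end{lstlisting}
\end{frame}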
\begin{frame}
  \frametitle{Stateful Nodes \& Transitions (Provisional)}
  \begin{itemize}
    \item { Nodes within the cluster become tightly bound to the state of each shared page. }
  \end{itemize}
  \begin{figure}
    \centering
    \includegraphics[width=\linewidth]{ w15_resources/截屏 2024-01-30 19.15.45 2024-01-30 19_16_19.png }
  \end{figure}
\end{frame}

\begin{frame}
  \frametitle{Stateful Nodes \& Transitions (Provisional) (Cont.)}
  \begin{itemize}
    \item { MN (Manager Nodes): provide access control and (fallback) data provision. }
    \item { HN (Home Nodes): provide data provision. Can be write-back or write-invalidate. }
    \item { SN (Sharer Nodes): share data within a reader-only ``epoch''. Can be write-back or write-invalidate. }
    \item { NSN (Non-sharer Nodes): nodes in the network that do not share the particular page(s). }
    \item { CN (Commit Node): the node that has acquired single-writer access to the shared page. }
    \item { Message variants are not finalized:
      \begin{itemize}
        \item { Goal: composable message chains that allow ``piggy-backing'' of multiple procedures. }
      \end{itemize} }
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Stateful Nodes: Transition Paths}
  \begin{itemize}
    \item { Solid-line transitions indicate that the local node requests the remote node to perform the state transition. }
    \item { Dashed-line transitions indicate that the local node transitions implicitly before sending a request to the remote node. }
    \item { The \textit{non-committal} path concerns read-only and copy-on-write sharing: sharers cannot make globally visible modifications to locally cached data. }
    \item { The \textit{invalidation} path is paired with commit operations (due to write-invalidation). }
    \item { The \textit{committal} path concerns global write sharing: only one writer is allowed to write and commit at a time. }
    \item { Open problem: how exactly should RDMA remote read/write be integrated into this scheme? }
  \end{itemize}
\end{frame}

% Part 3: Progress
% =============================================================================
\section{3. Progress}

\begin{frame}
  \frametitle{Progress}
  \begin{itemize}
    \item { Goal: an in-kernel implementation of software cache coherency over non-coherent RDMA hardware. }
    \item { Optimistic goal: an in-kernel implementation of the DSM's memory model. }
    \item { Progress: studied and isolated the mechanism for data-cache invalidation/flushing on ARM64, which allows the DSM to run in heterogeneous-ISA clusters. }
    \item { Integration with the kernel \& the main DSM kernel module remains at hand: is it absolutely necessary to export new symbols for such an important operation? }
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{On-demand Coherency in ARM64}
  \begin{itemize}
    \item { ARMv8 defines two levels of cache coherence relevant here:
      \begin{itemize}
        \item { \textit{Point-of-Unification} (PoU): the point at which a core's instruction cache, data cache, and translation-table walks see the same copy of a memory location. }
        \item { \textit{Point-of-Coherence} (PoC): the point at which all observers that can access memory (cores and DMA-capable devices) see the same copy of a memory location. }
      \end{itemize} }
    \item { For DMA by a non-coherent NIC, maintenance to the PoC is what matters (e.g., \texttt{DC CVAC} / \texttt{DC CIVAC}); see the sketch on the ``Proof-of-Concept Kernel Module'' slide. }
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Kernel Patch for On-demand Coherency}
\end{frame}
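\begin{frame}[fragile]
  \frametitle{Proof-of-Concept Kernel Module}
  A minimal, illustrative sketch (not the actual module) of cleaning/invalidating
  a buffer to the Point of Coherence with \texttt{DC CVAC} / \texttt{DC CIVAC}.
  The fixed 64-byte line size and the helper names are assumptions; real code
  would derive the line size from \texttt{CTR\_EL0} and add module boilerplate.
\begin{lstlisting}[language=C, basicstyle=\ttfamily\scriptsize]
#include <linux/types.h>

#define DCACHE_LINE 64  /* assumed; derive from CTR_EL0 in practice */

/* Clean dirty lines to the PoC so a non-coherent NIC reads fresh data. */
static void poc_clean_range(void *addr, size_t len)
{
        unsigned long p = (unsigned long)addr & ~(DCACHE_LINE - 1UL);
        unsigned long end = (unsigned long)addr + len;
        for (; p < end; p += DCACHE_LINE)
                asm volatile("dc cvac, %0" :: "r" (p) : "memory");
        asm volatile("dsb sy" ::: "memory"); /* complete before DMA starts */
}

/* Clean+invalidate to the PoC so the CPU re-reads NIC-written data
 * (clean+invalidate avoids discarding dirty neighbouring lines). */
static void poc_inval_range(void *addr, size_t len)
{
        unsigned long p = (unsigned long)addr & ~(DCACHE_LINE - 1UL);
        unsigned long end = (unsigned long)addr + len;
        for (; p < end; p += DCACHE_LINE)
                asm volatile("dc civac, %0" :: "r" (p) : "memory");
        asm volatile("dsb sy" ::: "memory");
}
\end{lstlisting}
\end{frame}

% Part 4: Future Work
% =============================================================================

% References
\begin{frame}[allowframebreaks]
  \frametitle{References}
  \printbibliography
\end{frame}

\end{document}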