\documentclass{beamer}
\usepackage{biblatex}
\usepackage[export]{adjustbox}
\usepackage{hyperref}

\title{Cache Coherency \& Memory Model in RDMA-Backed Software-Coherent DSM}
\author{Zhengyi Chen}
\date{\today}
\addbibresource{../main.bib}

\begin{document}

% Title Page
\frame{\titlepage}

% Table of Contents
\begin{frame}
  \frametitle{Table of Contents}
  \tableofcontents
\end{frame}

% Part 1: Overview
% =============================================================================
\section{1. Overview}

% Page 1
\begin{frame}
  \frametitle{1. Overview}
  \begin{itemize}
    \item DSM used to be constrained by NIC bandwidth \& transfer rate (e.g., during the 1990s).
    \item The advent of higher-transfer-rate NICs allows the DSM idea to be revived.
    \item Orthogonally, hardware acceleration resources are scarce and highly valuable.
    \begin{itemize}
      \item Traditional scheduling mechanisms within a cluster cannot dynamically allocate hardware accelerators without high overhead.
    \end{itemize}
    \item Ideally, via high-speed NICs, hardware accelerators could be statically allocated such that:
    \begin{itemize}
      \item Every node has access to the hardware accelerator node in a time-shared fashion.
      \item The accelerator-attached node can access remote memory much as if the accelerator were attached locally over, say, PCIe.
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Heterogeneous Memory Management}
  \begin{itemize}
    \item \textbf{HMM} facilitates a shared address space and transparent data migration between the CPU and peripherals. Specifically:
    \begin{itemize}
      \item HMM provides an interface for mirroring the CPU page table into the device's page table, with the two kept transparently synchronized.
      \item It also provides a corresponding \texttt{struct page} representation of device memory pages, which are faulted between the CPU and the device.
    \end{itemize}
    \item Theoretically, this should allow devices on remote nodes to perform HMM using the DMA-capable NIC as a ``proxy HMM device''.
    \item Details of the implementation of DSM-over-HMM are beyond this thesis's scope.
    \begin{itemize}
      \item This thesis focuses on studying and implementing cache coherency and, later, a memory model for the DSM subsystem of this wider, ongoing project.
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Cache Coherency, and Why It Matters Here}
  \begin{itemize}
    \item Cache-incoherent RDMA (e.g., mlx) performs DMA without synchronizing with the CPU cache.
    \item We cannot assume the MMU magically maintains coherence.
    \begin{itemize}
      \item This seems to be the case on x86\_64 (cache-coherent DMA), but not on ARM64.
    \end{itemize}
    \item At transfer time (see the sketch on the next slide):
    \begin{itemize}
      \item Send to remote: flush the cache to memory before posting the send message.
      \item Receive from remote: invalidate cache entries after the recv message has been processed.
    \end{itemize}
    \item Example: Linux kernel tree, the \textit{smbdirect} implementation.
    \begin{itemize}
      \item \textit{smbdirect} opportunistically establishes SMB over RDMA-capable networks.
      \item \texttt{smbd\_post\_send} cleans cache entries prior to posting a send request.
      \item \texttt{recv\_done} invalidates cache entries after exiting the softirq for a recv request (as a callback from the RDMA driver).
    \end{itemize}
  \end{itemize}
\end{frame}
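\begin{frame}[fragile]
  \frametitle{Sketch: Cache Maintenance Around RDMA Transfers}
  A minimal sketch of the flush-before-send / invalidate-after-recv pattern
  via the kernel DMA API (the route \textit{smbdirect} takes); \texttt{dev},
  \texttt{buf}, \texttt{len}, and \texttt{rx} are hypothetical names, not
  code from the thesis repository.
  {\footnotesize
\begin{verbatim}
#include <linux/dma-mapping.h>

/* Send: mapping for DMA_TO_DEVICE writes back the CPU
 * cache, so the NIC's DMA read sees the latest data. */
dma_addr_t tx = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
/* ... post the RDMA send referencing 'tx' ... */
dma_unmap_single(dev, tx, len, DMA_TO_DEVICE);

/* Recv: before the CPU reads a DMA_FROM_DEVICE buffer,
 * sync for CPU to invalidate stale cache entries.
 * 'rx' is the dma_addr_t of a mapped recv buffer. */
dma_sync_single_for_cpu(dev, rx, len, DMA_FROM_DEVICE);
/* ... CPU loads now observe the DMA-written data ... */
\end{verbatim}
  }
\end{frame}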
\begin{frame}
  \frametitle{Consistency Model and Protocol}
  \begin{itemize}
    \item The majority of the DSM literature applies \textbf{release consistency} as the system's memory model.
    \item With a \textbf{single-writer} protocol, however, the memory model can be strengthened with little increase in code complexity.
    \begin{itemize}
      \item \textit{DSPM}\cite{shan2017distributed}, for example, achieves a \textit{de-facto} TSO consistency from its multi-writer release-consistency counterpart -- assuming correct memory barriers within each node's CPU, distributed writes are never reordered, and distributed reads can overtake writes.
      \item Consequently, one can easily achieve sequential consistency by designating the entire write-access duration as a critical section.
    \end{itemize}
    \item HMM's ``CPU-or-device'' data migration model also strongly implies a single-writer consistency protocol.
  \end{itemize}
\end{frame}

% Part 2: Design
% =============================================================================
\section{2. Design}

\begin{frame}
  \frametitle{2. Design}
  \begin{itemize}
    \item Designing a DSM necessitates designing:
    \begin{itemize}
      \item A consistency model.
      \item A coherence protocol and state machine.
      \item Access control.
    \end{itemize}
    \item Care needs to be taken to ensure that the in-kernel implementation:
    \begin{itemize}
      \item Is correct,
      \item Is performant,
      \item Exploits RDMA's traits.
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Protocol Overview}
  \begin{itemize}
    \item Multiple readers can exist for a clean page -- the page is \textbf{shared}.
    \item Only one writer is allowed for a clean page -- the page becomes \textbf{exclusive}.
    \item For a writer node to be allowed sole write access to a page, all other sharers need to have their cached copy invalidated before the change is made global (commit-invalidate).
    \item While the sole writer has not yet committed, either:
    \begin{itemize}
      \item No other reader or writer nodes are allowed to be served this page (stronger consistency model), or
      \item No writers are allowed to be served this page; readers can be served stale data (provided the data providers have not received the invalidation message prior to service).
    \end{itemize}
    \item When the sole writer commits, it becomes the sole home node (data provider), which serves the updated page content.
    \begin{itemize}
      \item Optionally, some nodes can register to have commits written back instead.
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Protocol Excerpt: Write-Invalidate}
  \begin{figure}
    \centering
    \includegraphics[width=\linewidth]{
      w12_slides_resources/Fig-RwlockProtocol 2023-12-06 19_05_06.pdf
    }
  \end{figure}
  The \textit{T}-state indicates a transitional state for a shared page.
\end{frame}

\begin{frame}
  \frametitle{Consistency Model: TSO}
  \begin{itemize}
    \item Total Store Ordering allows reads to overtake stores.
    \item Assuming correct use of node-local synchronization on all nodes, applying TSO in a home-based DSM allows for the following (litmus test on the next slide):
    \begin{itemize}
      \item Another node tries to read a T-page from the access-control node and is served stale data: a W$\rightarrow$R reordering, permitted under TSO.
      \item Another node tries to read an S-page from a data-provider node and is served stale data (e.g., because the invalidation message from the access-control node arrives afterwards): again a permitted W$\rightarrow$R reordering.
      \item Data-provider and access-control nodes work on one request at a time: no R$\rightarrow$W violation.
      \item Write accesses are serialized at the access-control node: no W$\rightarrow$W violation.
    \end{itemize}
  \end{itemize}
\end{frame}
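\begin{frame}[fragile]
  \frametitle{Sketch: Store-Buffering Litmus Test Under TSO}
  A classic store-buffering litmus test, restated for two hypothetical DSM
  nodes sharing pages \texttt{x} and \texttt{y}; an illustration of the
  permitted W$\rightarrow$R reordering, not code from the implementation.
  {\footnotesize
\begin{verbatim}
/* Shared DSM pages, both initially 0. */
int x = 0, y = 0;
int r0, r1;

void node0(void) { x = 1; r0 = y; } /* runs on node 0 */
void node1(void) { y = 1; r1 = x; } /* runs on node 1 */

/* Under TSO (and the DSM above), r0 == 0 && r1 == 0 is
 * observable after both run concurrently: each read
 * overtakes the remote write (W->R). Sequential
 * consistency forbids this outcome. */
\end{verbatim}
  }
\end{frame}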
\begin{frame}
  \frametitle{Consistency Model: Strengthen to Sequential}
  \begin{itemize}
    \item As a corollary, one can reverse the previous slide's statements to strengthen the model to sequential consistency:
    \begin{itemize}
      \item Disallow T-pages from being serviced until the new page content is installed: lengthens the critical section.
      \item Abolish data-provider nodes: access-control nodes become the bottleneck.
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Coherence Protocol: Possible Features}
  \begin{itemize}
    \item Multi-data-provider protocol: instead of one data provider, have multiple data-provider nodes that automatically receive write-backs, preventing a network bottleneck.
    \begin{itemize}
      \item Data-provider nodes may be dynamically assigned.
      \item Extra metadata can limit scalability.
    \end{itemize}
    \item Auto-share: likewise, write back pages to non-data-provider nodes, taking advantage of the one-sided communication RDMA provides.
    \item Request aggregation: aggregate RDMA transfers for optimal transfer performance.
    \begin{itemize}
      \item Must remain coherent with program order!
      \item Enables write-request merging.
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Stateful Nodes \& Transitions (Provisional)}
  \begin{itemize}
    \item Nodes within the cluster become tightly bound to the properties of each shared page.
  \end{itemize}
  \begin{figure}
    \centering
    \includegraphics[width=\linewidth]{
      w15_resources/截屏 2024-01-30 19.15.45 2024-01-30 19_16_19.png
    }
  \end{figure}
\end{frame}

\begin{frame}
  \frametitle{Stateful Nodes \& Transitions (Provisional) (Cont.)}
  \begin{itemize}
    \item MN (Manager Node): provides access control and (fallback) data provision.
    \item HN (Home Node): provides data provision. Can be write-back or write-invalidate.
    \item SN (Sharer Node): shares data within a reader-only ``epoch''. Can be write-back or write-invalidate.
    \item NSN (Non-sharer Node): a node in the network that does not share the particular page.
    \item CN (Commit Node): the node that has acquired single-writer access to the shared page.
    \item Problem: message variants are not finalized.
    \begin{itemize}
      \item Goal: composable message chains that allow ``piggy-backing'' of multiple procedures.
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Stateful Nodes: Transition Paths}
  \begin{itemize}
    \item Solid-line transitions indicate that the local node requests the remote node to perform a state transition.
    \item Dashed-line transitions indicate that the local node transitions implicitly before sending a request to the remote node.
    \item The \textit{non-committal} path concerns read-only and copy-on-write sharing. Sharers cannot make global modifications to cached local data.
    \item The \textit{invalidation} path is paired with commit operations (due to write-invalidation).
    \item The \textit{committal} path concerns global write sharing. Only one writer is allowed to write and commit at a time. (A provisional state-machine sketch follows on the next slide.)
    \item Problem: how exactly to integrate RDMA remote read/write into this?
  \end{itemize}
\end{frame}
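\begin{frame}[fragile]
  \frametitle{Sketch: Provisional Per-Page State Machine}
  A minimal sketch of the per-page states implied by the protocol slides;
  the enum names, \texttt{struct dsm\_page}, and
  \texttt{dsm\_invalidate\_sharers} are assumptions for illustration, not
  finalized code.
  {\footnotesize
\begin{verbatim}
enum dsm_page_state {
    DSM_INVALID,   /* not cached on this node       */
    DSM_SHARED,    /* clean, multiple readers       */
    DSM_TRANSIT,   /* T-state: invalidation pending */
    DSM_EXCLUSIVE, /* sole writer, commit pending   */
};

struct dsm_page { enum dsm_page_state state; /* ... */ };

void dsm_invalidate_sharers(struct dsm_page *pg); /* hypothetical */

/* Write-invalidate: invalidate every sharer before
 * granting sole write access to the caller. */
static int dsm_acquire_write(struct dsm_page *pg)
{
    pg->state = DSM_TRANSIT;    /* stop serving writers */
    dsm_invalidate_sharers(pg);
    pg->state = DSM_EXCLUSIVE;
    return 0;
}
\end{verbatim}
  }
\end{frame}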
% Part 3: Progress
% =============================================================================
\section{3. Progress}

\begin{frame}
  \frametitle{3. Progress}
  \begin{itemize}
    \item Goal: an in-kernel implementation of software cache coherency over non-coherent RDMA hardware.
    \item Optimistic goal: an in-kernel implementation of a memory model in the DSM.
    \item Progress: studied and isolated the mechanism for data-cache invalidation/flushing on ARM64, which allows the DSM to run on heterogeneous-ISA clusters.
    \item Integration with the kernel \& the main DSM kernel module remains at hand: is it absolutely necessary to export new symbols for such an important operation?
    \item Repository: \url{https://github.com/rubberhead/unnamed_ba_thesis.git}.
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{On-demand Coherency in ARM64}
  \begin{itemize}
    \item ARMv8 defines two levels of cache coherence:
    \begin{itemize}
      \item \textit{Point of Unification}: within a core, the instruction cache, data cache, and TLB all agree on the copy seen for a particular address.
      \begin{itemize}
        \item Notably, changing a PTE requires maintenance to the PoU.
      \end{itemize}
      \item \textit{Point of Coherence}: all DMA-capable agents (CPU or otherwise) agree on the copy seen for a particular address.
    \end{itemize}
    For this thesis's purposes, we strive for PoC.
    \item Operations to achieve the latter are encapsulated in the Linux kernel as \texttt{(d|i)cache\_(clean|inval)\_poc}.
    \begin{itemize}
      \item Declared under \texttt{arch/arm64/include/asm/cacheflush.h}.
      \item Defined in \texttt{arch/arm64/mm/cache.S}.
      \item Takes a virtual address w.r.t.\ the \textit{current} address space and writes back/invalidates the corresponding cache entries.
      \item Problem: can these only be called in process context (for user virtual addresses), or in all contexts (for kernel virtual addresses)?
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Kernel Patch for On-demand Coherency}
  \begin{itemize}
    \item Problem: these symbols are not exported -- they are not intended for driver use.
    \item Temporary solution: re-export them by patching the kernel (see the sketch after the PoC-module slide).
    \begin{itemize}
      \item Note: kernel version v6.7.0.
      \item Longish-term solution: arrange the kernel module code so that it uses existing driver APIs (e.g., the DMA API, which \textit{smbdirect} uses).
    \end{itemize}
    \item Implements a wrapper function \texttt{\_\_dcache\_clean\_poc} to re-export \texttt{dcache\_clean\_poc} into the driver namespace.
    \item Exports the symbol via a separate header file.
    \begin{itemize}
      \item Declared in \texttt{arch/arm64/include/asm/cacheflush\_extra.h}.
      \item Defined in \texttt{arch/arm64/mm/flush.c}.
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Proof-of-Concept Kernel Module}
  \begin{itemize}
    \item Dynamically allocates \texttt{GFP\_USER} pages and remaps them to userspace on \texttt{mmap}.
    \begin{itemize}
      \item \texttt{GFP\_USER} so that (for convenience) pages are directly addressable in kernelspace (via the kernel page table).
      \item Pages are lazily allocated and shared between multiple processes (i.e., user address spaces).
      \item Exposed as the character device \texttt{/dev/my\_shmem}.
    \end{itemize}
    \item Around 300 LoC.
    \item Problem: flawed premise for testing cache writeback!
    \begin{itemize}
      \item Summary: the CPU datapath differs from the DMA datapath, and common cache-coherency maintenance operations are already performed in the generic file/virtual-memory-area operation code.
      \item Idea: perform cache write-back on \texttt{vm\_ops->close}.
      \item Reality: the virtual memory area is already cleaned from the cache and removed from the address space before \texttt{vm\_ops->close} is called.
      \item Fix: implement a custom \texttt{ioctl}? (Sketched on the next slide.)
    \end{itemize}
  \end{itemize}
\end{frame}
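\begin{frame}[fragile]
  \frametitle{Sketch: Re-Export Patch and \texttt{ioctl} Fix}
  A sketch of the re-export patch and the \texttt{ioctl}-based write-back
  path described above, assuming the v6.7 tree;
  \texttt{MY\_SHMEM\_IOC\_CLEAN} and \texttt{kaddr} are hypothetical names,
  not finalized code.
  {\footnotesize
\begin{verbatim}
/* arch/arm64/mm/flush.c (patched): wrap and export the
 * unexported clean-to-PoC primitive for module use.  */
void __dcache_clean_poc(unsigned long start,
                        unsigned long end)
{
    dcache_clean_poc(start, end);
}
EXPORT_SYMBOL_GPL(__dcache_clean_poc);

/* PoC module: ioctl forcing a write-back to PoC, since
 * vm_ops->close runs too late to observe dirty lines.
 * 'kaddr' is the page's kernel virtual address.       */
static long my_shmem_ioctl(struct file *f,
                           unsigned int cmd,
                           unsigned long arg)
{
    switch (cmd) {
    case MY_SHMEM_IOC_CLEAN: /* hypothetical command */
        __dcache_clean_poc(kaddr, kaddr + PAGE_SIZE);
        return 0;
    }
    return -ENOTTY;
}
\end{verbatim}
  }
\end{frame}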
% Part 4: Future Work
% =============================================================================
\section{4. Future Work}

\begin{frame}
  \frametitle{4. Future Work}
  \begin{itemize}
    \item TBD:
    \begin{enumerate}
      \item Incorporate the cache-coherence mechanism into the larger project.
      \item Implement the memory model within the larger project. This involves:
      \begin{itemize}
        \item Adjusting the message type and structure specifications for better interoperation with RDMA.
        \item Implementing the memory model programmatically.
      \end{itemize}
    \end{enumerate}
    \item Further studies:
    \begin{enumerate}
      \item Swappable RDMA memory regions.
      \begin{itemize}
        \item As of now, all DMA pages are non-swappable -- they must be allocated using the SLAB/SLUB allocator for kernel memory, or via the GFP page allocators.
      \end{itemize}
      \item Automatic frequent-sharer detection for MUX-ing between commit-invalidation and commit-writeback.
    \end{enumerate}
  \end{itemize}
\end{frame}

% References
\begin{frame}
  \frametitle{References}
  \printbibliography
\end{frame}

\end{document}