\documentclass{beamer}

\usepackage[]{biblatex}
\usepackage[export]{adjustbox}

\title{
    Cache Coherency \& Memory Model in RDMA-Backed Software-Coherent DSM
}
\author{Zhengyi Chen}
\date{\today}

\addbibresource{../main.bib}

\begin{document}
% Title Page
\frame{\titlepage}

% Table of Contents
\begin{frame}
    \frametitle{Table of Contents}
    \tableofcontents
\end{frame}

% Part 1: Overview
% =============================================================================
\section{1. Overview}

% Page 1
\begin{frame}
    \frametitle{1. Overview}
    \begin{itemize}
        \item {
            DSM used to be constrained by NIC bandwidth \& transfer rate
            (e.g., during the 1990s).
        }
        \item {
            The advent of high(er) transfer rate NICs allows the DSM idea to be
            revived.
        }
        \item {
            Orthogonally, hardware acceleration resources are scarce and highly
            valuable.
            \begin{itemize}
                \item {
                    Traditional scheduling mechanisms within a cluster cannot
                    dynamically allocate hardware accelerators without high
                    overhead.
                }
            \end{itemize}
        }
        \item {
            Ideally, via high-speed NICs, hardware accelerators could be
            statically allocated such that:
            \begin{itemize}
                \item {
                    Every node has access to the hardware accelerator node in
                    a time-shared fashion.
                }
                \item {
                    Accelerator-attached nodes can access remote memory much
                    like an accelerator attached over, say, PCIe.
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}

\begin{frame}
    \frametitle{Heterogeneous Memory Management}
    \begin{itemize}
        \item {
            \textbf{HMM} facilitates a shared address space and transparent
            data migration between CPU and peripherals. Specifically:
            \begin{itemize}
                \item {
                    HMM provides an interface for duplicating the CPU page
                    table into that of the device, with the two kept
                    transparently synchronized.
                }
                \item {
                    It also provides a corresponding \texttt{struct page}
                    representation of device memory pages, which are faulted
                    between the CPU and device.
                }
            \end{itemize}
        }
        \item {
            Theoretically, this should allow for devices in remote nodes to
            perform HMM using the DMA-capable NIC as a ``proxy HMM device''.
        }
        \item {
            The implementation details of DSM-over-HMM are beyond this
            thesis's scope.
            \begin{itemize}
                \item {
                    This thesis focuses on studying and implementing cache
                    coherency and, later, the memory model for the DSM
                    subsystem of this wider, ongoing project.
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}

\begin{frame}
    \frametitle{Cache Coherency, and Why It Matters Here}
    \begin{itemize}
        \item {
            Cache-incoherent RDMA (e.g., mlx) performs DMA without
            synchronization with the CPU cache.
        }
        \item {
            We cannot assume that the MMU will magically maintain coherence
            for us.
        }
        \item {
            At transportation time:
            \begin{itemize}
                \item {
                    Send to remote: flush cache into memory before posting
                    the send message.
                }
                \item {
                    Receive from remote: invalidate cache entries after the
                    recv message has been processed.
                }
            \end{itemize}
        }
        \item {
            Example: Linux kernel tree, \textit{smbdirect} implementation.
            \begin{itemize}
                \item {
                    \textit{smbdirect} opportunistically establishes SMB over
                    an RDMA-capable network.
                }
                \item {
                    \texttt{smbd\_post\_send} cleans cache entries prior to
                    posting the send request.
                }
                \item {
                    \texttt{recv\_done} invalidates cache entries after
                    exiting softirq for the recv request (as callback from
                    the RDMA driver).
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}

\begin{frame}
    \frametitle{Consistency Model and Protocol}
    \begin{itemize}
        \item {
            The majority of the DSM literature applies
            \textbf{release consistency} as the system's memory model.
        }
        \item {
            With a \textbf{single-writer} protocol, however, the memory model
            can be strengthened with little increase in code complexity.
            \begin{itemize}
                \item {
                    \textit{DSPM}~\cite{shan2017distributed}, for example,
                    achieves \textit{de facto} TSO consistency from its
                    multi-writer release consistency counterpart --- assuming
                    correct memory barriers within each node's CPU,
                    distributed writes are never reordered, and distributed
                    reads can overtake writes.
                }
                \item {
                    Consequently, one can easily achieve sequential
                    consistency by designating the entire write-access
                    duration as a critical section.
                }
            \end{itemize}
        }
        \item {
            HMM's ``CPU-or-device'' data migration model also strongly implies
            a single-writer consistency protocol.
        }
    \end{itemize}
\end{frame}

% Part 2: Design
% =============================================================================
\section{2. Design}

\begin{frame}
    \frametitle{2. Design}
    \begin{itemize}
        \item {
            Designing a DSM necessitates designing:
            \begin{itemize}
                \item Consistency Model.
                \item Coherence Protocol and State Machine.
                \item Access Control.
            \end{itemize}
        }
        \item {
            Care needs to be taken to ensure that the in-kernel implementation
            is:
            \begin{itemize}
                \item Correct,
                \item Performant,
                \item Able to exploit RDMA's traits.
            \end{itemize}
        }
    \end{itemize}
\end{frame}

\begin{frame}
    \frametitle{Consistency Model}
\end{frame}

\begin{frame}
    \frametitle{Coherence Protocol}
\end{frame}

\begin{frame}
    \frametitle{Stateful Nodes}
\end{frame}

% Part 3: Progress
% =============================================================================
\section{3. Progress}

\begin{frame}
    \frametitle{Progress}
\end{frame}

\begin{frame}
    \frametitle{On-demand Coherency in ARM64}
    \begin{itemize}
        \item {
            ARMv8 defines two levels of cache coherence:
            \begin{itemize}
                \item {
                    \textit{Point-of-Unification}:
                }
                \item {
                    \textit{Point-of-Coherence}:
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}

\begin{frame}
    \frametitle{Kernel Patch for On-demand Coherency}
\end{frame}

\begin{frame}
    \frametitle{Proof-of-Concept Kernel Module}
\end{frame}

% References
\end{document}