diff --git a/tex/misc/w15_slides.pdf b/tex/misc/w15_slides.pdf
new file mode 100644
index 0000000..16cd94f
Binary files /dev/null and b/tex/misc/w15_slides.pdf differ
diff --git a/tex/misc/w15_slides.tex b/tex/misc/w15_slides.tex
new file mode 100644
index 0000000..66bb33a
--- /dev/null
+++ b/tex/misc/w15_slides.tex
@@ -0,0 +1,270 @@
+\documentclass{beamer}
+\usepackage{biblatex}
+\usepackage[export]{adjustbox}
+
+\title{
+  Cache Coherency \& Memory Model in RDMA-Backed Software-Coherent DSM
+}
+\author{Zhengyi Chen}
+\date{\today}
+
+\addbibresource{../main.bib}
+
+\begin{document}
+% Title Page
+\frame{\titlepage}
+
+% Table of Contents
+\begin{frame}
+  \frametitle{Table of Contents}
+  \tableofcontents
+\end{frame}
+
+% Part 1: Overview
+% =============================================================================
+\section{1. Overview}
+% Page 1
+\begin{frame}
+  \frametitle{1. Overview}
+  \begin{itemize}
+    \item {
+      DSM used to be constrained by NIC bandwidth \& transfer rates (e.g.,
+      during the 1990s).
+    }
+    \item {
+      The advent of higher-transfer-rate NICs allows the DSM idea to be
+      revived.
+    }
+    \item {
+      Orthogonally, hardware acceleration resources are scarce and highly
+      valuable.
+      \begin{itemize}
+        \item {
+          Traditional scheduling mechanisms within a cluster cannot
+          dynamically allocate hardware accelerators without high
+          overhead.
+        }
+      \end{itemize}
+    }
+    \item {
+      Ideally, via high-speed NICs, hardware accelerators could be
+      statically allocated such that:
+      \begin{itemize}
+        \item {
+          every node has access to the hardware-accelerator node in a
+          time-shared fashion;
+        }
+        \item {
+          the accelerator-attached node can access remote memory much as
+          if the accelerator were attached locally over, say, PCIe.
+        }
+      \end{itemize}
+    }
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Heterogeneous Memory Management}
+  \begin{itemize}
+    \item {
+      \textbf{HMM} facilitates a shared address space and transparent data
+      migration between the CPU and peripherals. Specifically:
+      \begin{itemize}
+        \item {
+          HMM provides an interface for mirroring the CPU page table into
+          the device's page table, with the two kept transparently
+          synchronized.
+        }
+        \item {
+          It also provides corresponding \texttt{struct page}
+          representations of device memory pages, which are faulted
+          between the CPU and the device.
+        }
+      \end{itemize}
+    }
+    \item {
+      Theoretically, this should allow devices in remote nodes to
+      participate in HMM using the DMA-capable NIC as a ``proxy HMM
+      device''.
+    }
+    \item {
+      The implementation details of DSM-over-HMM are beyond this thesis's
+      scope.
+      \begin{itemize}
+        \item {
+          This thesis focuses on studying and implementing cache
+          coherency and, later, the memory model for the DSM subsystem of
+          this wider, ongoing project.
+        }
+      \end{itemize}
+    }
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Cache Coherency, and Why It Matters Here}
+  \begin{itemize}
+    \item {
+      Cache-incoherent RDMA NICs (e.g., mlx) perform DMA without
+      synchronizing with the CPU cache.
+    }
+    \item {
+      We cannot assume that the MMU will magically maintain coherence for
+      us.
+    }
+    \item {
+      At transfer time:
+      \begin{itemize}
+        \item {
+          Send to remote: flush (clean) the cache lines to memory before
+          posting the send work request.
+        }
+        \item {
+          Receive from remote: invalidate the cache lines after the recv
+          work request completes.
+        }
+      \end{itemize}
+    }
+    \item {
+      Example: the \textit{smbdirect} implementation in the Linux kernel
+      tree (see the sketch on the next slide).
+      \begin{itemize}
+        \item {
+          \textit{smbdirect} opportunistically establishes SMB over an
+          RDMA-capable network.
+        }
+        \item {
+          \texttt{smbd\_post\_send} cleans the cache lines prior to
+          posting a send request.
+        }
+        \item {
+          \texttt{recv\_done} invalidates the cache lines for a recv
+          request (as a completion callback from the RDMA driver, in
+          softirq context).
+        }
+      \end{itemize}
+    }
+  \end{itemize}
+\end{frame}
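+
+% Added illustrative slide: a minimal, hypothetical sketch of the
+% clean-before-send / invalidate-after-recv pattern discussed above. It is
+% not the actual smbdirect code; dsm_page_send()/dsm_page_recv_done() are
+% made-up names, and the kernel's streaming DMA API is used here as one way
+% to express the cache-maintenance steps on a non-coherent platform.
+\begin{frame}[fragile]
+  \frametitle{Sketch: Cache Maintenance Around an RDMA Transfer}
+  A minimal, hypothetical sketch (not the smbdirect code) of the pattern
+  above, expressed with the kernel's streaming DMA API:
+  {\footnotesize
+\begin{verbatim}
+#include <linux/dma-mapping.h>
+
+/* dsm_page_send()/dsm_page_recv_done() are made-up names. */
+static int dsm_page_send(struct device *dev, dma_addr_t dma, size_t len)
+{
+    /* Clean (write back) dirty lines so the NIC reads fresh data. */
+    dma_sync_single_for_device(dev, dma, len, DMA_TO_DEVICE);
+    /* ... post the RDMA send work request here ... */
+    return 0;
+}
+
+static void dsm_page_recv_done(struct device *dev, dma_addr_t dma,
+                               size_t len)
+{
+    /* Invalidate stale lines so the CPU sees the DMA'd data. */
+    dma_sync_single_for_cpu(dev, dma, len, DMA_FROM_DEVICE);
+    /* ... hand the page back to the DSM layer here ... */
+}
+\end{verbatim}
+  }
+  On DMA-coherent systems these syncs are effectively no-ops; on a
+  non-coherent ARM64 platform they resolve to cache maintenance to the
+  Point of Coherency.
+\end{frame}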
+
+\begin{frame}
+  \frametitle{Consistency Model and Protocol}
+  \begin{itemize}
+    \item {
+      The majority of the DSM literature adopts \textbf{release
+      consistency} as the system's memory model.
+    }
+    \item {
+      With a \textbf{single-writer} protocol, however, the memory model
+      can be strengthened with little increase in code complexity.
+      \begin{itemize}
+        \item {
+          \textit{DSPM}~\cite{shan2017distributed}, for example, achieves
+          a \textit{de facto} TSO consistency, compared with its
+          multi-writer release-consistency counterparts: assuming correct
+          memory barriers within each node's CPU, distributed writes are
+          never reordered, while distributed reads may overtake writes.
+        }
+        \item {
+          Consequently, one can easily achieve sequential consistency by
+          designating the entire write-access duration as a critical
+          section.
+        }
+      \end{itemize}
+    }
+    \item {
+      HMM's ``CPU-or-device'' data-migration model also strongly implies
+      a single-writer coherence protocol.
+    }
+  \end{itemize}
+\end{frame}
+
+% Part 2: Design
+% =============================================================================
+\section{2. Design}
+
+\begin{frame}
+  \frametitle{2. Design}
+  \begin{itemize}
+    \item {
+      Designing a DSM necessitates designing:
+      \begin{itemize}
+        \item a Consistency Model,
+        \item a Coherence Protocol and State Machine,
+        \item Access Control.
+      \end{itemize}
+    }
+    \item {
+      Care needs to be taken to ensure that the in-kernel implementation
+      is:
+      \begin{itemize}
+        \item correct,
+        \item performant,
+        \item exploiting RDMA's traits.
+      \end{itemize}
+    }
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Consistency Model}
+
+\end{frame}
+
+\begin{frame}
+  \frametitle{Coherence Protocol}
+
+\end{frame}
+
+\begin{frame}
+  \frametitle{Stateful Nodes}
+
+\end{frame}
+
+% Part 3: Progress
+% =============================================================================
+\section{3. Progress}
+
+\begin{frame}
+  \frametitle{Progress}
+
+\end{frame}
+
+\begin{frame}
+  \frametitle{On-demand Coherency in ARM64}
+  \begin{itemize}
+    \item {
+      ARMv8 defines two key points to which cache maintenance can be
+      performed:
+      \begin{itemize}
+        \item {
+          \textit{Point of Unification (PoU)}: the point at which a core's
+          instruction cache, data cache, and translation table walks are
+          guaranteed to see the same copy of a memory location.
+        }
+        \item {
+          \textit{Point of Coherency (PoC)}: the point at which all
+          observers that can access memory (cores and DMA-capable
+          devices, such as the RDMA NIC) are guaranteed to see the same
+          copy.
+        }
+      \end{itemize}
+    }
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Kernel Patch for On-demand Coherency}
+
+\end{frame}
+
+\begin{frame}
+  \frametitle{Proof-of-Concept Kernel Module}
+
+\end{frame}
+
+% References
+\begin{frame}[allowframebreaks]
+  \frametitle{References}
+  \printbibliography
+\end{frame}
+
+\end{document}
\ No newline at end of file