preparing for feb 1st
This commit is contained in:
parent
6c5b3626e1
commit
169382407d
2 changed files with 270 additions and 0 deletions
BIN
tex/misc/w15_slides.pdf
Normal file
BIN
tex/misc/w15_slides.pdf
Normal file
Binary file not shown.
270
tex/misc/w15_slides.tex
Normal file
270
tex/misc/w15_slides.tex
Normal file
|
|
@ -0,0 +1,270 @@
|
|||
\documentclass{beamer}
% Bibliography via biblatex (resource added below, printed at end of document).
\usepackage{biblatex}
\usepackage[export]{adjustbox}

\title{
    Cache Coherency \& Memory Model in RDMA-Backed Software-Coherent DSM
}
\author{Zhengyi Chen}
\date{\today}

\addbibresource{../main.bib}
\begin{document}

% Title page.
\frame{\titlepage}

% Table of contents, built from the \section commands below.
\begin{frame}
    \frametitle{Table of Contents}
    \tableofcontents
\end{frame}
% Part 1: Overview
% =============================================================================
\section{1. Overview}

% Page 1
\begin{frame}
    \frametitle{1. Overview}
    \begin{itemize}
        \item {
            DSM used to be constrained by NIC bandwidth \& transfer rate
            (e.g., during the 1990s).
        }
        \item {
            The advent of high(er) transfer rate NICs allows the DSM idea
            to be revived.
        }
        \item {
            Orthogonally, hardware acceleration resources are scarce and
            highly valuable.
            \begin{itemize}
                \item {
                    Traditional scheduling mechanisms within a cluster
                    cannot dynamically allocate hardware accelerators
                    without high overhead.
                }
            \end{itemize}
        }
        \item {
            Ideally, via high-speed NICs, hardware accelerators could be
            statically allocated such that:
            \begin{itemize}
                \item {
                    Every node has access to the hardware accelerator node
                    in a time-shared fashion.
                }
                \item {
                    The accelerator-attached node can access remote memory
                    much like attaching an accelerator over, say, PCIe.
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}
\begin{frame}
    \frametitle{Heterogeneous Memory Management}
    \begin{itemize}
        \item {
            \textbf{HMM} facilitates shared address space and transparent
            data migration between CPU and peripherals. Specifically:
            \begin{itemize}
                \item {
                    HMM provides an interface for duplicating the CPU page
                    table with that of the device's, which are transparently
                    synchronized.
                }
                \item {
                    It also provides corresponding \texttt{struct page}
                    representations of device memory pages, which are
                    faulted between the CPU and device.
                }
            \end{itemize}
        }
        \item {
            Theoretically, this should allow for devices in remote nodes to
            perform HMM using the DMA-capable NIC as a ``proxy HMM device''.
        }
        \item {
            The implementation details of DSM-over-HMM are beyond this
            thesis's scope.
            \begin{itemize}
                \item {
                    This thesis focuses on studying and implementing cache
                    coherency and later, memory model for the DSM subsystem
                    of this wider, ongoing project.
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}
\begin{frame}
    \frametitle{Cache Coherency, and Why It Matters Here}
    \begin{itemize}
        \item {
            Cache-incoherent RDMA (e.g., mlx) performs DMA without
            synchronization with the CPU cache.
        }
        \item {
            We cannot assume the MMU will magically maintain coherence.
        }
        \item {
            At transportation time:
            \begin{itemize}
                \item {
                    Send to remote: flush the cache into memory before
                    posting the send message.
                }
                \item {
                    Receive from remote: invalidate the cache entries after
                    the recv message has been worked.
                }
            \end{itemize}
        }
        \item {
            Example: Linux kernel tree, \textit{smbdirect} implementation.
            \begin{itemize}
                \item {
                    \textit{smbdirect} opportunistically establishes SMB
                    over an RDMA-capable network.
                }
                \item {
                    \texttt{smbd\_post\_send} cleans cache entries prior to
                    posting a send request.
                }
                \item {
                    \texttt{recv\_done} invalidates cache entries after
                    exiting the softirq for a recv request (as a callback
                    from the RDMA driver).
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}
\begin{frame}
    \frametitle{Consistency Model and Protocol}
    \begin{itemize}
        \item {
            The majority of the DSM literature applies \textbf{release
            consistency} as the system's memory model.
        }
        \item {
            With a \textbf{single-writer} protocol, however, the memory
            model can be strengthened with little increase in code
            complexity.
            \begin{itemize}
                \item {
                    \textit{DSPM}~\cite{shan2017distributed}, for example,
                    achieves a \textit{de-facto} TSO consistency from its
                    multi-writer release consistency counterpart ---
                    assuming correct memory barriers within each node's
                    CPU, distributed writes are never reordered, and
                    distributed reads can overtake writes.
                }
                \item {
                    Consequently, one can easily achieve sequential
                    consistency by designating the entire write-access
                    duration as a critical section.
                }
            \end{itemize}
        }
        \item {
            HMM's ``CPU-or-device'' data migration model also strongly
            implies a single-writer consistency protocol.
        }
    \end{itemize}
\end{frame}
% Part 2: Design
% =============================================================================
\section{2. Design}

\begin{frame}
    \frametitle{2. Design}
    \begin{itemize}
        \item {
            Designing a DSM necessitates designing:
            \begin{itemize}
                \item Consistency Model.
                \item Coherence Protocol and State Machine.
                \item Access Control.
            \end{itemize}
        }
        \item {
            Care needs to be taken to ensure that the in-kernel
            implementation is:
            \begin{itemize}
                \item Correct,
                \item Performant,
                \item Exploits RDMA's traits.
            \end{itemize}
        }
    \end{itemize}
\end{frame}
% Placeholder frames for the design discussion; bodies to be written.
\begin{frame}
    \frametitle{Consistency Model}
    % TODO: content pending.
\end{frame}

\begin{frame}
    \frametitle{Coherence Protocol}
    % TODO: content pending.
\end{frame}

\begin{frame}
    \frametitle{Stateful Nodes}
    % TODO: content pending.
\end{frame}
% Part 3: Progress
% =============================================================================
\section{3. Progress}

\begin{frame}
    \frametitle{Progress}
    % TODO: content pending.
\end{frame}
\begin{frame}
    \frametitle{On-demand Coherency in ARM64}
    \begin{itemize}
        \item {
            ARMv8 defines two levels of cache coherence:
            \begin{itemize}
                \item {
                    \textit{Point-of-Unification}:
                    % TODO: definition pending.
                }
                \item {
                    \textit{Point-of-Coherence}:
                    % TODO: definition pending.
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}
% Placeholder frames for the progress discussion; bodies to be written.
\begin{frame}
    \frametitle{Kernel Patch for On-demand Coherency}
    % TODO: content pending.
\end{frame}

\begin{frame}
    \frametitle{Proof-of-Concept Kernel Module}
    % TODO: content pending.
\end{frame}
% References
% biblatex is loaded and \cite is used above, but the bibliography was never
% emitted; print it here so cited entries actually appear in the slides.
\begin{frame}[allowframebreaks]
    \frametitle{References}
    \printbibliography
\end{frame}

\end{document}
|
||||
Loading…
Add table
Add a link
Reference in a new issue