preparing for feb 1st

This commit is contained in:
Zhengyi Chen 2024-01-29 21:19:03 +00:00
parent 6c5b3626e1
commit 169382407d
2 changed files with 270 additions and 0 deletions

tex/misc/w15_slides.pdf (new binary file, not shown)

tex/misc/w15_slides.tex (new file, 270 lines)
\documentclass{beamer}
\usepackage{biblatex}
\usepackage[export]{adjustbox}
\title{
Cache Coherency \& Memory Model in RDMA-Backed Software-Coherent DSM
}
\author{Zhengyi Chen}
\date{\today}
\addbibresource{../main.bib}
\begin{document}
% Title Page
\frame{\titlepage}
% Table of Contents
\begin{frame}
\frametitle{Table of Contents}
\tableofcontents
\end{frame}
% Part 1: Overview
% =============================================================================
\section{1. Overview}
% Page 1
\begin{frame}
\frametitle{1. Overview}
\begin{itemize}
\item {
DSM used to be constrained by NIC bandwidth \& transfer rate (e.g.,
during the 1990s).
}
\item {
The advent of high(er) transfer rate NICs allows the DSM idea to be
revived.
}
\item {
Orthogonally, hardware acceleration resources are scarce and highly
valuable.
\begin{itemize}
\item {
Traditional scheduling mechanisms within a cluster cannot
dynamically allocate hardware accelerators without high
overhead.
}
\end{itemize}
}
\item {
Ideally, via high-speed NICs, hardware accelerators could be
statically allocated such that:
\begin{itemize}
\item {
Every node has access to the hardware accelerator node in a
time-shared fashion.
}
\item {
The accelerator-attached node can access remote memory much as
if the accelerator were locally attached over, say, PCIe.
}
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Heterogeneous Memory Management}
\begin{itemize}
\item {
\textbf{HMM} facilitates shared address space and transparent data
migration between CPU and peripherals. Specifically:
\begin{itemize}
\item {
HMM provides an interface for mirroring the CPU page table
into the device's page table, with the two kept transparently
synchronized.
}
\item {
It also provides a corresponding \texttt{struct page}
representation for device memory pages, which are migrated
between the CPU and the device via page faults.
}
\end{itemize}
}
\item {
Theoretically, this should allow devices on remote nodes to
participate in HMM, using the DMA-capable NIC as a ``proxy HMM
device''.
}
\item {
The implementation details of DSM-over-HMM are beyond the
scope of this thesis.
\begin{itemize}
\item {
This thesis focuses on studying and implementing cache
coherency and later, memory model for the DSM subsystem of
this wider, ongoing project.
}
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Cache Coherency, and Why It Matters Here}
\begin{itemize}
\item {
Cache-incoherent RDMA NICs (e.g., \texttt{mlx} devices)
perform DMA without synchronizing with the CPU cache.
}
\item {
We cannot assume that the MMU will magically maintain coherence
on our behalf.
}
\item {
At transfer time:
\begin{itemize}
\item {
Send to remote: flush the cache to memory before posting
the send request.
}
\item {
Receive from remote: invalidate the cache entries after the
recv request completes.
}
\end{itemize}
}
\item {
Example: Linux kernel tree, \textit{smbdirect} implementation.
\begin{itemize}
\item {
\textit{smbdirect} opportunistically establishes SMB sessions
over an RDMA-capable network.
}
\item {
\texttt{smbd\_post\_send} cleans the cache prior to posting
a send request.
}
\item {
\texttt{recv\_done} (the RDMA driver's receive-completion
callback) invalidates the cache after the recv request is
handled in softirq context.
}
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Consistency Model and Protocol}
\begin{itemize}
\item {
The majority of the DSM literature adopts \textbf{release
consistency} as the system's memory model.
}
\item {
With a \textbf{single-writer} protocol, however, the memory model
can be strengthened with little increase in code complexity.
\begin{itemize}
\item {
\textit{DSPM}~\cite{shan2017distributed}, for example,
achieves \textit{de facto} TSO consistency on top of its
multi-writer release-consistency counterpart -- assuming
correct memory barriers within each node's CPU, distributed
writes are never reordered, while distributed reads may
overtake writes.
\item {
Consequently, one can easily achieve sequential consistency
by designating the entire write-access duration as a critical
section.
}
\end{itemize}
}
\item {
HMM's ``CPU-or-device'' data migration model also strongly implies
a single-writer consistency protocol.
}
\end{itemize}
\end{frame}
% Part 2: Design
% =============================================================================
\section{2. Design}
\begin{frame}
\frametitle{2. Design}
\begin{itemize}
\item {
Designing a DSM necessitates designing:
\begin{itemize}
\item Consistency Model.
\item Coherence Protocol and State Machine.
\item Access Control.
\end{itemize}
}
\item {
Care needs to be taken to ensure that the in-kernel implementation
is:
\begin{itemize}
\item Correct,
\item Performant,
\item Able to exploit RDMA's traits.
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Consistency Model}
\end{frame}
\begin{frame}
\frametitle{Coherence Protocol}
\end{frame}
\begin{frame}
\frametitle{Stateful Nodes}
\end{frame}
% Part 3: Progress
% =============================================================================
\section{3. Progress}
\begin{frame}
\frametitle{Progress}
\end{frame}
\begin{frame}
\frametitle{On-demand Coherency in ARM64}
\begin{itemize}
\item {
ARMv8 defines two points up to which cache maintenance can
be performed:
\begin{itemize}
\item {
\textit{Point-of-Unification (PoU)}: the point at which a
core's instruction cache, data cache, and page-table walks
see the same copy of a location (\texttt{dc cvau} cleans to
here).
}
\item {
\textit{Point-of-Coherence (PoC)}: the point at which all
observers, including cores and DMA-capable devices, see the
same copy (\texttt{dc cvac}/\texttt{dc civac} clean and/or
invalidate to here). Maintenance for a cache-incoherent NIC
must therefore target the PoC.
}
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Kernel Patch for On-demand Coherency}
\end{frame}
\begin{frame}
\frametitle{Proof-of-Concept Kernel Module}
\end{frame}
% References
\begin{frame}[allowframebreaks]
\frametitle{References}
\printbibliography
\end{frame}
\end{document}