\documentclass{beamer}
\usepackage[]{biblatex}
\usepackage[export]{adjustbox}
\usepackage{hyperref}

\title{
    Cache Coherency \& Memory Model in RDMA-Backed Software-Coherent DSM
}
\author{Zhengyi Chen}
\date{\today}

\addbibresource{../main.bib}

\begin{document}
% Title Page
\frame{\titlepage}

% Table of Content
\begin{frame}
    \frametitle{Table of Contents}
    \tableofcontents
\end{frame}

% Part 1: Overview
% =============================================================================
\section{1. Overview}
% Page 1
\begin{frame}
    \frametitle{1. Overview}
    \begin{itemize}
        \item {
            DSM was historically constrained by NIC bandwidth \& transfer
            rates (e.g., during the 1990s).
        }
        \item {
            The advent of higher-transfer-rate NICs allows the DSM idea to
            be revived.
        }
        \item {
            Orthogonally, hardware acceleration resources are scarce and
            highly valuable.
            \begin{itemize}
                \item {
                    Traditional scheduling mechanisms within a cluster
                    cannot dynamically allocate hardware accelerators
                    without high overhead.
                }
            \end{itemize}
        }
        \item {
            Ideally, via high-speed NICs, hardware accelerators could be
            statically allocated such that:
            \begin{itemize}
                \item {
                    every node has access to the hardware accelerator node
                    in a time-shared fashion;
                }
                \item {
                    the accelerator-attached node can access remote memory
                    much as if the accelerator were attached locally over,
                    say, PCIe.
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}

\begin{frame}
    \frametitle{Heterogeneous Memory Management}
    \begin{itemize}
        \item {
            \textbf{HMM} facilitates a shared address space and transparent
            data migration between the CPU and peripherals. Specifically:
            \begin{itemize}
                \item {
                    HMM provides an interface for mirroring the CPU page
                    table into the device's page table, with the two kept
                    transparently synchronized (sketched on the next slide).
                }
                \item {
                    It also provides a corresponding \texttt{struct page}
                    representation of device memory pages, which are faulted
                    back and forth between the CPU and the device.
                }
            \end{itemize}
        }
        \item {
            Theoretically, this should allow devices on remote nodes to
            perform HMM using the DMA-capable NIC as a ``proxy HMM device''.
        }
        \item {
            The implementation details of DSM-over-HMM are beyond this
            thesis's scope.
            \begin{itemize}
                \item {
                    This thesis focuses on studying and implementing cache
                    coherency and, later, a memory model for the DSM
                    subsystem of this wider, ongoing project.
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}
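
\begin{frame}[fragile]
    \frametitle{HMM Mirroring: Minimal Sketch}
    A minimal sketch of the canonical HMM mirroring loop, assuming the
    post-5.8 kernel API and an already-registered
    \texttt{mmu\_interval\_notifier}; it is illustrative only, not code
    from this project.
\begin{verbatim}
#include <linux/hmm.h>
#include <linux/mmu_notifier.h>
#include <linux/mm.h>

/* Fault one page of a user range into a device-visible PFN. */
static int mirror_one_page(struct mm_struct *mm,
                           struct mmu_interval_notifier *ni,
                           unsigned long addr, unsigned long *pfn)
{
    struct hmm_range range = {
        .notifier      = ni,
        .start         = addr,
        .end           = addr + PAGE_SIZE,
        .hmm_pfns      = pfn,
        .default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
    };
    int ret;

    do {    /* retry if the range was invalidated concurrently */
        range.notifier_seq = mmu_interval_read_begin(ni);
        mmap_read_lock(mm);
        ret = hmm_range_fault(&range);  /* fills *pfn on success */
        mmap_read_unlock(mm);
        if (ret && ret != -EBUSY)
            return ret;
    } while (ret == -EBUSY ||
             mmu_interval_read_retry(ni, range.notifier_seq));
    return 0;
}
\end{verbatim}
\end{frame}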

\begin{frame}
    \frametitle{Cache Coherency, and Why It Matters Here}
    \begin{itemize}
        \item {
            Cache-incoherent RDMA NICs (e.g., mlx) perform DMA without
            synchronizing with the CPU cache.
        }
        \item {
            We cannot assume the MMU magically maintains coherence.
            \begin{itemize}
                \item {
                    This seems to be the case for x86\_64 (cache-coherent
                    DMA), but not for ARM64.
                }
            \end{itemize}
        }
        \item {
            At transfer time (see the sketch on the next slide):
            \begin{itemize}
                \item {
                    Send to remote: flush the cache to memory before posting
                    the send message.
                }
                \item {
                    Receive from remote: invalidate cache entries after the
                    recv completion has been processed.
                }
            \end{itemize}
        }
        \item {
            Example: Linux kernel tree, \textit{smbdirect} implementation.
            \begin{itemize}
                \item {
                    \textit{smbdirect} opportunistically establishes SMB
                    over an RDMA-capable network.
                }
                \item {
                    \texttt{smbd\_post\_send} cleans cache entries prior to
                    posting the send request.
                }
                \item {
                    \texttt{recv\_done} invalidates cache entries after
                    exiting the softirq for the recv request (as a callback
                    from the RDMA driver).
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}
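
\begin{frame}[fragile]
    \frametitle{Sketch: Flush-Before-Send, Invalidate-After-Recv}
    A sketch of the same rule expressed through the generic streaming DMA
    API rather than \textit{smbdirect}'s own helpers; \texttt{dev},
    \texttt{send\_buf}, and the function names are placeholders.
\begin{verbatim}
#include <linux/device.h>
#include <linux/dma-mapping.h>

/* Send path: mapping for the device writes dirty CPU cache lines
 * back, so the NIC's DMA read sees the latest stores. */
static void dsm_prepare_send(struct device *dev, void *send_buf,
                             size_t len, dma_addr_t *tx)
{
    *tx = dma_map_single(dev, send_buf, len, DMA_TO_DEVICE);
    /* ... post the RDMA send work request referencing *tx ... */
}

/* Recv path: handing the buffer back to the CPU invalidates stale
 * CPU cache lines on non-coherent platforms (e.g. ARM64) before
 * the CPU reads what the NIC wrote. */
static void dsm_complete_recv(struct device *dev, dma_addr_t rx,
                              size_t len)
{
    dma_sync_single_for_cpu(dev, rx, len, DMA_FROM_DEVICE);
}
\end{verbatim}
\end{frame}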

\begin{frame}
    \frametitle{Consistency Model and Protocol}
    \begin{itemize}
        \item {
            The majority of DSM literature applies \textbf{release
            consistency} as the system's memory model.
        }
        \item {
            With a \textbf{single-writer} protocol, however, the memory
            model can be strengthened with little increase in code
            complexity.
            \begin{itemize}
                \item {
                    \textit{DSPM}~\cite{shan2017distributed}, for example,
                    achieves \textit{de facto} TSO consistency relative to
                    its multi-writer release-consistency counterpart --
                    assuming correct memory barriers within each node's CPU,
                    distributed writes are never reordered, while
                    distributed reads can overtake writes.
                }
                \item {
                    Consequently, one can easily achieve sequential
                    consistency by designating the entire write-access
                    duration as a critical section (see the sketch on the
                    next slide).
                }
            \end{itemize}
        }
        \item {
            HMM's ``CPU-or-device'' data migration model also strongly
            implies a single-writer consistency protocol.
        }
    \end{itemize}
\end{frame}
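
\begin{frame}[fragile]
    \frametitle{Sketch: Single-Writer Critical Section}
    A sketch of how a write would look under the single-writer rule. The
    API (\texttt{dsm\_write\_acquire}, \texttt{dsm\_write\_commit},
    \texttt{struct dsm\_page}) is hypothetical and only illustrates the
    idea, not this project's actual interface.
\begin{verbatim}
/* Hypothetical DSM API: acquire exclusivity, mutate, commit. */
static int dsm_set_word(struct dsm_page *pg, int value)
{
    int ret;

    ret = dsm_write_acquire(pg);   /* invalidate all other sharers */
    if (ret)
        return ret;

    *(int *)pg->vaddr = value;     /* sole writer, exclusive copy  */

    return dsm_write_commit(pg);   /* make the change global       */
}

/* Treating the whole acquire..commit span as one critical section
 * is what strengthens the model from TSO toward sequential
 * consistency. */
\end{verbatim}
\end{frame}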

% Part 2: Design
% =============================================================================
\section{2. Design}

\begin{frame}
    \frametitle{2. Design}
    \begin{itemize}
        \item {
            Designing a DSM necessitates designing:
            \begin{itemize}
                \item Consistency Model.
                \item Coherence Protocol and State Machine.
                \item Access Control.
            \end{itemize}
        }
        \item {
            Care needs to be taken to ensure that the in-kernel
            implementation:
            \begin{itemize}
                \item is correct,
                \item is performant,
                \item exploits RDMA's traits.
            \end{itemize}
        }
    \end{itemize}
\end{frame}

\begin{frame}
    \frametitle{Protocol Overview}
    \begin{itemize}
        \item {
            Multiple readers can exist for a clean page -- the page is
            \textbf{shared}.
        }
        \item {
            Only one writer is allowed for a clean page -- the page becomes
            \textbf{exclusive}.
        }
        \item {
            For one writer node to be allowed sole write access to some
            page, all other sharers need to have their cached copy of the
            page invalidated prior to making the change global
            (commit-invalidate).
        }
        \item {
            While the sole writer node has not yet committed, either:
            \begin{itemize}
                \item {
                    no other reader or writer nodes are allowed to be served
                    this page (stronger consistency model), or
                }
                \item {
                    no writers are allowed to be served this page; readers
                    can be served stale data (provided the data providers do
                    not receive the invalidation message prior to service).
                }
            \end{itemize}
        }
        \item {
            When the sole writer commits, it becomes the sole home node
            (data provider), which serves the updated page content (the
            per-page state this implies is sketched on the next slide).
            \begin{itemize}
                \item {
                    Optionally, some nodes can register to have commits
                    written back to them instead.
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}
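
\begin{frame}[fragile]
    \frametitle{Sketch: Per-Page Coherence State}
    A sketch of the per-page metadata the protocol above implies; the
    names are illustrative and not taken from the actual module.
\begin{verbatim}
#include <linux/types.h>

enum dsm_page_state {
    DSM_INVALID,    /* no valid local copy                        */
    DSM_SHARED,     /* clean copy, possibly many reader nodes     */
    DSM_EXCLUSIVE,  /* sole writer holds it, not yet committed    */
    DSM_TRANSIENT,  /* "T"-state: invalidation/commit in flight   */
};

struct dsm_page_meta {
    unsigned long       pfn;          /* backing local page frame */
    enum dsm_page_state state;
    int                 owner_node;   /* current writer, if any   */
    u64                 sharer_mask;  /* which nodes hold copies  */
};
\end{verbatim}
    On commit, for example, the owner would move
    \texttt{DSM\_EXCLUSIVE} $\rightarrow$ \texttt{DSM\_SHARED} while the
    other sharers sit at \texttt{DSM\_INVALID} until they re-fetch.
\end{frame}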

\begin{frame}
    \frametitle{Protocol Excerpt: Write-Invalidate}
    \begin{figure}
        \centering
        \includegraphics[width=\linewidth]{
            w12_slides_resources/Fig-RwlockProtocol 2023-12-06 19_05_06.pdf
        }
    \end{figure}
    The \textit{T}-state indicates a transitional state for a shared page.
\end{frame}

\begin{frame}
    \frametitle{Consistency Model: TSO}
    \begin{itemize}
        \item {
            Total Store Ordering allows reads to overtake earlier stores
            (see the litmus test on the next slide).
        }
        \item {
            Assuming correct use of node-local synchronization on all
            nodes, applying TSO in a home-based DSM means:
            \begin{itemize}
                \item {
                    another node reading a T-page from the access-control
                    node may be served stale data: a W$\rightarrow$R
                    reordering, permitted under TSO;
                }
                \item {
                    another node reading an S-page from a data-provider
                    node may be served stale data: a W$\rightarrow$R
                    reordering (if, e.g., the invalidation message from the
                    access-control node is received afterwards);
                }
                \item {
                    data-provider and access-control nodes work on one
                    request at a time: no R$\rightarrow$W violation;
                }
                \item {
                    write accesses are serialized at the access-control
                    node: no W$\rightarrow$W violation.
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}
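
\begin{frame}[fragile]
    \frametitle{Sketch: The Reordering TSO Permits}
    The classic store-buffering litmus test, written here in plain
    user-space C with pthreads; it illustrates the memory model itself,
    not this project's code.
\begin{verbatim}
#include <pthread.h>
#include <stdio.h>

volatile int x = 0, y = 0;
int r1, r2;

static void *t0(void *arg) { x = 1; r1 = y; return NULL; } /* W x; R y */
static void *t1(void *arg) { y = 1; r2 = x; return NULL; } /* W y; R x */

int main(void)
{
    pthread_t a, b;

    pthread_create(&a, NULL, t0, NULL);
    pthread_create(&b, NULL, t1, NULL);
    pthread_join(a, NULL);
    pthread_join(b, NULL);

    /* Under sequential consistency, r1 == 0 && r2 == 0 is impossible.
     * Under TSO, each thread's read may complete before its own
     * buffered write becomes visible, so both reads can observe 0. */
    printf("r1=%d r2=%d\n", r1, r2);
    return 0;
}
\end{verbatim}
\end{frame}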

\begin{frame}
    \frametitle{Consistency Model: Strengthen to Sequential}
    \begin{itemize}
        \item {
            As a corollary, the previous slide's allowances can be reversed
            to strengthen the model to sequential consistency:
            \begin{itemize}
                \item {
                    disallow T-pages from being serviced until the new page
                    content is installed: lengthens the critical section;
                }
                \item {
                    abolish data-provider nodes: the access-control nodes
                    become a bottleneck.
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}

\begin{frame}
    \frametitle{Coherence Protocol: Possible Features}
    \begin{itemize}
        \item {
            Multi-data-provider protocol: instead of having one data
            provider, have multiple data-provider nodes that automatically
            receive write-backs, preventing a network bottleneck.
            \begin{itemize}
                \item Data-provider nodes may be dynamically assigned.
                \item Extra metadata can limit scalability.
            \end{itemize}
        }
        \item {
            Auto-share: likewise, write pages back to non-data-provider
            nodes, taking advantage of RDMA's one-sided communication.
        }
        \item {
            Request aggregation: aggregate RDMA transfers for optimal
            transfer performance (see the sketch on the next slide).
            \begin{itemize}
                \item Needs to stay coherent with the program sequence!
                \item Enables write-request merging.
            \end{itemize}
        }
    \end{itemize}
\end{frame}
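
\begin{frame}[fragile]
    \frametitle{Sketch: Aggregating RDMA Writes}
    A sketch of request aggregation via the kernel verbs API: chaining
    work requests so that one doorbell posts several RDMA WRITEs, which a
    QP executes in posting order. The function and parameter names are
    placeholders; SGEs, rkeys, and remote addresses are assumed to be
    filled in by registration code elsewhere.
\begin{verbatim}
#include <rdma/ib_verbs.h>

static int dsm_post_write_pair(struct ib_qp *qp,
                               struct ib_rdma_wr *w0,
                               struct ib_rdma_wr *w1)
{
    const struct ib_send_wr *bad_wr;

    w0->wr.opcode     = IB_WR_RDMA_WRITE;
    w1->wr.opcode     = IB_WR_RDMA_WRITE;
    w0->wr.next       = &w1->wr;          /* chain the two WRs     */
    w1->wr.next       = NULL;
    w1->wr.send_flags = IB_SEND_SIGNALED; /* only the tail signals */

    /* One ib_post_send() call, two writes, in program order. */
    return ib_post_send(qp, &w0->wr, &bad_wr);
}
\end{verbatim}
\end{frame}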

\begin{frame}
    \frametitle{Stateful Nodes \& Transitions (Provisional)}
    \begin{itemize}
        \item {
            Nodes within the cluster become tightly bound to the properties
            of each shared page.
        }
    \end{itemize}
    \begin{figure}
        \centering
        \includegraphics[width=\linewidth]{
            w15_resources/截屏 2024-01-30 19.15.45 2024-01-30 19_16_19.png
        }
    \end{figure}
\end{frame}

\begin{frame}
    \frametitle{Stateful Nodes \& Transitions (Provisional) (Cont.)}
    \begin{itemize}
        \item {
            MN (Manager Nodes): provide access control and (fallback) data
            provision.
        }
        \item {
            HN (Home Nodes): provide data provision. Can be write-back or
            write-invalidate.
        }
        \item {
            SN (Sharer Nodes): share data within a reader-only ``epoch''.
            Can be write-back or write-invalidate.
        }
        \item {
            NSN (Non-sharer Nodes): nodes in the network that do not share
            the particular page(s).
        }
        \item {
            CN (Commit Node): the node that has acquired single-writer
            access to the shared page.
        }
        \item {
            Problem: message variants are not finalized.
            \begin{itemize}
                \item {
                    Goal: composable message chains that allow for
                    ``piggy-backing'' of multiple procedures (see the
                    sketch on the next slide).
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}
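
\begin{frame}[fragile]
    \frametitle{Sketch: Piggy-Backed Message Records}
    One way the not-yet-finalized message format could support
    piggy-backing: type-tagged records chained by offset inside a single
    RDMA message. Entirely hypothetical; field and type names are
    placeholders.
\begin{verbatim}
#include <linux/types.h>

enum dsm_msg_type {
    DSM_MSG_READ_REQ,
    DSM_MSG_WRITE_REQ,
    DSM_MSG_INVALIDATE,
    DSM_MSG_COMMIT,
    DSM_MSG_ACK,
};

struct dsm_msg_rec {
    __le16 type;       /* enum dsm_msg_type                        */
    __le16 next_off;   /* offset of next piggy-backed record, or 0 */
    __le32 page_idx;   /* which shared page this record refers to  */
    __le64 epoch;      /* ordering/validity check at the receiver  */
} __attribute__((packed));

/* A commit could then carry its invalidations in the same message:
 * [COMMIT rec] -> [INVALIDATE rec] -> [INVALIDATE rec] -> 0. */
\end{verbatim}
\end{frame}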

\begin{frame}
    \frametitle{Stateful Nodes: Transition Paths}
    \begin{itemize}
        \item {
            Solid-line transitions indicate that the local node requests
            the remote node to perform the state transition.
        }
        \item {
            Dashed-line transitions indicate that the local node
            transitions implicitly prior to sending the request to the
            remote node.
        }
        \item {
            The \textit{non-committal} path concerns read-only and
            copy-on-write sharing. Sharers cannot make global modifications
            to locally cached data.
        }
        \item {
            The \textit{invalidation} path is paired with commit operations
            (due to write-invalidation).
        }
        \item {
            The \textit{committal} path concerns global write sharing. Only
            one writer is allowed to write and commit at a time.
        }
        \item {
            Problem: how exactly to integrate RDMA remote read/write into
            this?
        }
    \end{itemize}
\end{frame}

% Part 3: Progress
% =============================================================================
\section{3. Progress}

\begin{frame}
    \frametitle{3. Progress}
    \begin{itemize}
        \item {
            Goal: in-kernel implementation of software cache coherency over
            non-coherent RDMA hardware.
        }
        \item {
            Optimistic goal: in-kernel implementation of the DSM's memory
            model.
        }
        \item {
            Progress: studied and isolated the mechanism for data-cache
            invalidation/flushing on ARM64, which allows the DSM to run in
            heterogeneous-ISA clusters.
        }
        \item {
            Integration with the kernel \& the main DSM kernel module
            remains at hand: is it absolutely necessary to export new
            symbols for such an important operation?
        }
        \item {
            Repository: \url{https://github.com/rubberhead/unnamed_ba_thesis.git}.
        }
    \end{itemize}
\end{frame}

\begin{frame}
    \frametitle{On-demand Coherency in ARM64}
    \begin{itemize}
        \item {
            ARMv8 defines two levels of cache coherence:
            \begin{itemize}
                \item {
                    \textit{Point-of-Unification}: within a core, the
                    instruction cache, data cache, and TLB all agree on the
                    copy seen for a particular address.
                    \begin{itemize}
                        \item Notably, changing a PTE requires maintenance
                              to the PoU.
                    \end{itemize}
                }
                \item {
                    \textit{Point-of-Coherence}: all DMA-capable observers
                    (CPU or otherwise) agree on the copy seen for a
                    particular address.
                }
            \end{itemize}
            For this thesis's purposes, we strive for PoC.
        }
        \item {
            Operations to achieve the latter are encapsulated in the Linux
            kernel as \texttt{(d|i)cache\_(clean|inval)\_poc} (see the
            usage sketch on the next slide).
            \begin{itemize}
                \item Declared under
                      \texttt{arch/arm64/include/asm/cacheflush.h}.
                \item Defined in \texttt{arch/arm64/mm/cache.S}.
                \item {
                    They take virtual addresses w.r.t.\ the
                    \textit{current} address space and write back/invalidate
                    the corresponding cache entries.
                }
                \item {
                    Problem: can they only be called in process context
                    (for user virtual addresses), or in all contexts (for
                    kernel virtual addresses)?
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}
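
\begin{frame}[fragile]
    \frametitle{Sketch: Maintaining to PoC Around a DMA Transfer}
    A sketch of how the DSM module would use these helpers on a kernel
    virtual address, assuming they were callable from module code (they
    are not exported in mainline -- see the next slide). Function names
    here are placeholders.
\begin{verbatim}
#include <linux/mm.h>
#include <asm/cacheflush.h>

/* Before the NIC DMA-reads this page (e.g. to serve a remote read):
 * write dirty lines back to the Point of Coherency. */
static void dsm_clean_page_for_device(void *kaddr)
{
    unsigned long start = (unsigned long)kaddr;

    dcache_clean_poc(start, start + PAGE_SIZE);
}

/* After the NIC DMA-writes this page (e.g. a received update):
 * drop stale lines so the CPU re-reads from memory. */
static void dsm_inval_page_for_cpu(void *kaddr)
{
    unsigned long start = (unsigned long)kaddr;

    dcache_inval_poc(start, start + PAGE_SIZE);
}
\end{verbatim}
\end{frame}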

\begin{frame}
    \frametitle{Kernel Patch for On-demand Coherency}
    \begin{itemize}
        \item {
            Problem: these symbols are not exported -- they are not
            intended for driver use.
        }
        \item {
            Temporary solution: re-export them by patching the kernel.
            \begin{itemize}
                \item Note: kernel version v6.7.0.
                \item {
                    Longer-term solution: arrange the kernel module code so
                    that it takes advantage of existing driver APIs (e.g.,
                    the DMA API, which \textit{smbdirect} uses).
                }
            \end{itemize}
        }
        \item {
            Implements a wrapper function \texttt{\_\_dcache\_clean\_poc}
            to re-export \texttt{dcache\_clean\_poc} into the driver
            namespace (sketched on the next slide).
        }
        \item {
            Exports the symbol via a separate header file.
            \begin{itemize}
                \item {
                    Declared in
                    \texttt{arch/arm64/include/asm/cacheflush\_extra.h}.
                }
                \item Defined in \texttt{arch/arm64/mm/flush.c}.
            \end{itemize}
        }
    \end{itemize}
\end{frame}
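
\begin{frame}[fragile]
    \frametitle{Sketch: Shape of the Re-export Patch}
    A sketch of what such a patch plausibly looks like; the exact diff is
    not reproduced here, and the choice of \texttt{EXPORT\_SYMBOL\_GPL}
    is an assumption.
\begin{verbatim}
/* arch/arm64/include/asm/cacheflush_extra.h */
#ifndef __ASM_CACHEFLUSH_EXTRA_H
#define __ASM_CACHEFLUSH_EXTRA_H

void __dcache_clean_poc(unsigned long start, unsigned long end);

#endif

/* arch/arm64/mm/flush.c: thin wrapper so that modules can reach
 * the unexported assembly routine from cache.S. */
#include <linux/export.h>
#include <asm/cacheflush.h>

void __dcache_clean_poc(unsigned long start, unsigned long end)
{
    dcache_clean_poc(start, end);
}
EXPORT_SYMBOL_GPL(__dcache_clean_poc);
\end{verbatim}
\end{frame}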

\begin{frame}
    \frametitle{Proof-of-Concept Kernel Module}
    \begin{itemize}
        \item {
            Dynamically allocates \texttt{GFP\_USER} pages and remaps them
            to userspace on \texttt{mmap}.
            \begin{itemize}
                \item {
                    \texttt{GFP\_USER} so that (for convenience) the pages
                    are directly addressable in kernel space (via the
                    kernel page table).
                }
                \item {
                    Pages are lazily allocated and shared between multiple
                    processes (i.e., user address spaces).
                }
                \item {
                    Exposed as the character device \texttt{/dev/my\_shmem}.
                }
            \end{itemize}
        }
        \item Around 300+ LoC.
        \item {
            Problem: flawed premise for testing cache writeback!
            \begin{itemize}
                \item {
                    Summary: the CPU datapath differs from the DMA
                    datapath, and the usual cache-maintenance operations
                    are already performed in the common file/virtual-memory-area
                    operation code.
                }
                \item {
                    Idea: perform the cache write-back on
                    \texttt{vm\_ops->close}.
                }
                \item {
                    Reality: the virtual memory area has already been
                    cleaned from the cache and removed from the address
                    space before \texttt{vm\_ops->close} is called.
                }
                \item {
                    Fix: implement a custom \texttt{ioctl}? (See the sketch
                    on the next slide.)
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}
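
\begin{frame}[fragile]
    \frametitle{Sketch: Explicit Flush via \texttt{ioctl}}
    A sketch of the proposed fix: let userspace ask the module to clean
    the shared pages while the mapping still exists, instead of relying
    on \texttt{vm\_ops->close}. \texttt{MY\_SHMEM\_FLUSH},
    \texttt{shmem\_buf}, and \texttt{shmem\_len} are illustrative names,
    and \texttt{\_\_dcache\_clean\_poc} is the re-exported wrapper from
    the patch above.
\begin{verbatim}
#include <linux/fs.h>
#include <linux/ioctl.h>
#include <asm/cacheflush_extra.h>

#define MY_SHMEM_FLUSH _IO('S', 1)

/* shmem_buf/shmem_len: kernel VA and size of the shared buffer,
 * set up elsewhere in the module. */
extern void *shmem_buf;
extern size_t shmem_len;

static long my_shmem_ioctl(struct file *filp, unsigned int cmd,
                           unsigned long arg)
{
    unsigned long start = (unsigned long)shmem_buf;

    switch (cmd) {
    case MY_SHMEM_FLUSH:
        __dcache_clean_poc(start, start + shmem_len);  /* write back */
        return 0;
    default:
        return -ENOTTY;
    }
}
\end{verbatim}
\end{frame}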

% Part 4: Future Work
% =============================================================================
\section{4. Future Work}

\begin{frame}
    \frametitle{4. Future Work}
    \begin{itemize}
        \item {
            TBD:
            \begin{enumerate}
                \item {
                    Incorporate the cache-coherence mechanism into the
                    larger project.
                }
                \item {
                    Implement the memory model within the larger project.
                    This involves:
                    \begin{itemize}
                        \item {
                            adjusting the message type and structure
                            specifications for better interoperation with
                            RDMA;
                        }
                        \item {
                            implementing the memory model programmatically.
                        }
                    \end{itemize}
                }
            \end{enumerate}
        }
        \item {
            Further studies:
            \begin{enumerate}
                \item {
                    Swappable RDMA memory regions.
                    \begin{itemize}
                        \item {
                            As of now, all DMA pages are non-swappable --
                            they must be allocated from kernel memory via
                            the SLAB/SLUB allocator or the GFP page
                            allocators.
                        }
                    \end{itemize}
                }
                \item {
                    Automatic frequent-sharer detection for multiplexing
                    between commit-invalidate and commit-writeback.
                }
            \end{enumerate}
        }
    \end{itemize}
\end{frame}

% References
\begin{frame}
    \frametitle{References}
    \printbibliography
\end{frame}

\end{document}