% unnamed_ba_thesis/tex/misc/w15_slides.tex
% NOTE: the following file-listing metadata is commented out because any
% non-comment text before \documentclass breaks compilation.
% 556 lines, No EOL, 18 KiB, TeX
\documentclass{beamer}
\usepackage[]{biblatex}
\usepackage[export]{adjustbox}
\title{
Cache Coherency \& Memory Model in RDMA-Backed Software-Coherent DSM
}
\author{Zhengyi Chen}
\date{\today}
\addbibresource{../main.bib}
\begin{document}
% Title Page
\frame{\titlepage}
% Table of Content
\begin{frame}
\frametitle{Table of Contents}
\tableofcontents
\end{frame}
% Part 1: Overview
% =============================================================================
\section{1. Overview}
% Page 1
\begin{frame}
\frametitle{1. Overview}
\begin{itemize}
\item {
DSM used to be constrained by NIC bandwidth \& transfer rate (e.g.,
during the 1990s).
}
\item {
The advent of high(er) transfer rate NICs allows the DSM idea to be
revived.
}
\item {
Orthogonally, hardware acceleration resources are scarce and highly
valuable.
\begin{itemize}
\item {
Traditional Scheduling Mechanisms within a Cluster cannot
dynamically allocate hardware accelerators without high
overhead.
}
\end{itemize}
}
\item {
Ideally, via high-speed NICs, hardware accelerators could be
statically allocated such that:
\begin{itemize}
\item {
Every node has access to the hardware accelerator node in a
time-shared fashion.
}
\item {
Accelerator-attached node can access remote memory much like
attaching accelerator over, say, PCIe.
}
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Heterogeneous Memory Management}
\begin{itemize}
\item {
\textbf{HMM} facilitates shared address space and transparent data
migration between CPU and peripherals. Specifically:
\begin{itemize}
\item {
HMM provides interface for duplicating the CPU page table
with that of the device's, which are transparently
synchronized.
}
\item {
It also provides corresponding \texttt{struct page}
representation of device memory pages, which are faulted
between the CPU and device.
}
\end{itemize}
}
\item {
Theoretically, this should allow for devices in remote nodes to
perform HMM using the DMA-capable NIC as a ``proxy HMM device''.
}
\item {
Details of implementation of DSM-over-HMM is beyond this thesis's
scope.
\begin{itemize}
\item {
This thesis focuses on studying and implementing cache
coherency and later, memory model for the DSM subsystem of
this wider, ongoing project.
}
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Cache Coherency, and Why It Matters Here}
\begin{itemize}
\item {
Cache-incoherent RDMA (e.g., mlx) performs DMA without
synchronization with CPU cache.
}
\item {
We cannot assume the MMU magically maintains coherence.
\begin{itemize}
\item {
This seems the case for x86\_64 (cache-coherent DMA), but
not ARM64.
}
\end{itemize}
}
\item {
At transportation time:
\begin{itemize}
\item {
Send to remote: flushes cache into memory before posting
send message.
}
\item {
Receive from remote: invalidates the cache entry after the
recv message has been processed.
}
\end{itemize}
}
\item {
Example: Linux kernel tree, \textit{smbdirect} implementation.
\begin{itemize}
\item {
\textit{smbdirect} opportunistically establishes SMB over an
RDMA-capable network.
}
\item {
\texttt{smbd\_post\_send} cleans cache entry prior to posting
send request.
}
\item {
\texttt{recv\_done} invalidates cache entry after exiting
softirq for recv request (as callback from RDMA driver).
}
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Consistency Model and Protocol}
\begin{itemize}
\item {
The majority of DSM literature applies \textbf{release consistency}
as the system's memory model.
}
\item {
With \textbf{single-writer} protocol, however, the memory model can
be strengthened with little increase in code complexity.
\begin{itemize}
\item {
\textit{DSPM}~\cite{shan2017distributed}, for example,
achieves a \textit{de-facto} TSO consistency from its
multi-writer release consistency counterpart -- assuming
correct memory barriers within each node's CPU, distributed
writes are never reordered, and distributed reads can
overtake writes.
}
\item {
Consequently, one can easily achieve sequential consistency
by designating the entire write-access duration as a critical
section.
}
\end{itemize}
}
\item {
HMM's ``CPU-or-device'' data migration model also strongly implies
a single-writer consistency protocol.
}
\end{itemize}
\end{frame}
% Part 2: Design
% =============================================================================
\section{2. Design}
\begin{frame}
\frametitle{2. Design}
\begin{itemize}
\item {
Designing a DSM necessitates designing:
\begin{itemize}
\item Consistency Model.
\item Coherence Protocol and State Machine.
\item Access Control.
\end{itemize}
}
\item {
Care needs to be taken to ensure that the in-kernel implementation
is:
\begin{itemize}
\item Correct,
\item Performant,
\item Exploits RDMA's traits.
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Protocol Excerpt: Write-Invalidate}
\begin{figure}
\centering
\includegraphics[width=\linewidth]{
w12_slides_resources/Fig-RwlockProtocol 2023-12-06 19_05_06.pdf
}
\end{figure}
The \textit{T}-state indicates a transitionary state for some shared page.
\end{frame}
\begin{frame}
\frametitle{Consistency Model: TSO}
\begin{itemize}
\item {
Total Store Ordering allows Reads to bypass Stores.
}
\item {
Assuming correct use of node-local synchronization on all nodes,
applying TSO in a home-based DSM allows for:
\begin{itemize}
\item {
When another node tries to read T-page from access-control
node: W$\rightarrow$R violation.
}
\item {
When another node tries to read S-page from data-provider
nodes: W$\rightarrow$R violation (if e.g., the invalidation
message from access-control node was received afterwards).
}
\item {
Data-provider and access-control nodes work on one request
at a time: no R$\rightarrow$W violation.
}
\item {
Write-accesses serialized at access-control node: no
W$\rightarrow$W violation.
}
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Consistency Model: Strengthen to Sequential}
\begin{itemize}
\item {
By corollary, one can reverse the previous slide's statements to
strengthen to sequential consistency:
\begin{itemize}
\item {
Disallow T-pages from being serviced until new page content
is installed: lengthens critical section.
}
\item {
Abolish data-provider nodes: access-control nodes become
bottleneck.
}
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Coherence Protocol: Possible Features}
\begin{itemize}
\item {
Multi-data-provider Protocol: Instead of having one data provider,
have multiple data-provider nodes that are automatically written
back to prevent a network bottleneck.
\begin{itemize}
\item Data provider nodes may be dynamically assigned.
\item Extra metadata can limit scalability.
\end{itemize}
}
\item {
Auto-share: likewise, write back pages to non-data-provider nodes to
take advantage of 1-sided communications.
}
\item {
Request aggregation: aggregate RDMA transfers for optimal transfer
performance.
\begin{itemize}
\item Need to be coherent with program sequence!
\item Enables write-request merging.
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Stateful Nodes \& Transitions (Provisional)}
\begin{itemize}
\item {
Nodes (e.g., within the cluster) become tightly bound with the
properties of each shared page.
}
\end{itemize}
\begin{figure}
\centering
\includegraphics[width=\linewidth]{
w15_resources/截屏 2024-01-30 19.15.45 2024-01-30 19_16_19.png
}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{Stateful Nodes \& Transitions (Provisional) (Cont.)}
\begin{itemize}
\item {
MN (Manager Nodes): Provide access-control and (fallback)
data-provision.
}
\item {
HN (Home Nodes): Provide data-provision. Can be write-back or
write-invalidate.
}
\item {
SN (Sharer Nodes): Share data within a reader-only ``epoch''. Can be
write-back or write-invalidate.
}
\item {
NSN (Non-sharer Nodes): Nodes in the network that do not share the
particular page(s).
}
\item {
CN (Commit Node): Node that acquired the single-writer access to the
shared page.
}
\item {
Message variants are not finalized:
\begin{itemize}
\item {
Goal: Composable message chains that allow for
``piggy-backing'' of multiple procedures.
}
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Stateful Nodes: Transition Paths}
\begin{itemize}
\item {
Filled-line transitions indicate that the local node requests the
remote node to perform a state transition.
}
\item {
Dashed-line transitions indicate that the local node implicitly
transitions prior to sending the request to the remote node.
}
\item {
\textit{Non-committal} path concerns read-only and copy-on-write
sharing. Sharers cannot make global modifications to cached local
data.
}
\item {
\textit{Invalidation} path is paired with commit operations (due to
write-invalidation).
}
\item {
\textit{Committal} path concerns global write sharing. Only one
writer is allowed to write and commit at a time.
}
\item {
Problem: How exactly to integrate RDMA remote read/write into this?
}
\end{itemize}
\end{frame}
% Part 3: Progress
% =============================================================================
\section{3. Progress}
\begin{frame}
\frametitle{3. Progress}
\begin{itemize}
\item {
Goal: in-kernel implementation of software cache-coherency via
non-coherent RDMA hardware.
}
\item {
Optimistic Goal: in-kernel implementation of memory model in DSM.
}
\item {
Progress: studied and isolated mechanism for data cache
invalidation/flushing in ARM64, which allows the DSM to run in
heterogeneous ISA clusters.
}
\item {
Integration with kernel \& main DSM kernel module remains at hand:
is it absolutely necessary to export new symbols for such an
important operation?
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{On-demand Coherency in ARM64}
\begin{itemize}
\item {
ARMv8 defines two levels of cache coherence:
\begin{itemize}
\item {
\textit{Point-of-Unification}: Within a core, instruction
cache, data cache, and TLB all agree in the copy seen for a
particular address.
\begin{itemize}
\item Notably, changing PTE requires PoU.
\end{itemize}
}
\item {
\textit{Point-of-Coherence}: Between all DMA-capable
peripherals (CPU or otherwise), they all agree in the copy
seen for a particular address.
}
\end{itemize}
For this thesis's purposes, we strive for PoC.
}
\item {
Operations to achieve the latter are encapsulated in the Linux
kernel as \texttt{(d|i)cache\_(clean|inval)\_poc}.
\begin{itemize}
\item Declared under \texttt{arch/arm64/include/asm/cacheflush.h}.
\item Defined in \texttt{arch/arm64/mm/cache.S}.
\item {
Takes virtual address wrt. \textit{current} address space to
writeback/invalidate cache entries.
}
\item {
Problem: Can only be called in process context (for userspace
virtual addresses) or in all contexts
(for kernel virtual addresses)?
}
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Kernel Patch for On-demand Coherency}
\begin{itemize}
\item {
Problem: These symbols are not exported -- not intended for driver
use.
}
\item {
Temporary solution: re-export them via patching the kernel.
\begin{itemize}
\item Note: Kernel version v6.7.0
\item {
Longish-term solution: arrange kernel module code in a way
that takes advantage of existing driver API
(e.g., via DMA API, for example \textit{smbdirect}).
}
\end{itemize}
}
\item {
Implements wrapper function \texttt{\_\_dcache\_clean\_poc} to
re-export \texttt{dcache\_clean\_poc} into driver namespace.
}
\item {
Exports symbol into separate header file.
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Proof-of-Concept Kernel Module}
\begin{itemize}
\item {
Dynamically allocates \texttt{GFP\_USER} pages and remaps to
userspace on \texttt{mmap}.
\begin{itemize}
\item {
\texttt{GFP\_USER} so (for convenience) pages can be
directly addressable in kernelspace (via kernel page table).
}
\item {
Pages are lazily allocated and shared between multiple
processes (i.e., user address spaces).
}
\item {
Exposed as character device \texttt{/dev/my\_shmem}.
}
\end{itemize}
}
\item Around 300 LoC.
\item {
Problem: flawed premise for testing cache writeback!
\begin{itemize}
\item {
Summary: CPU datapath differs from DMA datapath, common cache
coherency maintenance operations are already performed
in common file/virtual memory area operation code.
}
\item {
Idea: perform cache write-back on \texttt{vm\_ops->close}.
}
\item {
Reality: virtual memory area already cleaned from cache and
removed from address space prior to calling
\texttt{vm\_ops->close}.
}
\item {
Fix: Implement custom \texttt{ioctl}?
}
\end{itemize}
}
\end{itemize}
\end{frame}
% Part 4: Future Work
% =============================================================================
\section{4. Future Work}
\begin{frame}
\frametitle{4. Future Work}
\begin{enumerate}
\item {
Incorporate cache coherence mechanism into the larger project.
}
\item {
Implement memory model within the larger project. This involves:
\begin{itemize}
\item {
Making adjustment to message type and structure specifications
for better inter-operation with RDMA.
}
\item {
Implement memory model programmatically.
}
\end{itemize}
}
\end{enumerate}
\end{frame}
% References
\begin{frame}
\frametitle{References}
\printbibliography
\end{frame}
\end{document}