% unnamed_ba_thesis/tex/misc/w15_slides.tex
% NOTE: the following file-listing metadata is commented out because any
% non-comment text before \documentclass breaks compilation.
% 556 lines, No EOL, 18 KiB, TeX
\documentclass{beamer}
\usepackage[]{biblatex}
\usepackage[export]{adjustbox}
\title{
Cache Coherency \& Memory Model in RDMA-Backed Software-Coherent DSM
}
\author{Zhengyi Chen}
\date{\today}
\addbibresource{../main.bib}
\begin{document}
% Title Page
\frame{\titlepage}
% Table of Content
\begin{frame}
\frametitle{Table of Contents}
\tableofcontents
\end{frame}
% Part 1: Overview
% =============================================================================
\section{1. Overview}
% Page 1
\begin{frame}
\frametitle{1. Overview}
\begin{itemize}
\item {
DSM used to be constrained by NIC bandwidth \& transfer rate (e.g.,
during the 1990s).
}
\item {
The advent of high(er) transfer rate NICs allows the DSM idea to be
revived.
}
\item {
Orthogonally, hardware acceleration resources are scarce and highly
valuable.
\begin{itemize}
\item {
Traditional Scheduling Mechanisms within a Cluster cannot
dynamically allocate hardware accelerators without high
overhead.
}
\end{itemize}
}
\item {
Ideally, via high-speed NICs, hardware accelerators could be
statically allocated such that:
\begin{itemize}
\item {
Every node has access to the hardware accelerator node in a
time-shared fashion.
}
\item {
Accelerator-attached node can access remote memory much like
attaching accelerator over, say, PCIe.
}
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Heterogeneous Memory Management}
\begin{itemize}
\item {
\textbf{HMM} facilitates shared address space and transparent data
migration between CPU and peripherals. Specifically:
\begin{itemize}
\item {
HMM provides interface for duplicating the CPU page table
with that of the device's, which are transparently
synchronized.
}
\item {
It also provides corresponding \texttt{struct page}
representation of device memory pages, which are faulted
between the CPU and device.
}
\end{itemize}
}
\item {
Theoretically, this should allow for devices in remote nodes to
perform HMM using the DMA-capable NIC as a ``proxy HMM device''.
}
\item {
Details of implementation of DSM-over-HMM is beyond this thesis's
scope.
\begin{itemize}
\item {
This thesis focuses on studying and implementing cache
coherency and later, memory model for the DSM subsystem of
this wider, ongoing project.
}
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Cache Coherency, and Why It Matters Here}
\begin{itemize}
\item {
Cache-incoherent RDMA (e.g., mlx) performs DMA without
synchronization with CPU cache.
}
\item {
We cannot assume the MMU magically maintains coherence.
\begin{itemize}
\item {
This seems the case for x86\_64 (cache-coherent DMA), but
not ARM64.
}
\end{itemize}
}
\item {
At transportation time:
\begin{itemize}
\item {
Send to remote: flushes cache into memory before posting
send message.
}
\item {
Receive from remote: invalidates the cache entry after the
recv message has been processed.
}
\end{itemize}
}
\item {
Example: Linux kernel tree, \textit{smbdirect} implementation.
\begin{itemize}
\item {
\textit{smbdirect} opportunistically establishes SMB over an
RDMA-capable network.
}
\item {
\texttt{smbd\_post\_send} cleans cache entry prior to posting
send request.
}
\item {
\texttt{recv\_done} invalidates cache entry after exiting
softirq for recv request (as callback from RDMA driver).
}
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Consistency Model and Protocol}
\begin{itemize}
\item {
The majority of DSM literature applies \textbf{release consistency}
as the system's memory model.
}
\item {
With \textbf{single-writer} protocol, however, the memory model can
be strengthened with little increase in code complexity.
\begin{itemize}
\item {
\textit{DSPM}~\cite{shan2017distributed}, for example,
achieves a \textit{de-facto} TSO consistency from its
multi-writer release consistency counterpart -- assuming
correct memory barriers within each node's CPU, distributed
writes are never reordered, and distributed reads can
overtake writes.
}
\item {
Consequently, one can easily achieve sequential consistency
by designating the entire write-access duration as a critical
section.
}
\end{itemize}
}
\item {
HMM's ``CPU-or-device'' data migration model also strongly implies
a single-writer consistency protocol.
}
\end{itemize}
\end{frame}
% Part 2: Design
% =============================================================================
\section{2. Design}
\begin{frame}
\frametitle{2. Design}
\begin{itemize}
\item {
Designing a DSM necessitates designing:
\begin{itemize}
\item Consistency Model.
\item Coherence Protocol and State Machine.
\item Access Control.
\end{itemize}
}
\item {
Care needs to be taken to ensure that the in-kernel implementation
is:
\begin{itemize}
\item Correct,
\item Performant,
\item Exploits RDMA's traits.
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Protocol Excerpt: Write-Invalidate}
\begin{figure}
\centering
\includegraphics[width=\linewidth]{
w12_slides_resources/Fig-RwlockProtocol 2023-12-06 19_05_06.pdf
}
\end{figure}
The \textit{T}-state indicates a transitionary state for some shared page.
\end{frame}
\begin{frame}
\frametitle{Consistency Model: TSO}
\begin{itemize}
\item {
Total Store Ordering allows Reads to bypass Stores.
}
\item {
Assuming correct use of node-local synchronization on all nodes,
applying TSO in a home-based DSM allows for:
\begin{itemize}
\item {
When another node tries to read T-page from access-control
node: W$\rightarrow$R violation.
}
\item {
When another node tries to read S-page from data-provider
nodes: W$\rightarrow$R violation (if e.g., the invalidation
message from access-control node was received afterwards).
}
\item {
Data-provider and access-control nodes work on one request
at a time: no R$\rightarrow$W violation.
}
\item {
Write-accesses serialized at access-control node: no
W$\rightarrow$W violation.
}
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Consistency Model: Strengthen to Sequential}
\begin{itemize}
\item {
By corollary, one can reverse the previous slide's statements to
strengthen to sequential consistency:
\begin{itemize}
\item {
Disallow T-pages from being serviced until new page content
is installed: lengthens critical section.
}
\item {
Abolish data-provider nodes: access-control nodes become
bottleneck.
}
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Coherence Protocol: Possible Features}
\begin{itemize}
\item {
Multi-data-provider Protocol: Instead of having one data provider,
have multiple data-provider nodes that are automatically written
back to prevent a network bottleneck.
\begin{itemize}
\item Data provider nodes may be dynamically assigned.
\item Extra metadata can limit scalability.
\end{itemize}
}
\item {
Auto-share: likewise, write back pages to non-data-provider nodes to
take advantage of 1-sided communications.
}
\item {
Request aggregation: aggregate RDMA transfers for optimal transfer
performance.
\begin{itemize}
\item Need to be coherent with program sequence!
\item Enables write-request merging.
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Stateful Nodes \& Transitions (Provisional)}
\begin{itemize}
\item {
Nodes (e.g., within the cluster) become tightly bound with the
properties of each shared page.
}
\end{itemize}
\begin{figure}
\centering
\includegraphics[width=\linewidth]{
w15_resources/截屏 2024-01-30 19.15.45 2024-01-30 19_16_19.png
}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{Stateful Nodes \& Transitions (Provisional) (Cont.)}
\begin{itemize}
\item {
MN (Manager Nodes): Provide access-control and (fallback)
data-provision.
}
\item {
HN (Home Nodes): Provide data-provision. Can be write-back or
write-invalidate.
}
\item {
SN (Sharer Nodes): Share data within a reader-only ``epoch''. Can be
write-back or write-invalidate.
}
\item {
NSN (Non-sharer Nodes): Nodes in the network that do not share the
particular page(s).
}
\item {
CN (Commit Node): Node that acquired the single-writer access to the
shared page.
}
\item {
Message variants are not finalized:
\begin{itemize}
\item {
Goal: Composable message chains that allow for
``piggy-backing'' of multiple procedures.
}
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Stateful Nodes: Transition Paths}
\begin{itemize}
\item {
Filled-line transitions indicate that the local node requests the
remote node to perform a state transition.
}
\item {
Dashed-line transitions indicate that the local node implicitly
transitions prior to sending the request to the remote node.
}
\item {
\textit{Non-committal} path concerns read-only and copy-on-write
sharing. Sharers cannot make global modifications to cached local
data.
}
\item {
\textit{Invalidation} path is paired with commit operations (due to
write-invalidation).
}
\item {
\textit{Committal} path concerns global write sharing. Only one
writer is allowed to write and commit at a time.
}
\item {
Problem: How exactly to integrate RDMA remote read/write into this?
}
\end{itemize}
\end{frame}
% Part 3: Progress
% =============================================================================
\section{3. Progress}
\begin{frame}
\frametitle{3. Progress}
\begin{itemize}
\item {
Goal: in-kernel implementation of software cache-coherency via
non-coherent RDMA hardware.
}
\item {
Optimistic Goal: in-kernel implementation of memory model in DSM.
}
\item {
Progress: studied and isolated mechanism for data cache
invalidation/flushing in ARM64, which allows the DSM to run in
heterogeneous ISA clusters.
}
\item {
Integration with kernel \& main DSM kernel module remains at hand:
is it absolutely necessary to export new symbols for such an
important operation?
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{On-demand Coherency in ARM64}
\begin{itemize}
\item {
ARMv8 defines two levels of cache coherence:
\begin{itemize}
\item {
\textit{Point-of-Unification}: Within a core, instruction
cache, data cache, and TLB all agree in the copy seen for a
particular address.
\begin{itemize}
\item Notably, changing PTE requires PoU.
\end{itemize}
}
\item {
\textit{Point-of-Coherence}: Between all DMA-capable
peripherals (CPU or otherwise), they all agree in the copy
seen for a particular address.
}
\end{itemize}
For this thesis's purposes, we strive for PoC.
}
\item {
Operations to achieve the latter are encapsulated in the Linux
kernel as \texttt{(d|i)cache\_(clean|inval)\_poc}.
\begin{itemize}
\item Declared under \texttt{arch/arm64/include/asm/cacheflush.h}.
\item Defined in \texttt{arch/arm64/mm/cache.S}.
\item {
Takes virtual address wrt. \textit{current} address space to
writeback/invalidate cache entries.
}
\item {
Problem: Can only be called in process context (for userspace
virtual addresses) or in all contexts
(for kernel virtual addresses)?
}
\end{itemize}
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Kernel Patch for On-demand Coherency}
\begin{itemize}
\item {
Problem: These symbols are not exported -- not intended for driver
use.
}
\item {
Temporary solution: re-export them via patching the kernel.
\begin{itemize}
\item Note: Kernel version v6.7.0
\item {
Longish-term solution: arrange kernel module code in a way
that takes advantage of existing driver API
(e.g., via DMA API, for example \textit{smbdirect}).
}
\end{itemize}
}
\item {
Implements wrapper function \texttt{\_\_dcache\_clean\_poc} to
re-export \texttt{dcache\_clean\_poc} into driver namespace.
}
\item {
Exports symbol into separate header file.
}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Proof-of-Concept Kernel Module}
\begin{itemize}
\item {
Dynamically allocates \texttt{GFP\_USER} pages and remaps to
userspace on \texttt{mmap}.
\begin{itemize}
\item {
\texttt{GFP\_USER} so (for convenience) pages can be
directly addressable in kernelspace (via kernel page table).
}
\item {
Pages are lazily allocated and shared between multiple
processes (i.e., user address spaces).
}
\item {
Exposed as character device \texttt{/dev/my\_shmem}.
}
\end{itemize}
}
\item Around 300 LoC.
\item {
Problem: flawed premise for testing cache writeback!
\begin{itemize}
\item {
Summary: CPU datapath differs from DMA datapath, common cache
coherency maintenance operations are already performed
in common file/virtual memory area operation code.
}
\item {
Idea: perform cache write-back on \texttt{vm\_ops->close}.
}
\item {
Reality: virtual memory area already cleaned from cache and
removed from address space prior to calling
\texttt{vm\_ops->close}.
}
\item {
Fix: Implement custom \texttt{ioctl}?
}
\end{itemize}
}
\end{itemize}
\end{frame}
% Part 4: Future Work
% =============================================================================
\section{4. Future Work}
\begin{frame}
\frametitle{4. Future Work}
\begin{enumerate}
\item {
Incorporate cache coherence mechanism into the larger project.
}
\item {
Implement memory model within the larger project. This involves:
\begin{itemize}
\item {
Making adjustment to message type and structure specifications
for better inter-operation with RDMA.
}
\item {
Implement memory model programmatically.
}
\end{itemize}
}
\end{enumerate}
\end{frame}
% References
\begin{frame}
\frametitle{References}
\printbibliography
\end{frame}
\end{document}