preparing for feb 1st
This commit is contained in:
parent
6c5b3626e1
commit
169382407d
2 changed files with 270 additions and 0 deletions
BIN
tex/misc/w15_slides.pdf
Normal file
BIN
tex/misc/w15_slides.pdf
Normal file
Binary file not shown.
270
tex/misc/w15_slides.tex
Normal file
270
tex/misc/w15_slides.tex
Normal file
|
|
@ -0,0 +1,270 @@
|
|||
\documentclass{beamer}
% Bibliography via biblatex (resource added below, printed at end of document).
\usepackage{biblatex}
\usepackage[export]{adjustbox}

\title{
    Cache Coherency \& Memory Model in RDMA-Backed Software-Coherent DSM
}
\author{Zhengyi Chen}
\date{\today}

\addbibresource{../main.bib}
\begin{document}

% Title page.
\frame{\titlepage}

% Table of contents, built from the \section commands below.
\begin{frame}
    \frametitle{Table of Contents}
    \tableofcontents
\end{frame}
% Part 1: Overview
% =============================================================================
\section{1. Overview}

% Page 1
\begin{frame}
    \frametitle{1. Overview}
    \begin{itemize}
        \item {
            DSM used to be constrained by NIC bandwidth \& transfer rate
            (e.g., during the 1990s).
        }
        \item {
            The advent of high(er) transfer rate NICs allows the DSM idea
            to be revived.
        }
        \item {
            Orthogonally, hardware acceleration resources are scarce and
            highly valuable.
            \begin{itemize}
                \item {
                    Traditional scheduling mechanisms within a cluster
                    cannot dynamically allocate hardware accelerators
                    without high overhead.
                }
            \end{itemize}
        }
        \item {
            Ideally, via high-speed NICs, hardware accelerators could be
            statically allocated such that:
            \begin{itemize}
                \item {
                    Every node has access to the hardware accelerator node
                    in a time-shared fashion.
                }
                \item {
                    The accelerator-attached node can access remote memory
                    much like attaching an accelerator over, say, PCIe.
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}
\begin{frame}
    \frametitle{Heterogeneous Memory Management}
    \begin{itemize}
        \item {
            \textbf{HMM} facilitates shared address space and transparent
            data migration between CPU and peripherals. Specifically:
            \begin{itemize}
                \item {
                    HMM provides an interface for duplicating the CPU page
                    table with that of the device's, which are transparently
                    synchronized.
                }
                \item {
                    It also provides corresponding \texttt{struct page}
                    representations of device memory pages, which are
                    faulted between the CPU and device.
                }
            \end{itemize}
        }
        \item {
            Theoretically, this should allow for devices in remote nodes to
            perform HMM using the DMA-capable NIC as a ``proxy HMM device''.
        }
        \item {
            The implementation details of DSM-over-HMM are beyond this
            thesis's scope.
            \begin{itemize}
                \item {
                    This thesis focuses on studying and implementing cache
                    coherency and later, memory model for the DSM subsystem
                    of this wider, ongoing project.
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}
\begin{frame}
    \frametitle{Cache Coherency, and Why It Matters Here}
    \begin{itemize}
        \item {
            Cache-incoherent RDMA (e.g., mlx) performs DMA without
            synchronization with the CPU cache.
        }
        \item {
            We cannot assume the MMU will magically maintain coherence.
        }
        \item {
            At transportation time:
            \begin{itemize}
                \item {
                    Send to remote: flush the cache into memory before
                    posting the send message.
                }
                \item {
                    Receive from remote: invalidate the cache entries after
                    the recv message has been worked.
                }
            \end{itemize}
        }
        \item {
            Example: Linux kernel tree, \textit{smbdirect} implementation.
            \begin{itemize}
                \item {
                    \textit{smbdirect} opportunistically establishes SMB
                    over an RDMA-capable network.
                }
                \item {
                    \texttt{smbd\_post\_send} cleans cache entries prior to
                    posting a send request.
                }
                \item {
                    \texttt{recv\_done} invalidates cache entries after
                    exiting the softirq for a recv request (as a callback
                    from the RDMA driver).
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}
\begin{frame}
    \frametitle{Consistency Model and Protocol}
    \begin{itemize}
        \item {
            The majority of the DSM literature applies \textbf{release
            consistency} as the system's memory model.
        }
        \item {
            With a \textbf{single-writer} protocol, however, the memory
            model can be strengthened with little increase in code
            complexity.
            \begin{itemize}
                \item {
                    \textit{DSPM}~\cite{shan2017distributed}, for example,
                    achieves a \textit{de-facto} TSO consistency from its
                    multi-writer release consistency counterpart ---
                    assuming correct memory barriers within each node's
                    CPU, distributed writes are never reordered, and
                    distributed reads can overtake writes.
                }
                \item {
                    Consequently, one can easily achieve sequential
                    consistency by designating the entire write-access
                    duration as a critical section.
                }
            \end{itemize}
        }
        \item {
            HMM's ``CPU-or-device'' data migration model also strongly
            implies a single-writer consistency protocol.
        }
    \end{itemize}
\end{frame}
% Part 2: Design
% =============================================================================
\section{2. Design}

\begin{frame}
    \frametitle{2. Design}
    \begin{itemize}
        \item {
            Designing a DSM necessitates designing:
            \begin{itemize}
                \item Consistency Model.
                \item Coherence Protocol and State Machine.
                \item Access Control.
            \end{itemize}
        }
        \item {
            Care needs to be taken to ensure that the in-kernel
            implementation is:
            \begin{itemize}
                \item Correct,
                \item Performant,
                \item Exploits RDMA's traits.
            \end{itemize}
        }
    \end{itemize}
\end{frame}
% Placeholder frames for the design discussion; bodies to be written.
\begin{frame}
    \frametitle{Consistency Model}
    % TODO: content pending.
\end{frame}

\begin{frame}
    \frametitle{Coherence Protocol}
    % TODO: content pending.
\end{frame}

\begin{frame}
    \frametitle{Stateful Nodes}
    % TODO: content pending.
\end{frame}
% Part 3: Progress
% =============================================================================
\section{3. Progress}

\begin{frame}
    \frametitle{Progress}
    % TODO: content pending.
\end{frame}
\begin{frame}
    \frametitle{On-demand Coherency in ARM64}
    \begin{itemize}
        \item {
            ARMv8 defines two levels of cache coherence:
            \begin{itemize}
                \item {
                    \textit{Point-of-Unification}:
                    % TODO: definition pending.
                }
                \item {
                    \textit{Point-of-Coherence}:
                    % TODO: definition pending.
                }
            \end{itemize}
        }
    \end{itemize}
\end{frame}
% Placeholder frames for the progress discussion; bodies to be written.
\begin{frame}
    \frametitle{Kernel Patch for On-demand Coherency}
    % TODO: content pending.
\end{frame}

\begin{frame}
    \frametitle{Proof-of-Concept Kernel Module}
    % TODO: content pending.
\end{frame}
% References
% biblatex is loaded and \cite is used above, but the bibliography was never
% emitted; print it here so cited entries actually appear in the slides.
\begin{frame}[allowframebreaks]
    \frametitle{References}
    \printbibliography
\end{frame}

\end{document}
|
||||
Loading…
Add table
Add a link
Reference in a new issue