diff --git a/tex/misc/viva_slides.pdf b/tex/misc/viva_slides.pdf
new file mode 100644
index 0000000..cc08372
Binary files /dev/null and b/tex/misc/viva_slides.pdf differ
diff --git a/tex/misc/viva_slides.tex b/tex/misc/viva_slides.tex
new file mode 100644
index 0000000..4d8ff38
--- /dev/null
+++ b/tex/misc/viva_slides.tex
@@ -0,0 +1,141 @@
+\documentclass{beamer}
+
+\title{Analysis of Software-Maintained Cache Coherency in ARMv8-A for Cross-Architectural DSM Systems: The Quintessential}
+\author{Zhengyi Chen}
+\date{\today}
+
+\begin{document}
+\frame{\titlepage}
+
+\begin{frame}
+  \frametitle{Why study this, specifically?}
+  \begin{itemize}
+    \item {
+      Amir Noohi (Prof. Barbalace's PhD student) is currently working on an in-kernel distributed shared memory (DSM) system implemented over RDMA.
+    }
+    \item {
+      We agree that the implementation and overhead of cache coherency are important considerations for making such a system cross-architectural (e.g., x86 vs. ARM64).
+    }
+    \item {
+      Unlike x86, ARM64 (as well as, e.g., RISC-V) does not guarantee hardware cache coherence:
+      \begin{itemize}
+        \item {
+          ARM's ``SystemReady'' program requires certified CPUs/SoCs to be cache coherent at the hardware level.
+        }
+        \item {
+          However, to my knowledge this is not the case across all ARM SoCs, especially as ARM64 PCs become more prevalent.
+        }
+      \end{itemize}
+    }
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Contributions}
+  In this paper, I:
+  \begin{itemize}
+    \item {
+      Identified and exposed the Linux kernel's cache coherence maintenance implementation for the ARM64 architecture (as used by, e.g., RDMA drivers);
+    }
+    \item {
+      Wrote a kernel module to benchmark the exposed mechanism on both virtualized and server setups.
+    }
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Cache Coherence in ARM64 \& Linux Kernel}
+  \begin{itemize}
+    \item {
+      ARMv8-A/R defines the \textit{Point of Coherency} (\textit{PoC}) as the point at which all observers of memory are guaranteed to see the same copy of a memory location.
+    }
+    \item {
+      In the Linux kernel (v6.7.0), this is implemented by the assembly routines \texttt{dcache\_[clean|inval]\_poc} in \textit{arch/arm64/mm/cache.S}.
+      \begin{itemize}
+        \item {
+          These are in turn called by the kernel's DMA API, e.g. \texttt{dma\_sync\_single\_for\_cpu}.
+        }
+      \end{itemize}
+    }
+    \item {
+      For testing purposes, these routines are exposed through a C function symbol wrapper so that they can be hooked by dynamic ftrace (sketched on a later slide).
+    }
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Experiment Setup: Kernel Module}
+  \begin{itemize}
+    \item {
+      A simple shared-memory kernel module was written to measure the latency of the exposed routines via ftrace/eBPF.
+    }
+    \item {
+      A character device is exposed to userspace with \texttt{mmap} support.
+      \begin{itemize}
+        \item {
+          The allocation size on the driver end can be adjusted dynamically, to test cache coherence latency over variable-sized contiguous allocations.
+        }
+        \item {
+          Userspace programs can then adjust the \texttt{mmap} size, to test cache coherence latency over non-contiguous allocations.
+        }
+      \end{itemize}
+    }
+    \item {
+      On \texttt{.close} (e.g., on \texttt{munmap} from userspace), allocations are flushed to the \textit{PoC} (sketched on a later slide)\dots
+      \begin{itemize}
+        \item As is the case for DMA memory, described earlier.
+      \end{itemize}
+    }
+  \end{itemize}
+\end{frame}
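+
+% Illustrative addition: a minimal sketch of the ftrace-visible wrapper.
+\begin{frame}[fragile]
+  \frametitle{Sketch: ftrace-Visible Wrapper (Illustrative)}
+  A minimal sketch of how such a wrapper might look; the wrapper name is
+  hypothetical, only \texttt{dcache\_clean\_poc} is the kernel's own symbol:
+\begin{verbatim}
+#include <linux/module.h>
+#include <asm/cacheflush.h>
+
+/* A noinline C function gives dynamic ftrace a patchable
+ * entry point, which the raw assembly routine lacks. */
+noinline void traced_dcache_clean_poc(unsigned long start,
+                                      unsigned long end)
+{
+        dcache_clean_poc(start, end);
+}
+EXPORT_SYMBOL_GPL(traced_dcache_clean_poc);
+\end{verbatim}
+  Tracing this symbol with ftrace/eBPF then yields per-call latency of the
+  underlying clean-to-\textit{PoC} operation.
+\end{frame}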
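+
+% Illustrative addition: flushing the shared buffer to the PoC on munmap.
+\begin{frame}[fragile]
+  \frametitle{Sketch: Flush to \textit{PoC} on \texttt{munmap} (Illustrative)}
+  A sketch of the flush-on-close idea; \texttt{shm\_buf}, \texttt{shm\_buf\_size}
+  and the function names are hypothetical, not the module's actual identifiers:
+\begin{verbatim}
+#include <linux/mm.h>
+#include <asm/cacheflush.h>
+
+static void *shm_buf;        /* contiguous backing buffer */
+static size_t shm_buf_size;  /* current allocation size   */
+
+static void shm_vma_close(struct vm_area_struct *vma)
+{
+        unsigned long start = (unsigned long)shm_buf;
+
+        /* Clean the buffer to the Point of Coherency, mirroring
+         * what the DMA API does for device-visible memory. */
+        dcache_clean_poc(start, start + shm_buf_size);
+}
+
+static const struct vm_operations_struct shm_vm_ops = {
+        .close = shm_vma_close,
+};
+\end{verbatim}
+\end{frame}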
+
+\begin{frame}
+  \frametitle{Experiment Setup: Testbench}
+  \begin{itemize}
+    \item {
+      Tests were conducted mostly on the QEMU \texttt{virt-8.0} platform on an x86 host.
+    }
+    \item {
+      Some tests were conducted on an \textit{Ampere Altra} server system\dots
+      \begin{itemize}
+        \item {
+          However, the \textit{Ampere Altra} is \textit{SystemReady}-certified -- i.e., it supports hardware-level cache coherence.
+        }
+        \item {
+          Latencies collected on this system may hence not be representative of platforms that rely on software-maintained coherence.
+        }
+      \end{itemize}
+    }
+    \item {
+      Ideally, a wider range of test setups should be explored beyond the contributions of this paper.
+    }
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Results: Summary}
+  \begin{itemize}
+    \item {
+      Constant allocation size, variable \texttt{mmap} size: no significant difference in per-contiguous-memory-area cache coherence latency.
+    }
+    \item {
+      Variable allocation size:
+      \begin{itemize}
+        \item {
+          Latency \textbf{does not} grow linearly with contiguous allocation size.
+        }
+        \item {
+          In general, latency remains within the same order of magnitude for up to $2^6$ contiguous pages.
+        }
+        \item {
+          For larger contiguous allocations, cache coherence latency may be amortized by the smaller number of allocations and page faults required (possibly implementation-specific).
+        }
+      \end{itemize}
+    }
+    \item {
+      In general, a hypothetical DSM system should prefer larger contiguous allocations (which seems logical).
+    }
+  \end{itemize}
+\end{frame}
+
+\end{document}
\ No newline at end of file