Zhengyi Chen 2024-03-16 23:42:43 +00:00
parent 0f49bbabcd
commit 6bed890643
3 changed files with 616 additions and 35 deletions


@@ -10,7 +10,7 @@
% you should catch most accidental changes of page layout though.
\usepackage{microtype} % recommended, but you can remove if it causes problems
% \usepackage{natbib} % recommended for citations % but I have no experience with natbib...
\usepackage[utf8]{inputenc}
\usepackage[dvipsnames]{xcolor}
\usepackage{hyperref}
@@ -18,7 +18,7 @@
\usepackage{graphicx}
\usepackage[english]{babel}
% -> biblatex
\usepackage{biblatex}
\addbibresource{mybibfile.bib}
% <- biblatex
% -> nice definition listings
@@ -30,8 +30,12 @@
% -> code listing
% [!] Requires external program: pypi:Pygments
\usepackage{minted}
\usemintedstyle{xcode}
\definecolor{code-bg}{rgb}{0.98, 0.98, 0.99}
% <- code listing
% -> draw textbook-style frames
\usepackage{mdframed}
% <- frames
\begin{document}
\begin{preliminary}
@@ -64,12 +68,7 @@
\date{\today}
\abstract{
\textcolor{red}{To be done\dots}
}
\maketitle
@@ -339,13 +338,14 @@ Using these definitions, a vendor could build \textit{heterogeneous} and \textit
\end{definition}
\subsection{ARMv8-A Software Cache Coherence in Linux Kernel}
\label{subsec:armv8a-swcoherency}
Because hardware DMA coherency is not guaranteed (though such support exists \cite{Parris.AMBA_4_ACE-Lite.2013}), programmers need to invoke architecture-specific cache-coherency instructions, often encapsulated in problem-specific subroutines, when porting DMA hardware support across a diverse range of ARMv8 microarchitectures.
Notably, kernel (driver) programming demands attention to software-maintained coherency, since downstream userspace programmers expect data flow interspersed between CPU and DMA operations to follow program ordering and (driver vendor) specifications. One such example arises in the Linux kernel implementation of the DMA memory-management API \cite{Miller_Henderson_Jelinek.Kernelv6.7-DMA_guide.2024}\footnote[1]{Based on Linux kernel v6.7.0.}:
\begin{definition}[DMA Mappings]
The Linux kernel DMA memory allocation API, imported via
\begin{minted}[linenos, bgcolor=code-bg]{c}
#include <linux/dma-mapping.h>
\end{minted}
defines two variants of DMA mappings:
@@ -370,7 +370,7 @@ Notably, kernel (driver) programming warrants programmer attention to software-m
Consistent DMA mappings can be created trivially by allocating non-cacheable memory, which guarantees visibility at the \textit{PoC} for all memory observers (though system-specific fastpaths exist).
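For illustration, a consistent mapping might be obtained roughly as follows; this is a hypothetical driver fragment (not taken from the kernel sources), assuming \texttt{dev} points to the DMA master's \texttt{struct device}:
\begin{minted}[linenos, bgcolor=code-bg]{c}
/* Hypothetical driver fragment: allocate and release a consistent
 * (coherent) DMA buffer. `dev' is the DMA master's struct device. */
#include <linux/dma-mapping.h>

dma_addr_t bus_addr; // buffer address as seen by the device
void *cpu_addr = dma_alloc_coherent(
    dev,          // struct device * of the DMA master
    PAGE_SIZE,    // allocation size in bytes
    &bus_addr,    // out-parameter: device-visible address
    GFP_KERNEL);  // allocation context
/* CPU and device may now share the buffer without explicit syncs. */
dma_free_coherent(dev, PAGE_SIZE, cpu_addr, bus_addr);
\end{minted}
Because the returned memory is kept coherent (typically by mapping it non-cacheable), no \texttt{dma\_sync\_*} calls are needed for it.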
On the other hand, streaming DMA mappings require manual synchronization around programmed CPU/DMA accesses. Take single-buffer synchronization on the CPU side after a DMA access as an example:
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
/* In kernel/dma/mapping.c $\label{code:dma_sync_single_for_cpu}$*/
void dma_sync_single_for_cpu(
struct device *dev, // kernel repr for DMA device
@@ -386,11 +386,11 @@ void dma_sync_single_for_cpu(
arch_sync_dma_for_cpu_all(); // MIPS quirks...
}
/* Miscellaneous cases... */
}
\end{minted}
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
/* In arch/arm64/mm/dma-mapping.c */
void arch_sync_dma_for_cpu(
phys_addr_t paddr,
@@ -411,7 +411,7 @@ void arch_sync_dma_for_cpu(
This call chain and its mirror case, which maintains cache coherency for the DMA device after CPU access: \mint[breaklines=true]{c}|dma_sync_single_for_device(struct device *, dma_addr_t, size_t, enum dma_data_direction)|, call into the following procedures, respectively:
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
/* Exported @ arch/arm64/include/asm/cacheflush.h */
/* Defined @ arch/arm64/mm/cache.S */
/* All functions accept virtual start, end addresses. */
@@ -425,7 +425,7 @@ extern void dcache_inval_poc(
unsigned long start, unsigned long end
);
/* Clean data cache region [start, end) to PoC. $\label{code:dcache_clean_poc}$
*
* Write-back CPU cache entries that intersect with [start, end),
* such that data from CPU becomes visible to external writers.
@@ -438,7 +438,7 @@ extern void dcache_clean_poc(
\subsubsection{Addendum: \texttt{enum dma\_data\_direction}}
The Linux kernel defines four direction \texttt{enum} values for fine-tuning synchronization behaviors:
\begin{minted}[linenos, bgcolor=code-bg]{c}
/* In include/linux/dma-direction.h */
enum dma_data_direction {
DMA_BIDIRECTIONAL = 0, // data transfer direction uncertain.
@@ -452,25 +452,21 @@ These values allow for certain fast-paths to be taken at runtime. For example, \
% TODO: Move to addendum section.
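As a hedged illustration of how the direction hint is used (a hypothetical driver fragment, not taken from any specific driver), a receive-only streaming buffer might be handled as follows, assuming \texttt{dev}, \texttt{buf}, and \texttt{len} are already set up:
\begin{minted}[linenos, bgcolor=code-bg]{c}
/* Hypothetical fragment: streaming mapping of a receive-only buffer.
 * Declaring DMA_FROM_DEVICE tells the DMA core that the CPU produces no
 * data for the device, so unnecessary maintenance work can be skipped. */
dma_addr_t handle = dma_map_single(dev, buf, len, DMA_FROM_DEVICE);
if (dma_mapping_error(dev, handle))
    return -ENOMEM;
/* ... device DMAs data into the buffer ... */
dma_sync_single_for_cpu(dev, handle, len, DMA_FROM_DEVICE); // make DMA-ed data visible to CPU
/* ... CPU reads the freshly DMA-ed data ... */
dma_unmap_single(dev, handle, len, DMA_FROM_DEVICE);
\end{minted}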
\subsubsection{Use-case: Kernel-space \textit{SMBDirect} Driver}
An example of a cache-coherent, in-kernel RDMA networking module spanning heterogeneous ISAs can be found in the Linux implementation of \textit{SMBDirect}. \textit{SMBDirect} is an extension of the \textit{SMB} (\textit{Server Message Block}) protocol for opportunistically establishing the communication protocol over RDMA-capable network interfaces \cite{many.MSFTLearn-SMBDirect.2024}.
We focus on two procedures inside the in-kernel SMBDirect implementation:
\paragraph{Before send: \texttt{smbd\_post\_send}}
\texttt{smbd\_post\_send} lies downstream of the call chain of \texttt{smbd\_send}, which sends SMBDirect payloads for transport over the network. Payloads are constructed and batched to maximize bandwidth, then \texttt{smbd\_post\_send} is called to signal the RDMA NIC for transport.
The function body is roughly as follows:
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
/* In fs/smb/client/smbdirect.c */
static int smbd_post_send(
struct smbd_connection *info, // SMBDirect transport context
struct smbd_request *request // SMBDirect request context
) {
struct ib_send_wr send_wr; // IB send "Work Request" for payload
int rc, i;
/* For each message in batched payload */
@@ -503,18 +499,16 @@ The function body is roughly as follows:
Line \ref{code:ib_dma_sync_single_for_device} writes back CPU cache lines so that they become visible to the RDMA NIC, in preparation for the DMA operations performed when the posted \textit{send request} is worked upon.
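The labelled line itself falls outside the excerpt above; in rough shape it resembles the following sketch (the \texttt{struct} field paths are illustrative, not verbatim from the kernel):
\begin{minted}[linenos, bgcolor=code-bg]{c}
/* Illustrative shape only: write back the CPU cache lines backing one
 * scatter/gather element before the NIC DMAs it out. */
ib_dma_sync_single_for_device(
    info->id->device,     // struct ib_device of the RDMA NIC (assumed field path)
    request->sge[i].addr, // DMA address of the i-th SGE (assumed field path)
    request->sge[i].length,
    DMA_TO_DEVICE);
\end{minted}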
\paragraph{Upon reception: \texttt{recv\_done}}
\texttt{recv\_done} is called when the RDMA subsystem completes work on a payload received over RDMA.
Mirroring the case for \texttt{smbd\_post\_send}, it invalidates CPU cache lines so that DMA-ed data becomes visible to CPU cores before any operations on the received data:
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
/* In fs/smb/client/smbdirect.c */
static void recv_done(
struct ib_cq *cq, // "Completion Queue"
struct ib_wc *wc // "Work Completion"
) {
struct smbd_data_transfer *data_transfer;
struct smbd_response *response = container_of(
wc->wr_cqe, // ptr: pointer to member
@@ -539,9 +533,83 @@ Called when the RDMA subsystem works on the received payload over RDMA. Mirrorin
\end{minted}
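The invalidation it performs mirrors the write-back in \texttt{smbd\_post\_send}; a hedged sketch of its shape (field paths illustrative, not verbatim from the kernel):
\begin{minted}[linenos, bgcolor=code-bg]{c}
/* Illustrative shape only: invalidate the CPU cache lines backing the
 * receive buffer so the DMA-ed payload becomes visible to the CPU. */
ib_dma_sync_single_for_cpu(
    response->info->id->device, // struct ib_device of the RDMA NIC (assumed field path)
    response->sge.addr,         // DMA address of the receive buffer (assumed field path)
    response->sge.length,
    DMA_FROM_DEVICE);
\end{minted}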
\chapter{Software Coherency Latency}
Coherency must be maintained at the software level when hardware cache coherency cannot be guaranteed for a specific ISA (as established in subsection \ref{subsec:armv8a-swcoherency}). There is, therefore, interest in knowing the latency of coherence-maintenance operations for performance-engineering purposes, for example OS-jitter analysis for scientific computing in heterogeneous clusters and, more pertinently, comparative analysis between software- and hardware-backed DSM systems (e.g. \cites{Masouros_etal.Adrias.2023}{Wang_etal.Concordia.2021}).
The purpose of this chapter is hence to provide a statistical analysis of software coherency latency in ARM64 systems by instrumenting hypothetical scenarios of software-initiated coherence maintenance on ARM64 test benches; a minimal sketch of such a measurement follows the chapter outline below.
The rest of the chapter is structured as follows:
\begin{itemize}
\item {
\hyperref[sec:sw-coherency-setup]{\textbf{Experiment Setup}} covers the test-benches used for instrumentation, including the kernel version, distribution, and the specifications of the instrumented (bare-metal/virtual) machine.
}
\item {
\hyperref[sec:sw-coherency-method]{\textbf{Methodology}} covers the instrumentation and experimental workload, including changes made to the kernel, the kernel module itself, and the userspace programs used for experimentation.
}
\item {
\hyperref[sec:sw-coherency-results]{\textbf{Results}} covers the results gathered during instrumentation from various test-benches, segmented by experiment.
}
\item {
\hyperref[sec:sw-coherency-discuss]{\textbf{Discussion}} identifies key insights from the experimental results, as well as deficiencies in the research method and possible directions for future work.
}
\end{itemize}
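To make the intended measurement concrete before the setup details, here is a minimal, purely hypothetical sketch of the kind of in-kernel timing such instrumentation might perform. It assumes \texttt{dcache\_clean\_poc} has been exported to modules (stock kernels do not export it), and it is not the \texttt{my\_shmem} module described later:
\begin{minted}[linenos, bgcolor=code-bg]{c}
#include <linux/ktime.h>
#include <asm/cacheflush.h> /* dcache_clean_poc(); assumed exported to modules */

/* Hypothetical helper: time one clean-to-PoC pass over [start, end). */
static u64 time_dcache_clean_poc(unsigned long start, unsigned long end)
{
    u64 t0, t1;

    t0 = ktime_get_ns();          /* monotonic timestamp, nanoseconds */
    dcache_clean_poc(start, end); /* write back intersecting lines to PoC */
    t1 = ktime_get_ns();

    return t1 - t0;
}
\end{minted}
Repeating such a call over varying region sizes is one way to obtain latency distributions of the kind this chapter analyses.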
\section{Experiment Setup}\label{sec:sw-coherency-setup}
\subsection{QEMU-over-x86: \texttt{star}}
The primary source of experimental data comes from a virtualized machine: a QEMU guest running a lightly customized Linux v6.7.0 preemptive kernel, with a standard non-graphical Debian 12 distribution installed to provide userspace support. The specifications of this QEMU-emulated ARM64 test bench, running atop an x86-64 host PC, are given in Table \ref{table:2}.
\begin{table}[h]
\centering
\begin{tabular}{|c|c|}
\hline
Processors & 3x QEMU virt-8.2 (2-way SMT; emulates Cortex-A76) \\
\hline
CPU Flags &
\begin{tabular}{@{}cccccc@{}}
% 1 2 3 4 5 6
fp & asimd & evtstrm & aes & pmull & sha1 \\
sha2 & crc32 & atomics & fphp & asimdhp & cpuid \\
asimdrdm & lrcpc & dcpop & asimddp & & \\
\end{tabular} \\
\hline
NUMA Nodes & 1: $\{P_0, \dots, P_5\}$ \\
\hline
Memory & 4GiB \\
\hline
\end{tabular}
\caption{Specification of \texttt{star}}
\label{table:2}
\end{table}
\begin{table}[h]
\centering
\begin{tabular}{|c|c|}
\hline
\end{tabular}
\caption{Specification of Host}
\label{table:3}
\end{table}
\subsection{\textit{Neoverse N1}: \texttt{rose}}
% - QEMU-over-x86; preemptive-on-preemptive
% - Native server-ready ARM64 (preemptive), which I didn't run for long ngl
\section{Methodology}\label{sec:sw-coherency-method}
\subsection{Exporting \texttt{dcache\_clean\_poc}}
\subsection{Kernel Module: \texttt{my\_shmem}}
\subsection{Instrumentation: \texttt{ftrace} and \textit{eBPF}}
\subsection{Userspace Programs}
\section{Results}\label{sec:sw-coherency-results}
\subsection{Controlled Allocation Size; Variable Page Count}
\subsection{Controlled Page Count; Variable Allocation Size}
\section{Discussion}\label{sec:sw-coherency-discuss}
% - you should also measure the access latency after coherency operation, though this is impl-specific (e.g., one vendor can have a simple PoC mechanism where e.g. you have a shared L2-cache that is snooped by DMA engine, hence flush to L2-cache and call it a day for PoC; but another can just as well call main mem the PoC, dep. on impl.)
\chapter{DSM System Design}
\chapter{Summary}
% \bibliographystyle{plain}
% \bibliographystyle{plainnat}
% \bibliography{mybibfile}