..
This commit is contained in:
parent
0f49bbabcd
commit
6bed890643
3 changed files with 616 additions and 35 deletions
Binary file not shown.
|
|
@ -10,7 +10,7 @@
|
|||
% you should catch most accidental changes of page layout though.
|
||||
|
||||
\usepackage{microtype} % recommended, but you can remove if it causes problems
|
||||
% \usepackage{natbib} % recommended for citations
|
||||
% \usepackage{natbib} % recommended for citations % but I have no experience with natbib...
|
||||
\usepackage[utf8]{inputenc}
|
||||
\usepackage[dvipsnames]{xcolor}
|
||||
\usepackage{hyperref}
|
||||
|
|
@ -18,7 +18,7 @@
|
|||
\usepackage{graphicx}
|
||||
\usepackage[english]{babel}
|
||||
% -> biblatex
|
||||
\usepackage{biblatex} % full of mischief
|
||||
\usepackage{biblatex}
|
||||
\addbibresource{mybibfile.bib}
|
||||
% <- biblatex
|
||||
% -> nice definition listings
|
||||
|
|
@ -30,8 +30,12 @@
|
|||
% -> code listing
|
||||
% [!] Requires external program: pypi:Pygments
|
||||
\usepackage{minted}
|
||||
\usemintedstyle{vs}
|
||||
\usemintedstyle{xcode}
|
||||
\definecolor{code-bg}{rgb}{0.98, 0.98, 0.99}
|
||||
% <- code listing
|
||||
% -> draw textbook-style frames
|
||||
\usepackage{mdframed}
|
||||
% <- frames
|
||||
|
||||
\begin{document}
|
||||
\begin{preliminary}
|
||||
|
|
@ -64,12 +68,7 @@
|
|||
\date{\today}
|
||||
|
||||
\abstract{
|
||||
This skeleton demonstrates how to use the \texttt{infthesis} style for
|
||||
undergraduate dissertations in the School of Informatics. It also emphasises the
|
||||
page limit, and that you must not deviate from the required style.
|
||||
The file \texttt{skeleton.tex} generates this document and should be used as a
|
||||
starting point for your thesis. Replace this abstract text with a concise
|
||||
summary of your report.
|
||||
\textcolor{red}{To be done\dots}
|
||||
}
|
||||
|
||||
\maketitle
|
||||
|
|
@ -339,13 +338,14 @@ Using these definitions, a vendor could build \textit{heterogeneous} and \textit
|
|||
\end{definition}
|
||||
|
||||
\subsection{ARMv8-A Software Cache Coherence in Linux Kernel}
|
||||
\label{subsec:armv8a-swcoherency}
|
||||
Because hardware DMA coherency is not guaranteed (though such support exists \cite{Parris.AMBA_4_ACE-Lite.2013}), programmers need to invoke architecture-specific cache-coherency instructions, often encapsulated in problem-specific subroutines, when porting DMA hardware support across a diverse range of ARMv8 microarchitectures.
|
||||
|
||||
Notably, kernel (driver) programming warrants programmer attention to software-maintained coherency when userspace programmers downstream expect data-flow, interspersed between CPU and DMA operations, to follow program ordering and (driver vendor) specifications. One such example arises in the Linux kernel implementation of DMA memory management API \cite{Miller_Henderson_Jelinek.Kernelv6.7-DMA_guide.2024}\footnote[1]{Based on Linux kernel v6.7.0.}:
|
||||
|
||||
\begin{definition}[DMA Mappings]
|
||||
The Linux kernel DMA memory allocation API, imported via
|
||||
\begin{minted}[linenos]{c}
|
||||
\begin{minted}[linenos, bgcolor=code-bg]{c}
|
||||
#include <linux/dma-mapping.h>
|
||||
\end{minted}
|
||||
defines two variants of DMA mappings:
|
||||
|
|
@ -370,7 +370,7 @@ Notably, kernel (driver) programming warrants programmer attention to software-m
|
|||
Consistent DMA mappings can be trivially created by allocating non-cacheable memory, which guarantees \textit{PoC} for all memory observers (though system-specific fastpaths exist).
|
||||
|
||||
On the other hand, streaming DMA mappings require manual synchronization upon programmed CPU/DMA access. Take single-buffer synchronization on CPU after DMA access for example:
|
||||
\begin{minted}[linenos, mathescape]{c}
|
||||
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
|
||||
/* In kernel/dma/mapping.c $\label{code:dma_sync_single_for_cpu}$*/
|
||||
void dma_sync_single_for_cpu(
|
||||
struct device *dev, // kernel repr for DMA device
|
||||
|
|
@ -386,11 +386,11 @@ void dma_sync_single_for_cpu(
|
|||
arch_sync_dma_for_cpu_all(); // MIPS quirks...
|
||||
}
|
||||
|
||||
/* Miscellaneous cases... */
|
||||
/* Miscellaneous cases...*/
|
||||
}
|
||||
\end{minted}
|
||||
|
||||
\begin{minted}[linenos]{c}
|
||||
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
|
||||
/* In arch/arm64/mm/dma-mapping.c */
|
||||
void arch_sync_dma_for_cpu(
|
||||
phys_addr_t paddr,
|
||||
|
|
@ -411,7 +411,7 @@ void arch_sync_dma_for_cpu(
|
|||
|
||||
This call-chain, as well as its mirror case which maintains cache coherency for the DMA device after CPU access: \mint[breaklines=true]{c}|dma_sync_single_for_device(struct device *, dma_addr_t, size_t, enum dma_data_direction)|, call into the following procedures, respectively:
|
||||
|
||||
\begin{minted}[linenos]{c}
|
||||
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
|
||||
/* Exported @ arch/arm64/include/asm/cacheflush.h */
|
||||
/* Defined @ arch/arm64/mm/cache.S */
|
||||
/* All functions accept virtual start, end addresses. */
|
||||
|
|
@ -425,7 +425,7 @@ extern void dcache_inval_poc(
|
|||
unsigned long start, unsigned long end
|
||||
);
|
||||
|
||||
/* Clean data cache region [start, end) to PoC.
|
||||
/* Clean data cache region [start, end) to PoC. $\label{code:dcache_clean_poc}$
|
||||
*
|
||||
* Write-back CPU cache entries that intersect with [start, end),
|
||||
* such that data from CPU becomes visible to external observers.
|
||||
|
|
@ -438,7 +438,7 @@ extern void dcache_clean_poc(
|
|||
\subsubsection{Addendum: \texttt{enum dma\_data\_direction}}
|
||||
|
||||
The Linux kernel defines 4 direction \texttt{enum} values for fine-tuning synchronization behaviors:
|
||||
\begin{minted}[linenos]{c}
|
||||
\begin{minted}[linenos, bgcolor=code-bg]{c}
|
||||
/* In include/linux/dma-direction.h */
|
||||
enum dma_data_direction {
|
||||
DMA_BIDIRECTIONAL = 0, // data transfer direction uncertain.
|
||||
|
|
@ -452,25 +452,21 @@ These values allow for certain fast-paths to be taken at runtime. For example, \
|
|||
|
||||
% TODO: Move to addendum section.
|
||||
\subsubsection{Use-case: Kernel-space \textit{SMBDirect} Driver}
|
||||
\textit{SMBDirect} is an extension of the \textit{SMB} (\textit{Server Message Block}) protocol for opportunistically establishing the communication protocol over RDMA-capable network interfaces \cite{many.MSFTLearn-SMBDirect.2024}.
|
||||
An example of a cache-coherent in-kernel RDMA networking module over heterogeneous ISAs can be found in the Linux implementation of \textit{SMBDirect}. \textit{SMBDirect} is an extension of the \textit{SMB} (\textit{Server Message Block}) protocol for opportunistically establishing the communication protocol over RDMA-capable network interfaces \cite{many.MSFTLearn-SMBDirect.2024}.
|
||||
|
||||
We focus on two procedures inside the in-kernel SMBDirect implementation:
|
||||
|
||||
\paragraph{Before send: \texttt{smbd\_post\_send}}
|
||||
\begin{minted}[linenos]{c}
|
||||
\texttt{smbd\_post\_send} is a function downstream in the call-chain of \texttt{smbd\_send}, which sends an SMBDirect payload for transport over the network. Payloads are constructed and batched for maximized bandwidth, then \texttt{smbd\_post\_send} is called to signal the RDMA NIC for transport.
|
||||
|
||||
The function body is roughly as follows:
|
||||
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
|
||||
/* In fs/smb/client/smbdirect.c */
|
||||
static int smbd_post_send(
|
||||
struct smbd_connection *info, // SMBDirect transport context
|
||||
struct smbd_request *request, // SMBDirect request context
|
||||
) // ...
|
||||
\end{minted}
|
||||
|
||||
Downstream of \texttt{smbd\_send}, which sends SMBDirect payload for transport over network. Payloads are constructed and batched for maximized bandwidth, then \texttt{smbd\_post\_send} is called to signal the RDMA NIC for transport.
|
||||
|
||||
The function body is roughly as follows:
|
||||
\begin{minted}[linenos, firstnumber=last, mathescape]{c}
|
||||
{
|
||||
struct ib_send_wr send_wr; // "Write Request" for entire payload
|
||||
) {
|
||||
struct ib_send_wr send_wr; // IB "Work Request" for payload
|
||||
int rc, i;
|
||||
|
||||
/* For each message in batched payload */
|
||||
|
|
@ -503,18 +499,16 @@ The function body is roughly as follows:
|
|||
Line \ref{code:ib_dma_sync_single_for_device} writes back CPU cache lines so that they are visible to the RDMA NIC, in preparation for the DMA operations performed when the posted \textit{send request} is worked upon.
|
||||
|
||||
\paragraph{Upon reception: \texttt{recv\_done}}
|
||||
\begin{minted}[linenos]{c}
|
||||
\texttt{recv\_done} is called when the RDMA subsystem works on the received payload over RDMA.
|
||||
|
||||
Mirroring the case for \texttt{smbd\_post\_send}, it invalidates CPU cache lines for DMA-ed data to be visible at CPU cores prior to any operations on received data:
|
||||
|
||||
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
|
||||
/* In fs/smb/client/smbdirect.c */
|
||||
static void recv_done(
|
||||
struct ib_cq *cq, // "Completion Queue"
|
||||
struct ib_wc *wc, // "Work Completion"
|
||||
) // ...
|
||||
\end{minted}
|
||||
|
||||
Called when the RDMA subsystem works on the received payload over RDMA. Mirroring the case for \texttt{smbd\_post\_send}, it invalidates CPU cache lines for DMA-ed data to be visible at CPU cores prior to any operations on received data:
|
||||
|
||||
\begin{minted}[linenos, firstnumber=last, mathescape]{c}
|
||||
{
|
||||
) {
|
||||
struct smbd_data_transfer *data_transfer;
|
||||
struct smbd_response *response = container_of(
|
||||
wc->wr_cqe, // ptr: pointer to member
|
||||
|
|
@ -539,9 +533,83 @@ Called when the RDMA subsystem works on the received payload over RDMA. Mirrorin
|
|||
\end{minted}
|
||||
|
||||
\chapter{Software Coherency Latency}
|
||||
Coherency must be maintained at the software level when hardware cache coherency cannot be guaranteed for a specific ISA (as established in subsection \ref{subsec:armv8a-swcoherency}). There is, therefore, interest in knowing the latency of coherence-maintenance operations for performance engineering purposes -- for example, OS jitter analysis for scientific computing in heterogeneous clusters and, more pertinently, comparative analysis between software- and hardware-backed DSM systems (e.g. \cites{Masouros_etal.Adrias.2023}{Wang_etal.Concordia.2021}).
|
||||
|
||||
The purpose of this chapter is hence to provide a statistical analysis of software coherency latency in ARM64 systems by instrumenting hypothetical scenarios of software-initiated coherence maintenance in ARM64 test-benches.
|
||||
|
||||
The rest of the chapter is structured as follows:
|
||||
\begin{itemize}
|
||||
\item {
|
||||
\hyperref[sec:sw-coherency-setup]{\textbf{Experiment Setup}} covers the test-benches used for instrumentation, including the kernel version, distribution, and the specifications of the instrumented (bare-metal/virtual) machine.
|
||||
}
|
||||
\item {
|
||||
\hyperref[sec:sw-coherency-method]{\textbf{Methodology}} covers the kernel module and workload used for instrumentation and experimentation, including changes made to the kernel, the kernel module, and userspace programs used for experimentation.
|
||||
}
|
||||
\item {
|
||||
\hyperref[sec:sw-coherency-results]{\textbf{Results}} covers the results gathered during instrumentation from various test-benches, segmented by experiment.
|
||||
}
|
||||
\item {
|
||||
\hyperref[sec:sw-coherency-discuss]{\textbf{Discussion}} identifies key insights from the experimental results, as well as deficiencies in the research method and possible directions for future work.
|
||||
}
|
||||
\end{itemize}
|
||||
|
||||
\section{Experiment Setup}\label{sec:sw-coherency-setup}
|
||||
\subsection{QEMU-over-x86: \texttt{star}}
|
||||
The primary source of experimental data is a virtualized machine: a virtualized guest running a lightly-customized Linux v6.7.0 preemptive kernel with a standard non-graphical Debian 12 distribution installed to provide userspace support. The specifics of this QEMU-emulated ARM64 test-bench, running atop an x86-64 host PC, are given in Table~\ref{table:2}.
|
||||
|
||||
\begin{table}[h]
|
||||
\centering
|
||||
\begin{tabular}{|c|c|}
|
||||
\hline
|
||||
Processors & 3x QEMU virt-8.2 (2-way SMT; emulates Cortex-A76) \\
|
||||
\hline
|
||||
CPU Flags &
|
||||
\begin{tabular}{@{}cccccc@{}}
|
||||
% 1 2 3 4 5 6
|
||||
fp & asimd & evtstrm & aes & pmull & sha1 \\
|
||||
sha2 & crc32 & atomics & fphp & asimdhp & cpuid \\
|
||||
asimdrdm & lrcpc & dcpop & asimddp & & \\
|
||||
\end{tabular} \\
|
||||
\hline
|
||||
NUMA Nodes & 1: $\{P_0, \dots, P_5\}$ \\
|
||||
\hline
|
||||
Memory & 4GiB \\
|
||||
\hline
|
||||
\end{tabular}
|
||||
\caption{Specification of \texttt{star}}
|
||||
\label{table:2}
|
||||
\end{table}
|
||||
|
||||
\begin{table}[h]
|
||||
\centering
|
||||
\begin{tabular}{|c|c|}
|
||||
\hline
|
||||
\end{tabular}
|
||||
\caption{Specification of Host}
|
||||
\label{table:3}
|
||||
\end{table}
|
||||
|
||||
\subsection{\textit{Neoverse N1}: \texttt{rose}}
|
||||
% - QEMU-over-x86; preemptive-on-preemptive
|
||||
% - Native server-ready ARM64 (preemptive), which I didn't run for long ngl
|
||||
|
||||
\section{Methodology}\label{sec:sw-coherency-method}
|
||||
\subsection{Exporting \texttt{dcache\_clean\_poc}}
|
||||
\subsection{Kernel Module: \texttt{my\_shmem}}
|
||||
\subsection{Instrumentation: \texttt{ftrace} and \textit{eBPF}}
|
||||
\subsection{Userspace Programs}
|
||||
|
||||
\section{Results}\label{sec:sw-coherency-results}
|
||||
\subsection{Controlled Allocation Size; Variable Page Count}
|
||||
\subsection{Controlled Page Count; Variable Allocation Size}
|
||||
|
||||
\section{Discussion}\label{sec:sw-coherency-discuss}
|
||||
% - you should also measure the access latency after coherency operation, though this is impl-specific (e.g., one vendor can have a simple PoC mechanism where e.g. you have a shared L2-cache that is snooped by DMA engine, hence flush to L2-cache and call it a day for PoC; but another can just as well call main mem the PoC, dep. on impl.)
|
||||
|
||||
\chapter{DSM System Design}
|
||||
|
||||
\chapter{Summary}
|
||||
|
||||
% \bibliographystyle{plain}
|
||||
% \bibliographystyle{plainnat}
|
||||
% \bibliography{mybibfile}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue