% UG project example file, February 2022
% A minor change in citation, September 2023 [HS]
% (ZCh.) typo in template smh
% Do not change the first two lines of code, except you may delete "logo," if causing problems.
% Understand any problems and seek approval before assuming it's ok to remove ugcheck.
\documentclass[logo,bsc,singlespacing,parskip]{infthesis}
\usepackage{ugcheck}
% Include any packages you need below, but don't include any that change the page
% layout or style of the dissertation. By including the ugcheck package above,
% you should catch most accidental changes of page layout though.
\usepackage{microtype} % recommended, but you can remove if it causes problems
% \usepackage{natbib} % recommended for citations % but I have no experience with natbib...
\usepackage[utf8]{inputenc}
\usepackage[dvipsnames]{xcolor}
\usepackage[justification=centering]{caption}
\usepackage{graphicx}
\usepackage[english]{babel}
\usepackage{float}
% -> biblatex
\usepackage{biblatex}
\addbibresource{mybibfile.bib}
% <- biblatex
% -> definition & quotes
\usepackage{csquotes}
\usepackage{amsthm}
\theoremstyle{definition}
\newtheorem{definition}{Definition}
% <- definition & quotes
% -> code listing
% [!] Requires external program: pypi:Pygments
\usepackage{minted}
\usemintedstyle{xcode}
\definecolor{code-bg}{rgb}{0.98, 0.98, 0.99}
% <- code listing
% -> draw textbook-style frames
\usepackage{mdframed}
% <- frames
% -> href (LOAD LAST)
\usepackage{hyperref}
% <- href
% -> subfigures
\usepackage{subcaption}
% <- subfigures
% -> font fix
\usepackage[T1]{fontenc}
% <- font fix
\begin{document}
\begin{preliminary}
\title{Analysis of Software-Maintained Cache Coherency in ARMv8-A for Cross-Architectural DSM Systems}
\author{Zhengyi Chen}
% CHOOSE YOUR DEGREE a):
% please leave just one of the following un-commented
% \course{Artificial Intelligence}
%\course{Artificial Intelligence and Computer Science}
%\course{Artificial Intelligence and Mathematics}
%\course{Artificial Intelligence and Software Engineering}
%\course{Cognitive Science}
\course{Computer Science}
%\course{Computer Science and Management Science}
%\course{Computer Science and Mathematics}
%\course{Computer Science and Physics}
%\course{Software Engineering}
%\course{Master of Informatics} % MInf students
% CHOOSE YOUR DEGREE b):
% please leave just one of the following un-commented
%\project{MInf Project (Part 1) Report} % 4th year MInf students
%\project{MInf Project (Part 2) Report} % 5th year MInf students
\project{4th Year Project Report} % all other UG4 students
\date{\today}
\abstract{
Advancements in network interface hardware and operating system capabilities sometimes render historically unpopular computer system architectures feasible again. One unusual example is software DSM, which may regain its relevance as a unique solution to sharing hardware acceleration resources beyond hypervisor-level allocation, made feasible by exploiting existing features in the Linux kernel such as \textit{heterogeneous memory management} alongside RDMA-capable network interfaces. Building such a DSM system between compute nodes of different ISAs is, however, far from trivial. We particularly note that, unlike x86, many RISC ISAs (e.g., ARMv8, RISC-V) do not guarantee cache coherence between the CPU and DMA engines at the hardware level; instead, such systems define cache coherence operations at the instruction level, which nevertheless consume CPU time. To better advise DSM design for such systems, we measure the latency of software-initiated cache coherency operations on an emulated, non-hardware-cache-coherent ARMv8 processor across a variety of scenarios, relying solely on the Linux kernel's source code and its instrumentation mechanisms. We find that the latency of software-initiated cache coherency operations grows with the size of the address subspace on which the coherency operation is performed, although the relation between contiguous allocation size and latency is non-linear.
}
\maketitle
\newenvironment{ethics}
{\begin{frontenv}{Research Ethics Approval}{\LARGE}}
{\end{frontenv}\newpage}
\begin{ethics}
% \textbf{Instructions:} \emph{Agree with your supervisor which
% statement you need to include. Then delete the statement that you are not using,
% and the instructions in italics.\\
% \textbf{Either complete and include this statement:}}\\ % DELETE THESE INSTRUCTIONS
% %
% % IF ETHICS APPROVAL WAS REQUIRED:
% This project obtained approval from the Informatics Research Ethics committee.\\
% Ethics application number: ???\\
% Date when approval was obtained: YYYY-MM-DD\\
% %
% \emph{[If the project required human participants, edit as appropriate, otherwise delete:]}\\ % DELETE THIS LINE
% The participants' information sheet and a consent form are included in the appendix.\\
% %
% IF ETHICS APPROVAL WAS NOT REQUIRED:
% \textbf{\emph{Or include this statement:}}\\ % DELETE THIS LINE
This project was planned in accordance with the Informatics Research
Ethics policy. It did not involve any aspects that required approval
from the Informatics Research Ethics committee.
\standarddeclaration
\end{ethics}
\begin{acknowledgements}
I would like to acknowledge, first, the guidance and education I received from my supervisor \textit{Antonio Barbalace}, as well as his student, \textit{Amir Noohi}. Without them, this thesis could not have come to fruition.
Secondly, I would like to acknowledge my mother. Much has happened in our lives, but hopefully this means something, at least.
Third, the facsimile fox standing above the overhangs of a mirror shop (or perhaps a portrait frame shop) along the bus route I commute between school and my place of living. I simply find it neat.
Finally, I would like to extend my best wishes for the liberation of Palestine.
\end{acknowledgements}
\tableofcontents
\end{preliminary}
\chapter{Introduction}
Though large-scale cluster systems remain the dominant solution for request and data-level parallelism \cite{BOOK.Hennessy_Patterson.CArch.2011}, there has been a resurgence in applying HPC techniques (e.g., DSM) for more efficient heterogeneous computation, with tightly-coupled heterogeneous nodes providing (hardware) acceleration for one another \cites{Cabezas_etal.GPU-SM.2015}{Ma_etal.SHM_FPGA.2020}{Khawaja_etal.AmorphOS.2018}. Orthogonally, within one cluster node, \emph{heterogeneous memory management (HMM)} enables an OS-controlled, unified memory view across both main memory and device memory \cite{WEB.NVIDIA.Harris.Unified_Memory_CUDA.2017} -- all while using the same \textit{libc} function calls as one would with SMP programming; the underlying complexities of memory ownership and data placement are automatically managed by the OS kernel. However, while HMM promises a distributed-shared-memory-like approach to exposing CPU and peripheral memory, applications (drivers and front-ends) that exploit HMM to provide ergonomic programming models remain fragmented and narrowly focused. Existing efforts to exploit HMM in Linux predominantly focus on exposing a global address space abstraction over GPU memory -- a largely uncoordinated effort spanning both \textit{in-tree} and proprietary code \cites{WEB.LWN.Corbet.HMM_GPL_woes.2018}{WEB.Phoronix..HMM_Search_Results.2023}. Limited effort has been made to incorporate HMM into other variants of accelerators and system topologies.
Orthogonally, allocating hardware accelerator resources in a cluster computing environment becomes difficult when the accelerator resources required by a workload cannot be easily determined and/or isolated as a ``stage'' of computation. Within a cluster there may exist a large number of general-purpose worker nodes and a limited number of hardware-accelerated nodes. Further, it is possible that every workload on this cluster asks for hardware acceleration from time to time, but never for very long. Many job scheduling mechanisms within a cluster \emph{move data near computation} by migrating the entire job/container between general-purpose and accelerator nodes \cites{Rodriguez_etal.HPC_Cluster_Migration.2019} {Oh_Kim.Container_Migration.2018}. This style of migration naturally incurs a large overhead -- accelerator nodes that strictly perform computation on in-memory data, without ever needing to touch the container's filesystem, should not have to install the entire filesystem locally, for starters. Moreover, must \emph{all} computation be performed near data? \textit{Adrias}\cite{Masouros_etal.Adrias.2023}, for example, shows that RDMA over fast network interfaces (25 Gbps $\times$ 8), when compared to node-local setups, has a negligible impact on tail latencies but a high impact on throughput when bandwidth is saturated.
This thesis hence builds upon an ongoing research effort to implement an in-kernel DSM system on top of a tightly coupled cluster, where \textit{HMM} (\textit{Heterogeneous Memory Management}) abstractions allow transparent RDMA access from accelerator nodes to local data, as well as migration of data near computation. More specifically, this thesis explores the latency incurred by OS-initiated software cache coherency maintenance procedures common to all (R)DMA programs. The findings in this thesis are expected to inform the software coherence protocol and consistency model design of the in-kernel DSM system for accelerator-sharing purposes, under a reusable, simple testing framework.
\chapter{Background}\label{chapter:background}
We introduce the following aspects pertaining to the in-kernel DSM project within this chapter:
\begin{itemize}
\item {
We identify and discuss notable developments in software-implemented DSM systems, and thus identify key features of contemporary advancements in DSM techniques that differentiate them from their predecessors.
}
\item {
We identify alternative (shared memory) programming paradigms and compare them with DSM, which seeks to provide a transparent shared address space among participating nodes.
}
\item {
We give an overview of coherence protocols and consistency models for multi-sharer DSM systems.
}
\item {
We provide a primer to cache coherency in ARM64 systems, which \emph{do not} guarantee cache-coherent DMA, as opposed to x86 systems \cite{Ven.LKML_x86_DMA.2008}.
}
\end{itemize}
\section{Experiences from Software DSM}
A majority of contributions to software DSM systems come from the 1990s \cites{Amza_etal.Treadmarks.1996}{Carter_Bennett_Zwaenepoel.Munin.1991}{Itzkovitz_Schuster_Shalev.Millipede.1998}{Hu_Shi_Tang.JIAJIA.1999}. These developments follow from the success of the Stanford DASH project in the late 1980s -- a hardware distributed shared memory (specifically NUMA) implementation of a multiprocessor that first proposed the \textit{directory-based protocol} for cache coherence, which stores the ownership information of cache lines to reduce unnecessary communication that prevented previous multiprocessors from scaling out \cite{Lenoski_etal.Stanford_DASH.1992}.
While developments in hardware DSM materialized into a universal approach to cache coherence in contemporary many-core processors (e.g., \textit{Ampere Altra}\cite{WEB.Ampere..Ampere_Altra_Datasheet.2023}), software DSM in clustered computing languished in favor of loosely-coupled nodes performing data-parallel computation and communicating via message-passing. The bandwidth of late-1990s network interfaces was insufficient to support the high traffic incurred by DSM and its programming model \cites{Werstein_Pethick_Huang.PerfAnalysis_DSM_MPI.2003}{Lu_etal.MPI_vs_DSM_over_cluster.1995}.
New developments in network interfaces provide much-improved bandwidth and latency compared to the Ethernet of the 1990s. RDMA-capable NICs have been shown to improve training efficiency sixfold compared to distributed \textit{TensorFlow} via RPC, scaling positively over non-distributed training \cite{Jia_etal.Tensorflow_over_RDMA.2018}. Similar results have been observed for \textit{APACHE Spark} \cite{Lu_etal.Spark_over_RDMA.2014} and \textit{SMBDirect} \cite{Li_etal.RelDB_RDMA.2016}. Consequently, there has been a resurgence of interest in software DSM systems and programming models \cites{Nelson_etal.Grappa_DSM.2015}{Cai_etal.Distributed_Memory_RDMA_Cached.2018}.
\subsection{Munin: Multi-Consistency Protocol}
\textit{Munin}\cite{Carter_Bennett_Zwaenepoel.Munin.1991} is one of the older developments in software DSM systems. The authors of Munin identify that \textit{false-sharing}, which occurs when multiple processors write to different offsets of the same page and thereby trigger invalidations, is strongly detrimental to the performance of shared-memory systems. To combat this, Munin exposes annotations as part of its programming model to facilitate multiple consistency protocols on top of release consistency. An immutable shared memory object across readers, for example, can be safely copied without concern for coherence between processors. On the other hand, the \textit{write-shared} annotation explicates that a memory object is written by multiple processors without synchronization -- i.e., the programmer guarantees that only false-sharing occurs within this granularity. Annotations such as these explicitly disable subsets of consistency procedures to reduce communication in the network fabric, thereby improving the performance of the DSM system.
Perhaps most importantly, experiences from Munin show that \emph{restricting the flexibility of the programming model can lead to more performant coherence models}, as exhibited by the now-foundational \textit{Resilient Distributed Datasets} paper \cite{Zaharia_etal.RDD.2012} which powered many now-popular scalable data processing frameworks such as \textit{Hadoop MapReduce} \cite{WEB.APACHE..Apache_Hadoop.2023} and \textit{APACHE Spark} \cite{WEB.APACHE..Apache_Spark.2023}. ``To achieve fault tolerance efficiently, RDDs provide a restricted form of shared memory [based on]\dots transformations rather than\dots updates to shared state'' \cite{Zaharia_etal.RDD.2012}. This allows transformation logs to be used to cheaply synchronize state between unshared address spaces -- a much desired property for highly scalable, loosely-coupled clustered systems.
\subsection{Treadmarks: Multi-Writer Protocol}
\textit{Treadmarks}\cite{Amza_etal.Treadmarks.1996} is a software DSM system developed in 1996, which featured an intricate \textit{interval}-based multi-writer protocol that allows multiple nodes to write to the same page without false-sharing. The system follows a release-consistent memory model, which requires the use of either locks (via \texttt{acquire}, \texttt{release}) or barriers (via \texttt{barrier}) to synchronize. Each \textit{interval} represents a time period in-between page creation, a \texttt{release} to another processor, or a \texttt{barrier}; each interval also corresponds to a \textit{write notice}, which is used for page invalidation. Each \texttt{acquire} message is sent to the statically-assigned lock-manager node, which forwards the message to the last releaser. The last releaser computes the outstanding write notices and piggy-backs them onto its reply, allowing the acquirer to invalidate its own cached page entries and thereby signify entry into the critical section. Consistency information, including write notices, intervals, and page diffs, is routinely garbage-collected, which forces cached pages in each node to be validated.
Compared to \textit{Treadmarks}, the system described in this paper uses a single-writer protocol, thus eliminating the concept of ``intervals'' -- with regards to synchronization, each page can be either in-sync (in which case it can be safely shared) or out-of-sync (in which case it must be invalidated/updated). This comes with the following advantages:
\begin{itemize}
\item Less metadata for consistency-keeping.
\item More adherent to the CPU-accelerator dichotomy model.
\item Much simpler coherence protocol, which reduces communication cost.
\end{itemize}
In view of the (still) disparate throughput and latency differences between local and remote memory access \cite{Cai_etal.Distributed_Memory_RDMA_Cached.2018}, the simpler coherence protocol of single-writer protocol should provide better performance on the critical paths of remote memory access.
\subsection{Hotpot: Single-Writer \& Data Replication}
Newer works such as \textit{Hotpot}\cite{Shan_Tsai_Zhang.DSPM.2017} apply distributed shared memory techniques to persistent memory to provide ``transparent memory accesses, data persistence, data reliability, and high availability''. Leveraging persistent memory devices allows DSM applications to bypass checkpointing to block-device storage \cite{Shan_Tsai_Zhang.DSPM.2017}, ensuring both distributed cache coherence and data reliability at the same time \cite{Shan_Tsai_Zhang.DSPM.2017}.
We specifically discuss the single-writer portion of its coherence protocol. The data reliability guarantees proposed by the \textit{Hotpot} system require each shared page to be replicated to some \textit{degree of replication}. Nodes that always store the latest replica of a shared page are referred to as ``owner nodes''; they arbitrate which other nodes store additional replicas in order to meet the degree-of-replication quota. At acquisition time, the acquiring node asks the access-management node for single-writer access to a shared page, which it grants if no other critical section exists, alongside a list of current owner nodes. At release time, the releaser first commits its changes to all owner nodes which, in turn, commit the received changes across lesser sharers to achieve the required degree of replication. Both operations are acknowledged back in reverse order. Once the releaser has received all acknowledgements from the owner nodes, it tells them to delete their commit logs and, finally, tells the manager node to exit the critical section.
The required degree of replication, together with commit logs that are retained until explicit deletion, facilitates crash recovery at the expense of additional release-time I/O. While the study of crash recovery with respect to shared memory systems is out of the scope of this thesis, this paper provides a good framework for a \textbf{correct} coherence protocol for a single-writer, multiple-reader shared memory system, particularly when the protocol needs to cater for a great variety of nodes, each with their own memory preferences (e.g., write-update vs. write-invalidate, prefetching, etc.).
\subsection{MENPS: A Return to DSM}
MENPS\cite{Endo_Sato_Taura.MENPS_DSM.2020} leverages new RDMA-capable interconnects as a proof-of-concept that DSM systems and programming models can be as efficient as \textit{partitioned global address space} (PGAS) using today's network interfaces. It builds upon \textit{Treadmarks}' \cite{Amza_etal.Treadmarks.1996} coherence protocol and crucially alters it into a \textit{floating home-based} protocol, based on the insight that diff transfers across the network are costly compared to RDMA intrinsics -- which implies a preference for local diff-merging. The home node then acts as the data supplier for every shared page within the system.
Compared to PGAS frameworks (e.g., MPI), experimentation over a subset of \textit{NAS Parallel Benchmarks} shows that MENPS can obtain comparable speedup in some of the computation tasks, while achieving much better productivity due to DSM's support for transparent caching, etc. \cite{Endo_Sato_Taura.MENPS_DSM.2020}. These results back up their claim that DSM systems are at least as viable as traditional PGAS/message-passing frameworks for scientific computing, also corroborated by the resurgence of DSM studies later on\cite{Masouros_etal.Adrias.2023}.
\section{Alternatives to DSM}
While the feasibility of transparent DSM systems over multiple machines on the network has been apparent since the 1980s, predominant approaches to ``scaling-out'' programs over the network rely on the message-passing approach \cite{AST_Steen.Distributed_Systems-3ed.2017}. The reasons are twofold:
\begin{enumerate}
\item {
Programmers would rather resort to more intricate, more predictable approaches to scaling-out programs over the network \cite{AST_Steen.Distributed_Systems-3ed.2017}. This implies manual/controlled data sharding over nodes, separation of compute and communication ``stages'' of computation, etc., which benefit performance analysis and engineering.
}
\item {
Enterprise applications value throughput and uptime of relatively computationally inexpensive tasks/resources \cite{BOOK.Hennessy_Patterson.CArch.2011}, which requires easy scalability of tried-and-true, latency-inexpensive applications. Studies in transparent DSM systems mostly require exotic, specifically-written programs to exploit the global address space, which is fundamentally at odds with the reusability and flexibility required.
}
\end{enumerate}
\subsection{PGAS}
\textit{Partitioned Global Address Space} (PGAS) is a parallel programming model that (1) exposes a global address space to all machines within a network and (2) explicates the distinction between local and remote memory \cite{De_Wael_etal.PGAS_Survey.2015}. Oftentimes, message-passing frameworks, for example \textit{OpenMPI}, \textit{OpenFabrics}, and \textit{UCX}, are used as backends to provide the PGAS model over various network interfaces/platforms (e.g., Ethernet and Infiniband)\cites{WEB.LBNL.UPC_man_1_upcc.2022} {WEB.HPE.Chapel_Platforms-v1.33.2023}.
Notably, implementing a \emph{global} address space across machines that are already equipped with their own \emph{local} address spaces (e.g., cluster nodes running commercial Linux) necessitates a global addressing mechanism for shared data objects. DART\cite{Zhou_etal.DART-MPI.2014}, for example, utilizes a 128-bit ``global pointer'' that encodes the global memory object/segment ID and access flags in the upper 64 bits and a virtual address in the lower 64 bits for each (slice of a) memory object allocated within the PGAS model. A \textit{non-collective} PGAS object is allocated entirely in the allocating node's local memory, but registered globally. Consequently, a single global pointer is recorded in the runtime with corresponding permission flags for the context of some user-defined group of associated nodes. Comparatively, a \textit{collective} PGAS object is allocated such that a partition of the object (i.e., a sub-array of the repr) is stored in each of the associated nodes -- for a $k$-partitioned object, $k$ global pointers are recorded in the runtime, each pointing to the same object with a different offset and an (intuitively) independently-chosen virtual address. Note that this design naturally requires virtual addresses within each node to be \emph{pinned} -- the allocated object cannot be re-addressed to a different virtual address, which prevents the global pointer that records the local virtual address from becoming spontaneously invalidated.
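For illustration, such a global pointer can be pictured as the following C struct; the field names and exact widths are assumptions chosen for exposition, not DART's actual definition:
\begin{minted}[linenos, bgcolor=code-bg]{c}
#include <stdint.h>

/* Sketch of a DART-style 128-bit global pointer (illustrative only). */
struct global_ptr {
    /* Upper 64 bits: global metadata. */
    uint32_t segment_id; /* ID of the registered PGAS object/segment */
    uint16_t unit_id;    /* owning node within the associated group */
    uint16_t flags;      /* access permission flags */
    /* Lower 64 bits: node-local location. */
    uint64_t vaddr;      /* pinned virtual address on the owning node */
};
\end{minted}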
Similar schemes can be observed in other PGAS backends/runtimes, although they may opt to use a map-like data structure for addressing instead. In general, although both PGAS and DSM systems provide memory management over remote nodes, PGAS frameworks provide no transparent caching or transfer of remote memory objects accessed by local nodes. The programmer is still expected to handle data/thread movement manually when working with shared memory over the network to maximize their performance metrics of interest.
\subsection{Message Passing}
\label{sec:msg-passing}
\textit{Message Passing} remains the predominant programming model for parallelism between loosely-coupled nodes within a computer system, much as it is ubiquitous in supporting all levels of abstraction within the concurrent components of a computer system. Specific to cluster computing is the message-passing programming model, where parallel programs (or instances of the same parallel program) on different nodes within the system communicate by exchanging messages over the network between these nodes. Such models trade programming-model productivity for finer-grained control over the messages passed, as well as a more explicit separation between the communication and computation stages of a programming subproblem.
Commonly, message-passing backends function as \textit{middlewares} -- communication runtimes -- to aid distributed software development \cite{AST_Steen.Distributed_Systems-3ed.2017}. Such a message-passing backend exposes facilities for inter-application communication to frontend developers while transparently providing security, accounting, and fault-tolerance, much like how an operating system provides resource management, scheduling, and security to traditional applications \cite{AST_Steen.Distributed_Systems-3ed.2017}. This is the case for implementations of the PGAS programming model, which mostly rely on common message-passing backends to facilitate orchestrated data manipulation across distributed nodes. Likewise, message-passing backends, including the RDMA API, form the backbone of many research-oriented DSM systems \cites{Endo_Sato_Taura.MENPS_DSM.2020}{Hong_etal.NUMA-to-RDMA-DSM.2019} {Cai_etal.Distributed_Memory_RDMA_Cached.2018}{Kaxiras_etal.DSM-Argos.2015}.
Message-passing between network-connected nodes may be \textit{two-sided} or \textit{one-sided}. The former models an intuitive workflow for sending and receiving datagrams over the network: the sender initiates a transfer; the receiver copies a received packet from the network card into a kernel buffer; the receiver's kernel filters the packet and (optionally) \cite{FreeBSD.man-BPF-4.2021} copies the internal message into the message-passing runtime/middleware's address space; the receiver's middleware inspects the copied message and performs some procedure accordingly, likely also copying slices of the message data into some registered distributed shared memory buffer for the distributed application to access. Despite being a highly intuitive model of data manipulation over the network, this poses a fundamental performance issue: both the receiver's kernel and its userspace must expend CPU time on every received message in order to move the received data from the NIC into userspace. Because this happens concurrently with other kernel and userspace routines, a preemptible kernel may incur significant latency if the kernel routine for packet filtering is preempted by another kernel routine, userspace, or IRQs.
Comparatively, a ``one-sided'' message-passing scheme, for example RDMA, allows the network interface card to bypass in-kernel packet filters and perform DMA on registered memory regions. The NIC can hence notify the CPU via interrupts, thus allowing the kernel and the userspace programs to perform callbacks at reception time with reduced latency. Because of this advantage, many recent studies attempt to leverage RDMA APIs for improved distributed data workloads and creating DSM middlewares \cites{Lu_etal.Spark_over_RDMA.2014} {Jia_etal.Tensorflow_over_RDMA.2018}{Endo_Sato_Taura.MENPS_DSM.2020} {Hong_etal.NUMA-to-RDMA-DSM.2019}{Cai_etal.Distributed_Memory_RDMA_Cached.2018} {Kaxiras_etal.DSM-Argos.2015}.
\section{Consistency Model and Cache Coherence}
A consistency model specifies a contract on the allowed behaviors of multi-processing programs with regards to a shared memory \cite{Nagarajan_etal.Primer_consistency_coherence_arch.2ed.2020}. One obvious conflict, which consistency models aim to resolve, lies in the interaction between processor-native programs and multi-processors, all of which need to operate on a shared memory with heterogeneous cache topologies. Here, a well-defined consistency model aims to resolve the conflict at the architectural scope. Beyond consistency models for bare-metal systems, programming languages \cites{ISO/IEC_9899:2011.C11}{ISO/IEC_JTC1_SC22_WG21_N2427.C++11.2007} {Manson_Goetz.JSR_133.Java_5.2004}{Rust.core::sync::atomic::Ordering.2024} and paradigms \cites{Amza_etal.Treadmarks.1996}{Hong_etal.NUMA-to-RDMA-DSM.2019} {Cai_etal.Distributed_Memory_RDMA_Cached.2018} define consistency models for parallel access to shared memory on top of program-order guarantees, so as to explicate program behavior under shared-memory parallel programming across underlying implementations.
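For concreteness, the following minimal C11 example illustrates the acquire/release ordering that such language-level models expose; it is purely illustrative and unrelated to any particular DSM system discussed here:
\begin{minted}[linenos, bgcolor=code-bg]{c}
#include <stdatomic.h>
#include <stdbool.h>

int payload;                 /* plain data, published via the flag below */
atomic_bool ready = false;

void producer(void)
{
    payload = 42;            /* ordinary store */
    /* Release store: the write to `payload` becomes visible
     * before any observer sees `ready == true`. */
    atomic_store_explicit(&ready, true, memory_order_release);
}

bool consumer(int *out)
{
    /* Acquire load: pairs with the release store above. */
    if (atomic_load_explicit(&ready, memory_order_acquire)) {
        *out = payload;      /* guaranteed to observe 42 */
        return true;
    }
    return false;
}
\end{minted}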
Related to the definition of a consistency model is the coherence problem, which arises whenever multiple actors hold copies of some datum that must be kept synchronized in the presence of write-accesses \cite{Nagarajan_etal.Primer_consistency_coherence_arch.2ed.2020}. While less relevant to programming language design, coherence must be maintained via a coherence protocol \cite{Nagarajan_etal.Primer_consistency_coherence_arch.2ed.2020} in systems of both microarchitectural and network scales. For DSM systems, the design of a correct and performant coherence protocol is of especially high priority and is a major part of many studies in DSM systems throughout history \cites{Carter_Bennett_Zwaenepoel.Munin.1991}{Amza_etal.Treadmarks.1996} {Pinto_etal.Thymesisflow.2020}{Endo_Sato_Taura.MENPS_DSM.2020} {Couceiro_etal.D2STM.2009}.
\subsection{Consistency Model in DSM}
Distributed shared memory systems with node-local caching naturally imply the existence of the consistency problem with regards to contending read/write accesses. Indeed, a significant subset of DSM studies explicitly characterize themselves as adhering to one of the well-known consistency models to better understand system behavior and to provide optimizations in coherence protocols \cites{Amza_etal.Treadmarks.1996}{Hu_Shi_Tang.JIAJIA.1999} {Carter_Bennett_Zwaenepoel.Munin.1991}{Endo_Sato_Taura.MENPS_DSM.2020} {Wang_etal.Concordia.2021}{Cai_etal.Distributed_Memory_RDMA_Cached.2018} {Kim_etal.DeX-upon-Linux.2020}, each adhering to a different consistency model to balance communication costs against ease of programming.
In particular, we note that DSM studies tend to conform to either release consistency \cites{Amza_etal.Treadmarks.1996}{Endo_Sato_Taura.MENPS_DSM.2020} {Carter_Bennett_Zwaenepoel.Munin.1991} or weaker \cite{Hu_Shi_Tang.JIAJIA.1999}, or sequential consistency \cites{Chaiken_Kubiatowicz_Agarwal.LimitLESS-with-Alewife.1991} {Wang_etal.Concordia.2021}{Kim_etal.DeX-upon-Linux.2020}{Ding.vDSM.2018}, with few works \cite{Cai_etal.Distributed_Memory_RDMA_Cached.2018} pertaining to moderately constrained consistency models in-between. While older works, as well as works which center performance of their proposed DSM systems over existing approaches \cites{Endo_Sato_Taura.MENPS_DSM.2020} {Cai_etal.Distributed_Memory_RDMA_Cached.2018}, favor release consistency due to its performance benefits (e.g., in terms of coherence costs \cite{Endo_Sato_Taura.MENPS_DSM.2020}), newer works tend to adopt stricter consistency models, sometimes due to improved productivity offered to programmers \cite{Kim_etal.DeX-upon-Linux.2020}.
\begin{table}[h]
\centering
\begin{tabular}{|l|c c c c c c|}
\hline
% ...
& Sequential
& TSO
& PSO
& Release
& Acquire
& Scope \\
\hline
Home; Invalidate
& \cites{Kim_etal.DeX-upon-Linux.2020}{Ding.vDSM.2018}{Zhang_etal.GiantVM.2020}
&
&
& \cites{Shan_Tsai_Zhang.DSPM.2017}{Endo_Sato_Taura.MENPS_DSM.2020}
& \cites{Holsapple.DSM64.2012}
& \cites{Hu_Shi_Tang.JIAJIA.1999} \\
\hline
Home; Update
& & & & & & \\
\hline
Float; Invalidate
&
&
&
& \cites{Endo_Sato_Taura.MENPS_DSM.2020}
&
& \\
\hline
Float; Update
& & & & & & \\
\hline
Directory; Inval.
& \cites{Wang_etal.Concordia.2021}
&
&
&
&
& \\
\hline
Directory; Update
& & & & & & \\
\hline
Dist. Dir.; Inval.
& \cites{Chaiken_Kubiatowicz_Agarwal.LimitLESS-with-Alewife.1991}
&
& \cites{Cai_etal.Distributed_Memory_RDMA_Cached.2018}
& \cites{Carter_Bennett_Zwaenepoel.Munin.1991}
& \cites{Carter_Bennett_Zwaenepoel.Munin.1991}{Amza_etal.Treadmarks.1996}
& \\
\hline
Dist. Dir.; Update
&
&
&
& \cites{Carter_Bennett_Zwaenepoel.Munin.1991}
&
& \\
\hline
\end{tabular}
\caption{
Coherence Protocol vs. Consistency Model in Selected Disaggregated Memory Studies. ``Float'' short for ``floating home''. Studies selected for clearly described consistency model and coherence protocol.
}
\label{table:consistency-vs-coherency}
\end{table}
We especially note the role of balancing productivity and performance when selecting the ideal consistency model for a system. Weaker consistency models are harder to program with, but they imply less coherence communication and hence better overall throughput -- provided that the programmer can guarantee correctness, a weaker consistency model allows for less invalidation of node-local cache entries, thereby allowing multiple nodes to compute in parallel on (likely) outdated local copies of data such that the result of the computation remains semantically correct with regards to the program. This point was made explicit in \textit{Munin} \cite{Carter_Bennett_Zwaenepoel.Munin.1991}, which (to reiterate) introduces consistency ``protocol parameters'' to annotate shared memory access patterns, in order to reduce the amount of coherence communication necessary between nodes computing on distributed shared memory. For example, a DSM object (a memory object accounted for by the DSM system) can be annotated with ``delayed operations'' to delay coherence operations beyond any write-access, or shared without the ``write'' annotation to disable write-access across sharing nodes, thereby disabling all coherence operations with regards to this DSM object. Via programmer annotation of DSM objects, the Munin DSM system explicates the effect of weaker consistency on the amount of synchronization overhead necessary among shared memory nodes. To our knowledge, no more recent DSM work has explored this interaction between consistency and coherence costs on DSM objects, though relatedly, \textit{Resilient Distributed Datasets (RDD)} \cite{Zaharia_etal.RDD.2012} also highlights the performance and flexibility benefits of opting for an immutable data representation over network-disaggregated memory when compared to contemporary DSM approaches.
\subsection{Coherence Protocol}
Coherence protocols hence become the means by which DSM systems implement their consistency model guarantees. As table \ref{table:consistency-vs-coherency} shows, DSM studies tend to implement write-invalidate coherence under a \textit{home-based} or \textit{directory-based} protocol framework, while a subset of DSM studies seek to reduce communication overheads and/or improve data persistence by offering write-update protocol extensions \cites{Carter_Bennett_Zwaenepoel.Munin.1991}{Shan_Tsai_Zhang.DSPM.2017}.
\subsubsection{Home-Based Protocols}
\textit{Home-based} protocols assign each shared memory object a corresponding ``home'' node, under the assumption that a many-node network would distribute home-node ownership of shared memory objects across all hosts \cite{Hu_Shi_Tang.JIAJIA.1999}. On top of home-node ownership, each mutable shared memory object may be additionally cached by other nodes within the network, creating the coherence problem. To our knowledge, in addition to table \ref{table:consistency-vs-coherency}, this protocol and its derivatives have been adopted by \cites{Fleisch_Popek.Mirage.1989}{Schaefer_Li.Shiva.1989}{Hu_Shi_Tang.JIAJIA.1999}{Nelson_etal.Grappa_DSM.2015}{Shan_Tsai_Zhang.DSPM.2017}{Endo_Sato_Taura.MENPS_DSM.2020}.
We identify that home-based protocols are conceptually straightforward compared to directory-based protocols, centering communication around the storage of global metadata (in this case, the ownership of each shared memory object). This leads to greater flexibility in implementing coherence protocols. A shared memory object may, at its creation, be made known globally via broadcast, or made known to only a subset of nodes (zero or more) via multicast. Likewise, metadata storage could be cached locally at each node and invalidated alongside object invalidation, or fetched from a fixed node per object. This implementation flexibility is further taken advantage of in \textit{Hotpot}\cite{Shan_Tsai_Zhang.DSPM.2017}, which refines the ``home node'' concept into the \textit{owner node} to provide replication and persistence, in addition to adopting a dynamic home protocol similar to that of \cite{Endo_Sato_Taura.MENPS_DSM.2020}.
\subsubsection{Directory-Based Protocols}
\textit{Directory-based} protocols instead take a shared-database approach, denoting each shared memory object with a globally shared entry describing ownership and sharing status. In its non-distributed form (e.g., \cite{Wang_etal.Concordia.2021}), a global, central directory is maintained for all nodes in the network for ownership information: the directory hence becomes a bottleneck, imposing latency and bandwidth constraints on parallel processing systems. Comparatively, a distributed directory scheme may delegate responsibilities across all nodes in the network, mostly in accordance with a sharded address space \cites{Hong_etal.NUMA-to-RDMA-DSM.2019}{Cai_etal.Distributed_Memory_RDMA_Cached.2018}. Though theoretically sound, this scheme performs no dynamic load-balancing for commonly shared memory objects, and in the worst case functions exactly like a non-distributed directory coherence scheme. To our knowledge, in addition to table \ref{table:consistency-vs-coherency}, this protocol and its derivatives have been adopted by \cites{Carter_Bennett_Zwaenepoel.Munin.1991}{Amza_etal.Treadmarks.1996}{Schoinas_etal.Sirocco.1998}{Eisley_Peh_Shang.In-net-coherence.2006}{Hong_etal.NUMA-to-RDMA-DSM.2019}.
\subsection{DMA and Cache Coherence}
The advent of high-speed RDMA-capable network interfaces introduces opportunities for designing more performant DSM systems over RDMA (as established in \ref{sec:msg-passing}). Fundamentally, RDMA-capable NICs perform direct memory access on main memory in order to achieve one-sided RDMA operations and to reduce the effect of OS jitter on RDMA latencies. For modern computer systems with cached multiprocessors, this poses a potential cache coherence problem at the local level: RDMA operations happen concurrently with memory accesses by CPUs, which keep copies of memory data in cache lines that may \cites{Kjos_etal.HP-HW-CC-IO.1996}{Ven.LKML_x86_DMA.2008} or may not \cites{Giri_Mantovani_Carloni.NoC-CC-over-SoC.2018}{Corbet.LWN-NC-DMA.2021} be kept coherent by the DMA mechanism, so any DMA operation performed by the RDMA NIC may be incoherent with the cached copy of the same data inside the CPU caches (as is the case for accelerators, etc.). This issue is of particular concern to the kernel development community, which needs to ensure that the behavior of DMA operations remains identical across architectures regardless of support for cache-coherent DMA \cite{Corbet.LWN-NC-DMA.2021}. Like existing RDMA implementations, which make heavy use of architecture-specific DMA memory allocation routines, implementing RDMA-based DSM systems in the kernel also requires careful use of the kernel API functions that ensure cache coherency as necessary.
\subsection{Cache Coherence in ARMv8-A}
We specifically focus on the implementation of cache coherence in ARMv8-A. Unlike x86 which guarantees cache-coherent DMA \cites{Ven.LKML_x86_DMA.2008}{Corbet.LWN-NC-DMA.2021}, the ARMv8-A architecture (and many other popular ISAs, for example \textit{RISC-V}) \emph{does not} guarantee cache-coherency of DMA operations across vendor implementations. ARMv8 defines a hierarchical model for coherency organization to support \textit{heterogeneous} and \textit{asymmetric} multi-processing systems \cite{ARM.ARMv8-A.v1.0.2015}.
\begin{definition}[cluster]
A \textit{cluster} defines a minimal cache-coherent region for Cortex-A53 and Cortex-A57 processors. Each cluster usually comprises one or more cores as well as a shared last-level cache.
\end{definition}
\begin{definition}[sharable domain]
A \textit{sharable domain} defines a vendor-defined cache-coherent region. Sharable domains can be \textit{inner} or \textit{outer}, which limits the scope of broadcast coherence messages to \textit{point-of-unification} and \textit{point-of-coherence}, respectively.
Usually, the \textit{inner} sharable domain defines the domain of all (closely-coupled) processors inside a heterogeneous multiprocessing system (see \ref{def:het-mp}); while the \textit{outer} sharable domain defines the largest memory-sharing domain for the system (e.g. inclusive of DMA bus).
\end{definition}
\begin{definition}[Point-of-Unification]\label{def:pou}
The \textit{point-of-unification} (\textit{PoU}) under ARMv8 defines a level of coherency such that all sharers inside the \textbf{inner} sharable domain see the same copy of data.
Consequently, \textit{PoU} defines a point at which every core of a ARMv8-A processor sees the same (i.e., a \emph{unified}) copy of a memory location regardless of accessing via instruction caches, data caches, or TLB.
\end{definition}
\begin{definition}[Point-of-Coherence]\label{def:poc}
The \textit{point-of-coherence} (\textit{PoC}) under ARMv8 defines a level of coherency such that all sharers inside the \textbf{outer} sharable domain see the same copy of data.
Consequently, \textit{PoC} defines a point at which all \textit{observers} (e.g., cores, DSPs, DMA engines) to memory will observe the same copy of a memory location.
\end{definition}
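These two coherency points are precisely what the architecture's data-cache maintenance instructions target. As a rough illustration (GCC inline assembly over a single cache line; a privileged EL1 context is assumed for the invalidate variant), the relevant instructions are \texttt{DC CVAC} (clean to PoC), \texttt{DC IVAC} (invalidate to PoC), and \texttt{DC CVAU} (clean to PoU):
\begin{minted}[linenos, bgcolor=code-bg]{c}
/* Illustrative wrappers over ARMv8-A data-cache maintenance by VA. */
static inline void clean_line_to_poc(void *va)
{
    asm volatile("dc cvac, %0" :: "r"(va) : "memory"); /* clean to PoC */
}

static inline void inval_line_to_poc(void *va)
{
    asm volatile("dc ivac, %0" :: "r"(va) : "memory"); /* invalidate to PoC (EL1) */
}

static inline void clean_line_to_pou(void *va)
{
    asm volatile("dc cvau, %0" :: "r"(va) : "memory"); /* clean to PoU */
}
\end{minted}
The kernel routines discussed later (\texttt{dcache\_inval\_poc}, \texttt{dcache\_clean\_poc}) loop such instructions over a virtual address range, followed by the appropriate barriers.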
\subsubsection{Addendum: \textit{Heterogeneous} \& \textit{Asymmetric} Multiprocessing}
Using these definitions, a vendor could build \textit{heterogeneous} and \textit{asymmetric} multi-processing systems as follows:
\begin{definition}[Heterogeneous Multiprocessing]\label{def:het-mp}
A \textit{heterogeneous multiprocessing} system incorporates ARMv8 processors of diverse microarchitectures that are fully coherent with one another, running the same system image.
\end{definition}
\begin{definition}[Asymmetric Multiprocessing]
A \textit{asymmetric multiprocessing} system needs not contain fully coherent processors. For example, a system-on-a-chip may contain a non-coherent co-processor for secure computing purposes \cite{ARM.ARMv8-A.v1.0.2015}.
\end{definition}
\subsection{ARMv8-A Software Cache Coherence in Linux Kernel}
\label{subsec:armv8a-swcoherency}
Because there is no architectural guarantee of hardware DMA coherency (though such support exists \cite{Parris.AMBA_4_ACE-Lite.2013}), programmers need to invoke architecture-specific cache-coherency instructions, often encapsulated in problem-specific subroutines, when porting DMA hardware support across a diverse range of ARMv8 microarchitectures.
Notably, kernel (driver) programming warrants programmer attention to software-maintained coherency when userspace programmers downstream expect data-flow, interspersed between CPU and DMA operations, to follow program ordering and (driver vendor) specifications. One such example arises in the Linux kernel implementation of DMA memory management API \cite{Miller_Henderson_Jelinek.Kernelv6.7-DMA_guide.2024}\footnote[1]{Based on Linux kernel v6.7.0.}:
\begin{definition}[DMA Mappings]
The Linux kernel DMA memory allocation API, imported via
\begin{minted}[linenos, bgcolor=code-bg]{c}
#include <linux/dma-mapping.h>
\end{minted}
defines two variants of DMA mappings:
\begin{itemize}
\item {\label{def:consistent-dma-map}
\textit{Consistent} DMA mappings:
They are guaranteed to be coherent in-between concurrent CPU/DMA accesses without explicit software flushing.
\footnote[2]{
However, it does not preclude CPU store reordering, so memory barriers remain necessary in a multiprocessing context.
}
}
\item {\label{def:streaming-dma-map}
\textit{Streaming} DMA mappings:
They provide no guarantee to coherency in-between concurrent CPU/DMA accesses. Programmers need to manually apply coherency maintenance subroutines for synchronization.
}
\end{itemize}
\end{definition}
Consistent DMA mappings can be trivially created by allocating non-cacheable memory, which guarantees \textit{PoC} for all memory observers (though system-specific fastpaths exist).
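In the kernel DMA API, consistent mappings are obtained via \texttt{dma\_alloc\_coherent}; a minimal usage sketch, assuming a valid \texttt{struct device *dev} is at hand:
\begin{minted}[linenos, bgcolor=code-bg]{c}
#include <linux/dma-mapping.h>

/* Sketch: allocate and free one page of consistent DMA memory for `dev`. */
static int example_consistent_mapping(struct device *dev)
{
    dma_addr_t dma_handle;
    void *cpu_addr = dma_alloc_coherent(dev, PAGE_SIZE, &dma_handle, GFP_KERNEL);

    if (!cpu_addr)
        return -ENOMEM;
    /* CPU accesses via cpu_addr and device DMA via dma_handle stay coherent
     * without explicit cache maintenance (barriers may still be required). */
    dma_free_coherent(dev, PAGE_SIZE, cpu_addr, dma_handle);
    return 0;
}
\end{minted}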
On the other hand, streaming DMA mappings require manual synchronization upon programmed CPU/DMA access. Take single-buffer synchronization on CPU after DMA access for example:
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
/* In kernel/dma/mapping.c $\label{code:dma_sync_single_for_cpu}$*/
void dma_sync_single_for_cpu(
struct device *dev, // kernel repr for DMA device
dma_addr_t addr, // DMA address
size_t size, // Synchronization buffer size
enum dma_data_direction dir // Data-flow direction -- see $\ref{appendix:enum_dma_data_direction}$
) {
/* Translate DMA address to physical address */
phys_addr_t paddr = dma_to_phys(dev, addr);
if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_cpu(paddr, size, dir);
arch_sync_dma_for_cpu_all(); // MIPS quirks, nop for ARM64
}
/* Miscellaneous cases...*/
}
\end{minted}
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
/* In arch/arm64/mm/dma-mapping.c */
void arch_sync_dma_for_cpu(
phys_addr_t paddr,
size_t size,
enum dma_data_direction dir
) {
/* Translate physical address to (kernel) virtual address */
unsigned long start = (unsigned long)phys_to_virt(paddr);
/* Early exit for DMA read: no action needed for CPU */
if (dir == DMA_TO_DEVICE)
return;
/* ARM64-specific: invalidate CPU cache to PoC */
dcache_inval_poc(start, start + size);
}
\end{minted}
This call-chain, together with its mirror case which maintains cache coherency for the DMA device after CPU access: \mint[breaklines=true]{c}|dma_sync_single_for_device(struct device *, dma_addr_t, size_t, enum dma_data_direction)|, calls into the following procedures, respectively:
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
/* Exported @ arch/arm64/include/asm/cacheflush.h */
/* Defined @ arch/arm64/mm/cache.S */
/* All functions accept virtual start, end addresses. */
/* Invalidate data cache region [start, end) to PoC.
*
* Invalidate CPU cache entries that intersect with [start, end),
* such that data from external writers becomes visible to CPU.
*/
extern void dcache_inval_poc(
unsigned long start, unsigned long end
);
/* Clean data cache region [start, end) to PoC. $\label{code:dcache_clean_poc}$
*
* Write-back CPU cache entries that intersect with [start, end),
* such that data from CPU becomes visible to external writers.
*/
extern void dcache_clean_poc(
unsigned long start, unsigned long end
);
\end{minted}
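For completeness, the device-direction mirror on ARM64 reduces to a cache clean to \textit{PoC}; a simplified sketch of the corresponding routine follows:
\begin{minted}[linenos, bgcolor=code-bg]{c}
/* In arch/arm64/mm/dma-mapping.c (simplified) */
void arch_sync_dma_for_device(
    phys_addr_t paddr,
    size_t size,
    enum dma_data_direction dir
) {
    unsigned long start = (unsigned long)phys_to_virt(paddr);
    /* ARM64-specific: write back (clean) CPU cache lines to PoC so that
     * the device observes the CPU's latest stores */
    dcache_clean_poc(start, start + size);
}
\end{minted}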
\subsubsection{Use-case: Kernel-space \textit{SMBDirect} Driver}
An example of cache-coherent in-kernel RDMA networking module over heterogeneous ISAs could be found in the Linux implementation of \textit{SMBDirect}. \textit{SMBDirect} is an extension of the \textit{SMB} (\textit{Server Message Block}) protocol for opportunistically establishing the communication protocol over RDMA-capable network interfaces \cite{many.MSFTLearn-SMBDirect.2024}.
We focus on two procedures inside the in-kernel SMBDirect implementation:
\paragraph*{Before send: \texttt{smbd\_post\_send}}
\texttt{smbd\_post\_send} is a function downstream of the call-chain of \texttt{smbd\_send}, which sends SMBDirect payloads for transport over the network. Payloads are constructed and batched for maximized bandwidth; \texttt{smbd\_send} then calls \texttt{smbd\_post\_send} to signal the RDMA NIC for transport.
The function body is roughly as follows:
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
/* In fs/smb/client/smbdirect.c */
static int smbd_post_send(
struct smbd_connection *info, // SMBDirect transport context
struct smbd_request *request // SMBDirect request context
) {
struct ib_send_wr send_wr; // IB send "Work Request" for the payload
int rc, i;
/* For each message in batched payload */
for (i = 0; i < request->num_sge; i++) {
/* Log to kmesg ring buffer... */
/* RDMA wrapper over DMA API$\ref{code:dma_sync_single_for_cpu}$ $\label{code:ib_dma_sync_single_for_device}$*/
ib_dma_sync_single_for_device(
info->id->device, // struct ib_device *
request->sge[i].addr, // u64 (as dma_addr_t)
request->sge[i].length, // size_t
DMA_TO_DEVICE, // enum dma_data_direction
);
}
/* Populate `request`, `send_wr`... */
rc = ib_post_send(
info->id->qp, // struct ib_qp * ("Queue Pair")
&send_wr, // const struct ib_send_wr *
NULL, // const struct ib_send_wr ** (bad WR, for err handling)
);
/* Error handling... */
return rc;
}
\end{minted}
Line \ref{code:ib_dma_sync_single_for_device} writes back CPU cache lines so that they are visible to the RDMA NIC, in preparation for the DMA operations performed when the posted \textit{send request} is worked upon.
\paragraph*{Upon reception: \texttt{recv\_done}}
\texttt{recv\_done} is called when the RDMA subsystem completes work on a payload received over RDMA.
Mirroring the case for \texttt{smbd\_post\_send}, it invalidates CPU cache lines for DMA-ed data to be visible at CPU cores prior to any operations on received data:
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
/* In fs/smb/client/smbdirect.c */
static void recv_done(
struct ib_cq *cq, // "Completion Queue"
struct ib_wc *wc // "Work Completion"
) {
struct smbd_data_transfer *data_transfer;
struct smbd_response *response = container_of(
wc->wr_cqe, // ptr: pointer to member
struct smbd_response, // type: type of container struct
cqe, // name: name of member in struct
); // Cast member of struct into containing struct (C magic)
struct smbd_connection *info = response->info;
int data_length = 0;
/* Logging, error handling... */
/* Likewise, RDMA wrapper over DMA API$\ref{code:dma_sync_single_for_cpu}$ */
ib_dma_sync_single_for_cpu(
wc->qp->device,
response->sge.addr,
response->sge.length,
DMA_FROM_DEVICE,
);
/* ... */
}
\end{minted}
\chapter{Software Coherency Latency} \label{chapter:sw-coherency}
Coherency must be maintained at the software level when hardware cache coherency cannot be guaranteed for a specific ISA (as established in subsection \ref{subsec:armv8a-swcoherency}). There is, therefore, interest in knowing the latency of coherence-maintenance operations for performance engineering purposes, for example OS jitter analysis for scientific computing in heterogeneous clusters and, more pertinently, comparative analysis between software and hardware-backed DSM systems (e.g., \cites{Masouros_etal.Adrias.2023}{Wang_etal.Concordia.2021}). Such an analysis is crucial for making well-informed decisions when designing a cross-architectural DSM system over RDMA.
The purpose of this chapter is hence to provide a statistical analysis of software coherency latency in ARM64 systems, obtained by instrumenting hypothetical scenarios of software-initiated coherence maintenance on ARM64 test-benches.
The rest of the chapter is structured as follows:
\begin{itemize}
\item {
\hyperref[sec:sw-coherency-setup]{\textbf{Experiment Setup}} covers the test-benches used for instrumentation, including the kernel version, distribution, and the specifications of the instrumented (bare-metal/virtual) machine.
}
\item {
\hyperref[sec:sw-coherency-method]{\textbf{Methodology}} covers the kernel module and workload used for instrumentation and experimentation, including changes made to the kernel, the kernel module, and userspace programs used for experimentation.
}
\item {
\hyperref[sec:sw-coherency-results]{\textbf{Results}} covers the results gathered during instrumentation from various test-benches, segmented by experiment.
}
\item {
\hyperref[sec:sw-coherency-discuss]{\textbf{Discussion}} identifies key insights from experimental results, as well as deficiencies in research method and possible directions of future works.
}
\end{itemize}
\section{Experiment Setup}\label{sec:sw-coherency-setup}
\subsection{QEMU-over-x86: \texttt{star}}\label{subsec:spec-star}
The primary source of experimental data comes from a virtualized machine: a QEMU guest running a lightly-modified Linux v6.7.0 preemptive kernel, with a standard non-graphical Debian 12 installation providing userspace support. Table \ref{table:star} describes the specifics of the QEMU-emulated ARM64 test-bench, while table \ref{table:starhost} describes the specifics of its host.
\begin{table}[h]
\centering
\begin{tabular}{|c|c|}
\hline
Processors & QEMU virt-8.2 (3 $\times$ 2-way SMT; emulates Cortex-A76) \\
\hline
Frequency & 2.0 GHz (\textit{sic.}\footnotemark[3]) \\
\hline
CPU Flags &
\begin{tabular}{@{}cccccc@{}}
% 1 2 3 4 5 6
fp & asimd & evtstrm & aes & pmull & sha1 \\
sha2 & crc32 & atomics & fphp & asimdhp & cpuid \\
asimdrdm & lrcpc & dcpop & asimddp & & \\
\end{tabular} \\
\hline
NUMA Topology & 1: $\{P_0,\ \dots,\ P_5\}$ \\
\hline
Memory & 1: 4GiB \\
\hline
Kernel & Linux 6.7.0 (modified) SMP Preemptive \\
\hline
Distribution & Debian 12 (bookworm) \\
\hline
\end{tabular}
\caption{Specification of \texttt{star}}
\label{table:star}
\end{table}
\footnotetext[3]{As reported from \texttt{lscpu}. Likely not reflective of actual emulation performance.}
\begin{table}[h]
\centering
\begin{tabular}{|c|c|}
\hline
Processors & AMD Ryzen 7 4800HS (8 $\times$ 2-way SMT) \\
\hline
Frequency & 2.9 GHz (4.2 GHz Turbo) \\
\hline
NUMA Topology & 1: $\{P_0,\ \dots,\ P_{15}\}$ \\
\hline
Cache Structure &
\begin{tabular}{@{}c|c@{}}
L3 & $P_0 \dots P_7$: 4MiB, $P_8 \dots P_{15}$: 4MiB \\
L2 & Per core\footnotemark[4]: 512KiB \\
L1 & Per core: d-cache 32KiB, i-cache 32KiB \\
\end{tabular} \\
\hline
Memory & 1: 40 GiB DDR4-3200 SO-DIMM \\
\hline
Filesystem & ext4 on Samsung SSD 970 EVO Plus \\
\hline
Kernel & Linux 6.7.9 (arch1-1) SMP Preemptive \\
\hline
Distribution & Arch Linux \\
\hline
\end{tabular}
\caption{Specification of Host}
\label{table:starhost}
\end{table}
\footnotetext[4]{i.e., per 2 threads. For example, $P_0$ and $P_1$ comprise one core.}
\subsection{\textit{Ampere Altra}: \texttt{rose}}\label{subsec:spec-rose}
\begin{table}[H] % suboptimal, but otherwise gets placed in next sec...
\centering
\begin{tabular}{|c|c|}
\hline
Processors & Ampere Altra (32 core; Neoverse N1 microarch.) \\
\hline
Frequency & 1.7 GHz (3.0 GHz max) \\
\hline
NUMA Topology & 1: $\{P_0,\ \dots,\ P_{31}\}$ \\
\hline
Cache Structure &
\begin{tabular}{@{}c|c@{}}
L2 & Per core: 1MiB \\
L1 & Per core: d-cache 64KiB, i-cache 64KiB \\
\end{tabular} \\
\hline
Memory & 1: 256 GiB DDR4-3200 DIMM ECC \\
\hline
Kernel & Linux 6.7.0 (modified) SMP Preemptive \\
\hline
Distribution & Ubuntu 22.04 LTS (Jammy Jellyfish) \\
\hline
\end{tabular}
\caption{Specification of \texttt{rose}}
\label{table:rose}
\end{table}
In addition to the virtualized test-bench, I have had the honor of accessing \texttt{rose}, an ARMv8 server rack system hosted by the \href{https://systems-nuts.com}{\textit{Systems Nuts Research Group}} at the \textit{Informatics Forum}, through the invaluable assistance of my primary advisor, \textit{Amir Noohi}, in order to instrument similar experimental setups on a server-grade bare-metal system.
The specifications of \texttt{rose} are listed in table \ref{table:rose}.
\section{Methodology}\label{sec:sw-coherency-method}
\subsection{Exporting \texttt{dcache\_clean\_poc}}
As established in subsection \ref{subsec:armv8a-swcoherency}, software cache-coherence maintenance operations (e.g., \texttt{dcache\_[clean|inval]\_poc}) are wrapped behind DMA API function calls and are hence unavailable for direct use in drivers. Moreover, instrumenting assembly code is non-trivial compared to instrumenting C function symbols, likely because the assembly symbols are automatically stripped from the resulting object files. Consequently, the existing instrumentation tools available in the Linux kernel (e.g., \texttt{ftrace}) cannot be used to trace these assembly routines directly.
In order to convert \texttt{dcache\_clean\_poc} to a traceable equivalent, a wrapper function \texttt{\_\_dcache\_clean\_poc} is created as follows:
\begin{minted}[mathescape, linenos, bgcolor=code-bg]{c}
/* In arch/arm64/mm/flush.c */
#include <asm/cacheflush_extra.h>
/* ... */
void __dcache_clean_poc(ulong start, ulong end)
{
dcache_clean_poc(start, end); // see $\hyperref[code:dcache_clean_poc]{\texttt{arch/arm64/mm/cache.S}}$
}
EXPORT_SYMBOL(__dcache_clean_poc);
\end{minted}
Correspondingly, the header \texttt{arch/arm64/include/asm/cacheflush\_extra.h} is created to export the symbol \texttt{\_\_dcache\_clean\_poc} into the kernel module namespace. This has the additional benefit of creating a corresponding \texttt{ftrace} target, allowing the symbol to be instrumented using existing Linux instrumentation mechanisms. The entirety of the modifications made to the in-tree v6.7.0 kernel amounts to a 44-line patch file (inclusive of metadata, context, etc.). The introduction of the additional symbol is expected to increase the measured function latency by (at least) the time needed to execute the extra call, but such latency should be minuscule compared to the cache coherency operations themselves.
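The header itself is minimal; a sketch of its plausible contents (reconstructed here, not quoted from the patch) is:
\begin{minted}[linenos, bgcolor=code-bg]{c}
/* arch/arm64/include/asm/cacheflush_extra.h -- sketch; only the exported
 * prototype of the traceable wrapper is required. */
#ifndef __ASM_CACHEFLUSH_EXTRA_H
#define __ASM_CACHEFLUSH_EXTRA_H

void __dcache_clean_poc(unsigned long start, unsigned long end);

#endif /* __ASM_CACHEFLUSH_EXTRA_H */
\end{minted}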
\subsection{Kernel Module: \texttt{my\_shmem}}
To simulate module-initiated cache coherence behavior over allocated kernel buffers, a kernel module, \texttt{my\_shmem}, is written such that specially-written userspace programs could cause the kernel to invoke \texttt{\_\_dcache\_clean\_poc} at will.
\subsubsection{\texttt{my\_shmem}: Design}
The \texttt{my\_shmem} module is a utility for (lazily) allocating one or more kernel-space pages, re-mapping them into userspace for read/write access, and invoking cache-coherency operations on unmap \emph{as if} the pages had been accessed via DMA.
To emulate \hyperref[def:streaming-dma-map]{streaming DMA mapping} allocation, the module is designed to allocate memory directly from the \textit{page allocator}, as required by the kernel documentation's guideline, \textit{What Memory is DMA'able?}\cite{Miller_Henderson_Jelinek.Kernelv6.7-DMA_guide.2024}:
\begin{displayquote}
If you acquired your memory via the page allocator (i.e. \texttt{\_\_get\_free\_page*()}) or the generic memory allocators (i.e. \texttt{kmalloc()} or \texttt{kmem\_cache\_alloc()}) then you may DMA to/from that memory using the addresses returned from those routines.
\end{displayquote}
To enable page sharing between user-space processes, the module implements an allocation accounting mechanism for re-mapping existing allocations into multiple user-space address spaces on demand. Specifically, this involves:
\begin{itemize}
\item {
Allocation of contiguous pages to some user-specified order (i.e., $2^{order}$ pages).
}
\item {
Correct re-mapping behavior of existing allocations, for example computing the correct offset when re-mapping a multi-page allocation during any given page-fault, which may not be aligned with the first page in the allocation.
}
\item {
Software cache coherency maintenance on removal of mapping from any user-space program. This is intended to simulate the behavior of DMA API in a system without any specific DMA hardware.
}
\end{itemize}
The module hence supports the following userspace workflow:
\begin{enumerate}
\item {
Open the ``device'' file as exposed by the kernel module.
}
\item {
\texttt{mmap} on the opened file descriptor, as per POSIX syscall API.
}
\item {
Allocate memory due to load/store actions within the \texttt{mmap}-ed memory mapping.
}
\item {
Close the memory mapping, which initiates a simulated software cache coherency maintenance operation.
}
\end{enumerate}
\subsubsection{\texttt{my\_shmem}: Implementation}
To implement the features as specified, \texttt{my\_shmem} exposes itself as a character device file \texttt{/dev/my\_shmem}; implements \textit{file operations} \texttt{open}, \texttt{mmap}, and \texttt{release}; and implements \textit{vm operations} \texttt{close} and \texttt{fault}.
Additionally, the parameter \texttt{max\_contiguous\_alloc\_order} is exposed as a writable parameter file inside \textit{sysfs} to manually control the number of contiguous pages allocated per module allocation.
The entire kernel module used for the experiments amounts to around 400 lines of kernel-space code.
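For orientation, a typical experimental session might proceed as sketched below; the module object name \texttt{my\_shmem.ko} and the userspace program name \texttt{shmem\_write} (cf. appendix \ref{appendix:userspace}) are hypothetical labels, not names fixed by the thesis artifacts.
\begin{minted}[bgcolor=code-bg]{sh}
$ sudo insmod my_shmem.ko     # exposes /dev/my_shmem
$ ./shmem_write 256           # mmap, write, then munmap 256 pages
$ sudo rmmod my_shmem         # release remaining allocations
\end{minted}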
\paragraph*{Data Structures} \label{para:data-structs}
The primary function of \texttt{my\_shmem} is to provide correct accounting of current allocations in addition to allocating on demand. Hence, to represent an in-kernel allocation of a multi-page contiguous buffer, define \texttt{struct my\_shmem\_alloc} as follows:
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
struct my_shmem_alloc {
struct page *page; // GFP alloc repr, points to HEAD page
ulong alloc_order; // alloc buffer length: $2^{\texttt{alloc\_order}}$
struct list_head list; // kernel repr of doubly linked list
};
\end{minted}
\texttt{.list} embeds the Linux kernel's representation of an element of a generically-typed doubly linked list, such that multiple allocations can be kept over the lifetime of the module. The corresponding linked list is defined as follows:
\begin{minted}[bgcolor=code-bg]{c}
static LIST_HEAD(my_shmem_allocs);
\end{minted}
To book-keep the total number of pages allocated during the module's lifetime, define:
\begin{minted}[bgcolor=code-bg]{c}
static size_t my_shmem_page_count;
\end{minted}
Finally, to ensure mutual exclusion of the module's critical sections while running inside an \textit{SMP} (\textit{Symmetric Multi-Processing}) kernel, define a mutex:
\begin{minted}[bgcolor=code-bg]{c}
static DEFINE_MUTEX(my_shmem_allocs_mtx);
\end{minted}
This protects all read/write operations to \texttt{my\_shmem\_allocs} and \texttt{my\_shmem\_page\_count} against concurrent module function calls.
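The listings that follow also use a module-local helper macro, \texttt{ORDER\_TO\_PAGE\_NR}, which is not part of the kernel API; a minimal sketch of its assumed definition, together with the locking pattern protecting the structures above (the function name is illustrative only), is:
\begin{minted}[linenos, bgcolor=code-bg]{c}
/* Assumed module-local helper: pages in an order-n allocation */
#define ORDER_TO_PAGE_NR(order) (1UL << (order))

/* Illustrative critical-section pattern over the structures above */
static size_t my_shmem_total_pages(void)
{
    size_t total = 0;
    struct my_shmem_alloc *curr;

    mutex_lock(&my_shmem_allocs_mtx);
    list_for_each_entry(curr, &my_shmem_allocs, list)
        total += ORDER_TO_PAGE_NR(curr->alloc_order);
    mutex_unlock(&my_shmem_allocs_mtx);

    return total;
}
\end{minted}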
\paragraph*{File Operations} \label{para:file_operations}
The Linux kernel defines \textit{file operations} as a series of module-specific callbacks invoked whenever userspace issues a corresponding syscall on the (character) device file. These callbacks may be declared inside a \texttt{file\_operations} struct\cite{Corbet_Rubini_K-Hartman.LDD3.2005}, which provides an interface for modules to file-related syscalls:
\begin{minted}[linenos, bgcolor=code-bg, mathescape]{c}
/* In include/linux/fs.h */
struct file_operations {
struct module *owner;
/* ... */
int (*mmap) (
struct file *, // opened (device) file
struct vm_area_struct * // kernel repr of mapping
); // Downstream of syscall: mmap
/* ... */
int (*open) (
struct inode *, // inode of file to be opened
struct file * // opened (generic) file
); // Downstream of libc: open
/* ... */
int (*release) (
struct inode *, // inode of file to be closed
struct file * // to be closed
); // Downstream of libc: close
/* ... */
} __randomize_layout;
\end{minted}
The corresponding structure for the particular module is hence defined as follows:
\begin{minted}[linenos, bgcolor=code-bg, mathescape]{c}
/* In my_shmem.c */
static const struct file_operations my_shmem_fops = {
.owner = THIS_MODULE,
.open = my_shmem_fops_open,
.mmap = my_shmem_fops_mmap,
.release = my_shmem_fops_release,
};
\end{minted}
Implementing \texttt{.open} is simple. It suffices to install the module-specific \texttt{struct file\_operations} (i.e., \texttt{my\_shmem\_fops}) into the \texttt{struct file} passed as argument, which is constructed downstream by the kernel's generic file-opening mechanism.
Likewise for \texttt{.release}, which does nothing except to print a debug message into the kernel ring buffer.
To implement \texttt{.mmap}, the kernel module attempts to \emph{re-map as many existing allocations into the given \texttt{struct vm\_area\_struct} as possible without making any new allocation}. This centralizes the allocation logic into the page fault handler, which is described later in \ref{para:vm_operations_struct}.
\begin{minted}[linenos, bgcolor=code-bg, mathescape]{c}
static int my_shmem_fops_mmap(
struct file *filp,
struct vm_area_struct *vma
) {
int ret = 0;
const ulong vma_pg_count =
(vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
struct page *pg;
ulong tgt_addr = vma->vm_start; // Current remap target addr
ulong src_head_pfn; // Current remap source: head PFN
ulong src_pg_nr; // Current remap source: length
    ulong vma_remainder_count = vma_pg_count; // vma: remain pgs
    ulong vma_pgoff = vma->vm_pgoff; // remaining offset (in pages)
/* Lock mutex... */
/* Iterate over allocations, remap as much as possible */
struct my_shmem_alloc *curr;
list_for_each_entry(curr, &my_shmem_allocs, list) {
/* exit if all of vma is mapped */
if (tgt_addr >= vma->vm_end)
break;
/* decrement page offset until alloc intersects */
if (vma_pgoff > ORDER_TO_PAGE_NR(curr->alloc_order)) {
vma_pgoff -= ORDER_TO_PAGE_NR(curr->alloc_order);
continue;
}
/* intersects, hence compute PFN to remap */
pg = curr->page;
get_page(pg); // increment alloc. refcount
src_head_pfn = page_to_pfn(pg) + vma_pgoff;
src_pg_nr = min(
vma_remainder_count,
ORDER_TO_PAGE_NR(curr->alloc_order) - vma_pgoff
);
ret = remap_pfn_range(
vma, // remap target VM area
tgt_addr, // page-aligned tgt addr
src_head_pfn, // kernel PFN as source
src_pg_nr * PAGE_SIZE, // size of remap region
            vma->vm_page_prot // page protection flags
);
/* if (ret): goto error handling... */
        /* Prepare for next iteration */
        tgt_addr += src_pg_nr * PAGE_SIZE;
        vma_remainder_count -= src_pg_nr;
        vma_pgoff = 0; // later allocs remap from their head page
}
/* return or error handling... */
}
\end{minted}
\paragraph*{VM Operations}\label{para:vm_operations_struct}
On \texttt{mmap}, the Linux kernel installs a new \textit{VMA} (\textit{Virtual Memory Area}) as the internal representation for the corresponding mapping in process address space\cite{Corbet_Rubini_K-Hartman.LDD3.2005}. Likewise file operations, kernel modules may implement callbacks in \texttt{vm\_operations\_struct} to define module-specific operations per VMA access at userspace:
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
/* In include/linux/mm.h */
struct vm_operations_struct {
/* ... */
void (*close)(struct vm_area_struct * area);
/* ... */
vm_fault_t (*fault)(
struct vm_fault *vmf // Page fault descriptor
); // Page fault handler
/* ... */
};
\end{minted}
The corresponding structure for the particular module is hence defined as follows:
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
/* In my_shmem.c */
static const struct vm_operations_struct my_shmem_vmops = {
.close = my_shmem_vmops_close,
.fault = my_shmem_vmops_fault,
};
\end{minted}
Function \texttt{.fault} is implemented such that allocations are performed lazily until the number of pages allocated inside the module exceeds the faulting page offset wrt. its mapping. A simple implementation is, whenever the number of pages allocated is insufficient to service the page fault, to keep allocating until this condition holds:
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
static vm_fault_t my_shmem_vmops_fault(struct vm_fault *vmf)
{
vm_fault_t ret = VM_FAULT_NOPAGE; // See $\ref{quirk:VM_FAULT_NOPAGE}$
ulong tgt_offset = vmf->vma->vm_pgoff + vmf->pgoff;
/* Lock mutex... */
for (;;) {
/* When we already allocated enough, remap */
if (tgt_offset < my_shmem_page_count)
return __my_shmem_fault_remap(vmf); // See $\ref{quirk:__my_shmem_fault_remap}$
/* Otherwise, allocate $2^{order}$ pages and retry */
struct my_shmem_alloc *new_alloc_handle = kzalloc(
sizeof(struct my_shmem_alloc),
        GFP_KERNEL // kernel-only allocation rule flag
);
/* if (!new_alloc_handle) goto error handling... */
struct page *new_alloc_pg = alloc_pages(
GFP_USER, // user-remapped kernel alloc rule flag
        max_contiguous_alloc_order
); // Alloc $2^{order}$ pages
/* if (!new_alloc_pg) goto error handling... */
/* Fill in handle data */
new_alloc_handle->page = new_alloc_pg;
new_alloc_handle->alloc_order = max_contiguous_alloc_order;
/* Add `new_alloc_handle` to `my_shmem_allocs`... */
/* Prepare for next iteration */
my_shmem_page_count +=
ORDER_TO_PAGE_NR(new_alloc_handle->alloc_order);
}
/* Error handling... */
}
\end{minted}
Several implementation quirks that warrant attention are as follows:
\begin{enumerate}
\item {\label{quirk:VM_FAULT_NOPAGE}
\texttt{my\_shmem\_vmops\_fault} returns \texttt{VM\_FAULT\_NOPAGE} on success. This is due to the need to support multi-page contiguous allocation inside the kernel module for performance analysis purposes.
Usually, the \texttt{vm\_operations\_struct} API expects its \texttt{.fault} implementations to assign \texttt{struct page *} to \texttt{vmf->page} on return. Here, \texttt{vmf->page} represents the page-aligned allocation that is to be installed into the faulting process's page table, thereby resolving the page fault.
However, this expectation causes a conflict between the module's ability to make multi-page contiguous allocations and its ability to perform page-granularity mapping of the underlying allocations (no matter the size of the allocation). Because the \textit{GFP} family of page allocators uses a single \texttt{struct page} as the representation of the \emph{entire} allocation (no matter the number of pages actually allocated), it is incorrect to install the \texttt{struct page} representation of a multi-page contiguous allocation for any given page fault when the page fault offset is misaligned with the alignment of the allocation (an example of such a case can be found in figure \ref{fig:misaligned-remap}).
\begin{figure}[h]
\centering
\includegraphics[scale=0.8]{graphics/tikz-misaligned-remap.pdf}
\caption{Misaligned Kernel Page Remap. Left column represents physical memory (addressed by PFN); center column represents in-module accounting of allocations; right column represents process address space.}
\label{fig:misaligned-remap}
\end{figure}
Consequently, \texttt{VM\_FAULT\_NOPAGE} is raised to indicate that \emph{\texttt{vmf->page} would not be assigned with a reasonable value, and the callee guarantees that corresponding page table entries would be installed when control returns to caller}. The latter guarantee is respected with the use of \texttt{remap\_pfn\_range}, which eventually calls into \texttt{remap\_pte\_range}, thereby modifying the page table.
}
\item {\label{quirk:__my_shmem_fault_remap}
\texttt{\_\_my\_shmem\_fault\_remap} serves as inner logic for when outer page fault handling (allocation) logic deems that a sufficient number of pages exist for handling the current page fault. As its name suggests, it finds and remaps the correct allocation into the page fault's parent VMA (assuming that such allocation, of course, exists).
The logic of this function is similar to \hyperref[para:file_operations]{\texttt{my\_shmem\_fops\_mmap}}. For a code excerpt listing, refer to appendix \ref{appendix:__my_shmem_fault_remap}.
}
\end{enumerate}
Function \texttt{.close} emulates synchronization behavior whenever a VMA is removed from a process's address space (e.g., due to \texttt{munmap}). Given a removed VMA as argument, it computes the intersecting allocations and invokes \hyperref[code:dcache_clean_poc]{\texttt{dcache\_clean\_poc}} on each such allocation. While this results in a conservative approximation of the cleaned cache entries, it is nevertheless good for instrumentation purposes, as the number of pages cleaned per invocation becomes invariant with respect to how the VMA was remapped -- a misaligned VMA will not result in fewer pages being flushed for a given allocation.
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
static void my_shmem_vmops_close(struct vm_area_struct *vma)
{
size_t vma_pg_count =
(vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
size_t vma_pg_off = vma->vm_pgoff;
/* Lock mutex... */
struct my_shmem_alloc *entry;
list_for_each_entry(entry, &my_shmem_allocs, list) {
const ulong entry_pg_count =
ORDER_TO_PAGE_NR(entry->alloc_order);
/* Loop till entry intersects with start of VMA */
if (vma_pg_off > entry_pg_count) {
vma_pg_off -= entry_pg_count;
continue;
}
/* All of VMA cleaned: exit */
if (!vma_pg_count)
break;
/* entry intersects with VMA -- emulate clean */
struct page *pg = entry->page;
ulong kvaddr_bgn = (ulong) page_address(pg);
ulong kvaddr_end =
kvaddr_bgn + entry_pg_count * PAGE_SIZE;
__dcache_clean_poc(kvaddr_bgn, kvaddr_end); // See $\ref{code:dcache_clean_poc}$
put_page(pg); // decrement refcount
/* Prepare for next iteration */
vma_pg_count -= min(
entry_pg_count - vma_pg_off,
vma_pg_count
);
if (vma_pg_off != 0) // ~ first intersection
vma_pg_off = 0;
}
/* cleanup... */
}
\end{minted}
\paragraph*{\textit{sysfs} Parameter} \label{para:sysfs-param}
Finally, \texttt{my\_shmem} exposes a tunable \textit{sysfs} parameter for adjusting the number of pages allocated per allocation in \texttt{my\_shmem\_vmops\_fault}. The parameter, \texttt{max\_contiguous\_alloc\_order}, defines the order $o$ for allocation from page allocator such that, for each allocation, $2^o$ contiguous pages are allocated at once.
To adjust the parameter (for example, set $o \leftarrow 2$), one may run as follows in a sh-compatible terminal:
\begin{minted}[bgcolor=code-bg]{sh}
$ echo 2 > \
/sys/module/my_shmem/parameters/max_contiguous_alloc_order
\end{minted}
Consequently, all allocations occurring after this change will be made with a 4-page contiguous granularity. Upon further testing, the maximum value allowed here is 10 (i.e., $2^{10} = 1024$ 4K pages).
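Assuming the parameter file is also created with read permission, the active value can be verified from the same \textit{sysfs} location:
\begin{minted}[bgcolor=code-bg]{sh}
$ cat /sys/module/my_shmem/parameters/max_contiguous_alloc_order
2
\end{minted}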
\subsection{Instrumentation: \texttt{ftrace} and \texttt{bcc-tools}}
We use two instrumentation frameworks to evaluate the latency of software-initiated coherency operations. \texttt{ftrace} is the primary kernel tracing mechanism across multiple (supporting) architectures, which supports both \textit{static} tracing of tracepoints and \textit{dynamic} tracing of function symbols:
\begin{itemize}
\item {
\textbf{Static} tracepoints describe tracepoints compiled into the Linux kernel. They are defined by kernel programmers and are otherwise known as \textit{event tracing}.
}
\item {
\textbf{Dynamic} \texttt{ftrace} support is enabled by the kernel modifying its own code to replace injected placeholder nop routines with calls into the \texttt{ftrace} infrastructure. This allows function tracing of all function symbols present in the C object files created for linkage. \cite{Rostedt.Kernelv6.7-ftrace.2023}
}
\end{itemize}
Because we do not inline \texttt{\_\_dcache\_clean\_poc}, we are able to include its symbol inside compiled C object files and hence expose its internals for dynamic tracing.
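As an illustration (the exact option spelling may vary across \texttt{trace-cmd} versions), the exported symbol can then be traced with the function-graph tracer while a userspace program (here the hypothetical \texttt{shmem\_write} from the sketch above) exercises the module:
\begin{minted}[bgcolor=code-bg]{sh}
$ sudo trace-cmd record -p function_graph \
    -l __dcache_clean_poc ./shmem_write 256
$ sudo trace-cmd report
\end{minted}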
\texttt{bcc-tools}, on the other hand, provides an array of handy instrumentation tools that are compiled just-in-time into \textit{BPF} programs and run inside an in-kernel virtual machine. A description of how BPF programs are parsed and run inside the Linux kernel is given in the kernel documentation \cite{N/A.Kernelv6.7-libbpf.2023}. The ability of \texttt{bcc}/\texttt{libbpf} programs to interface with both userspace and kernelspace function tracing mechanisms makes \texttt{bcc-tools} an ideal, easy-to-use tracing interface for both userspace and kernelspace tracing.
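For example, the \texttt{funclatency} tool from \texttt{bcc-tools} (packaged as \texttt{funclatency-bpfcc} on some distributions) can attach a kprobe to the exported symbol and report a latency histogram in microseconds:
\begin{minted}[bgcolor=code-bg]{sh}
$ sudo funclatency -u __dcache_clean_poc
\end{minted}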
\subsection{Userspace Programs}
Finally, two simple userspace programs are written to invoke the corresponding kernelspace callback operations -- namely, allocation and cleaning of kernel buffers to simulate DMA behavior. Each program simply \texttt{mmap}s the number of pages passed in as argument and either reads or writes the entirety of the buffer (which is what differentiates the two programs). A listing of their logic is given in appendix \ref{appendix:userspace}.
\section{Results}\label{sec:sw-coherency-results}
\subsection{Controlled Allocation Size; Variable Allocation Count}
Experiments are first conducted on software coherency operation latencies over variable \texttt{mmap}-ed memory area sizes while keeping the underlying allocation size at 4KiB (i.e., single-page allocations). All experiments are conducted on \texttt{star} over \texttt{mmap}-ed memory areas ranging from 16KiB to 1GiB, with the number of sampled coherency operations fixed at 1000. Data gathering is performed using the \texttt{trace-cmd} front-end for \texttt{ftrace}. The results of the experiments are shown in figure \ref{fig:coherency-op-per-page-alloc}.
\begin{figure}[p]
\centering
\begin{subfigure}{.8\textwidth}
\centering
\includegraphics[width=\textwidth]{graphics/out-95p-new.pdf}
\end{subfigure}
\begin{subfigure}{.8\textwidth}
\centering
\includegraphics[width=\textwidth]{graphics/out-log-new.pdf}
\end{subfigure}
\caption{Coherency operation latency. Allocation on per-page basis. Vertical lines represent 25th, 50th, and 75th percentiles respectively.}
\label{fig:coherency-op-per-page-alloc}
\end{figure}
Additionally, we obtain the latencies of TLB flushes caused by the userspace programs, as shown in figure \ref{fig:coherency-op-tlb}.
\begin{figure}[p]
\centering
\begin{subfigure}{.8\textwidth}
\centering
\includegraphics[width=\textwidth]{graphics/tlb-95p.pdf}
\end{subfigure}
\begin{subfigure}{.8\textwidth}
\centering
\includegraphics[width=\textwidth]{graphics/tlb-log.pdf}
\end{subfigure}
\caption{TLB operation latency. Allocation on per-page basis. Vertical lines represent 25th, 50th, and 75th percentiles respectively.}
\label{fig:coherency-op-tlb}
\end{figure}
\subsubsection*{Notes on Long-Tailed Distribution} \label{subsec:long-tailed}
We identify that a long-tailed distribution of latencies exists in both figures (\ref{fig:coherency-op-per-page-alloc}, \ref{fig:coherency-op-multi-page-alloc}). For software coherency operations, we identify this to be partially due to \textit{softirq} preemption (notably, RCU maintenance), which takes higher precedence than ``regular'' kernel routines. A brief description of the \textit{processor contexts} defined in the Linux kernel is given in appendix \ref{appendix:processor-context}.
For TLB operations, we identify the cluster of long-runtime TLB flush operations (e.g., around $10^4\,\mu$s) as interference from \texttt{mm} cleanup on process exit.
Moreover, latencies of software coherency operations are highly system-specific. On \texttt{rose}, similar experiments yield latencies roughly one tenth of those gathered on \texttt{star}, which (coincidentally) reduces the likelihood of long-tailed distributions forming due to RCU \textit{softirq} preemption.
\subsection{Controlled Allocation Count; Variable Allocation Size} \label{sec:experiment-var-alloc-cnt}
We also conduct experiments on software coherency operation latencies over fixed \texttt{mmap}-ed memory area sizes while varying the underlying allocation size. This is achieved by varying the allocation order: while a 0-order allocation allocates $2^0 = 1$ page per allocation, an 8-order allocation allocates $2^8 = 256$ contiguous pages per allocation. All experiments are conducted on \texttt{star}. The results for all experiments are gathered using \texttt{bcc-tools}, which provides utilities for injecting \textit{BPF}-based tracing routines. The results of these experiments are visualized in figure \ref{fig:coherency-op-multi-page-alloc}, with $N \ge 64$ per experiment.
\begin{figure}[p]
\centering
\includegraphics[width=.8\textwidth]{graphics/var_alloc_size.pdf}
\caption{Average coherency op latency of variable-order contiguous allocation.}
\label{fig:coherency-op-multi-page-alloc}
\end{figure}
\begin{table}[p]
\centering
\begin{tabular}{|c|c c c c|}
\hline
Order
& 25p
& 50p (Median)
& 75p
& 99p \\
\hline
0
& 5.968
& 9.808
& 15.808
& 58.464 \\
\hline
2
& 8.960
& 13.152
& 17.776
& 39.184 \\
\hline
4
& 19.216
& 21.120
& 23.648
& 123.984 \\
\hline
6
& 67.376
& 70.352
& 74.304
& 103.120 \\
\hline
8
& 278.784
& 303.136
& 324.048
& 1783.008 \\
\hline
10
& 1050.752
& 1141.312
& 1912.576
& 2325.104 \\
\hline
\end{tabular}
\caption{Coherency op latency of Variable-order Contiguous Allocation. Time listed in $\mu$s. $N = 100$ across allocation orders.}
\label{table:coherency-op-multi-page-alloc}
\end{table}
\section{Discussion}\label{sec:sw-coherency-discuss}
Figures \ref{fig:coherency-op-per-page-alloc} and \ref{fig:coherency-op-multi-page-alloc} show that, in general, coherency maintenance latency is \textbf{unrelated to the size of the mapped memory area} and \textbf{correlated with how large a single contiguous allocation is}. We especially note that the runtime of each software-initiated coherency maintenance operation \textbf{does not grow linearly with allocation size}. Given that both axes of figure \ref{fig:coherency-op-multi-page-alloc} are on a log scale, with the ``order'' axis interpretable as a ${\log}_2$ scale of the number of contiguous 4K pages, a perfectly linear correlation between allocation size and latency would appear as a roughly linear interpolation between the data points. This is clearly not the case in figure \ref{fig:coherency-op-multi-page-alloc}, where software coherency operation latency increases drastically once the order reaches 6 or above (i.e., 64 contiguous pages) but remains roughly comparable for smaller orders.
On the other hand, linearly increasing coherency operation latencies exhibited for higher-order allocations have their runtimes amortized by two factors:
\begin{enumerate}
\item { \label{factor:1}
Exponentially-decreasing number of buffers (allocations) made in the underlying kernel module, which corresponds to fewer memory allocation calls made during runtime.
}
\item { \label{factor:2}
Latency of contiguous allocation operations (i.e., \texttt{alloc\_pages}) \textbf{does not} grow significantly in relation to the size of the allocation.
}
\end{enumerate}
Due to both factors, it remains economical to make larger contiguous allocations for DMA pages that are subject to frequent cache coherency maintenance operations, rather than applying a ``scatter-gather'' paradigm to the underlying allocations.
\subsection{\textit{Hugepages} and RDMA-based DSM}
A \textit{hugepage} is an architectural feature that allows an aligned, larger-than-page-size contiguous memory region to be represented by a single TLB entry. x86-64, for example, supports (huge)pages of size 4KiB, 2MiB, or 1GiB \cite{N/A.Kernelv6.7-hugetlb.2023}. ARM64 supports a more involved implementation of TLB entries, allowing a wider range of page sizes to be represented by one TLB entry (up to 16GiB) \cite{N/A.Kernelv6.7-arm64-hugetlb.2023}. Hypothetically, using hugepages as the backing store for very large RDMA buffers reduces address translation overhead, either by relieving TLB pressure or through reduced page table indirections \cite{Yang_Izraelevitz_Swanson.FileMR-RDMA.2020}.
Specifically, the kernel developers identify the following factors that allow \textit{hugepages} to create faster large-working-set programs \cite{N/A.Kernelv6.7-transparent-hugepage.2023}:
\begin{enumerate}
\item {
TLB misses run faster.
}
\item {
A single TLB entry corresponds to a much larger section of virtual memory, thereby reducing miss rate.
}
\end{enumerate}
In general, performance-critical computing applications dealing with large memory working sets run on top of \textit{hugetlbfs} -- the hugepage mechanism exposed to userspace by the Linux kernel \cite{N/A.Kernelv6.7-transparent-hugepage.2023}. Alternatively, the use of hugepages can be dynamically and transparently enabled and disabled in userspace using \textit{transparent hugepages}, as supported by contemporary Linux kernels \cite{N/A.Kernelv6.7-transparent-hugepage.2023}. This enhances programmer productivity for userspace programs relying on a hypothetical \textit{transparent hugepage}-enabled in-kernel DSM system for heterogeneous data processing tasks on variable-sized buffers, though few in-kernel mechanisms actually incorporate \textit{transparent hugepage} support -- at the time of writing, only anonymous \textit{vma}s (e.g., stack, heap, etc.) and \textit{tmpfs/shmem} incorporate it \cite{N/A.Kernelv6.7-transparent-hugepage.2023}.
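For reference, explicit hugepage allocation is typically reached from userspace via \texttt{mmap} with the \texttt{MAP\_HUGETLB} flag (or via \textit{libhugetlbfs} wrappers). The sketch below is illustrative only: it assumes hugepages have been reserved beforehand (e.g., through \texttt{/proc/sys/vm/nr\_hugepages}) and that \texttt{len} is a multiple of the default hugepage size.
\begin{minted}[linenos, bgcolor=code-bg]{c}
#define _GNU_SOURCE
#include <stddef.h>
#include <sys/mman.h>

/* Illustrative: hugetlbfs-backed anonymous mapping */
static void *alloc_hugepage_buffer(size_t len)
{
    void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
                     -1, 0);
    return buf == MAP_FAILED ? NULL : buf;
}
\end{minted}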
We identify \textit{transparent hugepage} support as one possible direction for improving in-kernel DSM system performance. Traditionally, userspace programs that wish to allocate hugepages rely on \textit{libhugetlbfs} as the interface to the Linux kernel's \textit{hugetlbfs} mechanism. These techniques remain heavily reliant on programmer discretion, which is fundamentally at odds with what the parent project of this thesis envisions: a remote compute node is exposed as a DMA-capable accelerator to another, whereby two compute nodes can transparently perform computation on each other's memory via the heterogeneous memory management mechanism. Because this process is transparent to the userspace programmer (who only has access to, e.g., \texttt{/dev/my\_shmem}), the underlying kernel handler for \texttt{/dev/my\_shmem} should ideally abstract away the need for hugepages for very large allocations (since this is not handled by \textit{libhugetlbfs}). Furthermore, transparent hugepage support would hypothetically allow shared pages to be promoted and demoted at ownership transfer time, thereby allowing dynamically-grained memory sharing while maximizing address translation performance.
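For illustration, a userspace program can hint that an existing anonymous mapping should be backed by transparent hugepages via \texttt{madvise}; the helper name below is hypothetical, and the call only applies to the VMA types noted above.
\begin{minted}[linenos, bgcolor=code-bg]{c}
#include <stddef.h>
#include <sys/mman.h>

/* Illustrative: opt an anonymous mapping in to THP */
static int enable_thp(void *buf, size_t len)
{
    return madvise(buf, len, MADV_HUGEPAGE);
}
\end{minted}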
Further studies remain necessary to determine whether the use of (transparent) hugepages significantly benefits a real implementation of an in-kernel DSM system. The current implementation of \texttt{alloc\_pages} does not allow for the allocation of hugepages even when the allocation order is sufficiently large. Consequently, future studies need to examine alternative implementations that incorporate transparent hugepages into the DSM system. One candidate that could allow for hugepage allocation, for example, is to use \texttt{alloc\_pages\_mpol} directly instead of \texttt{alloc\_pages}, as is the case for the current implementation of \textit{shmem} in the kernel.
\subsection{Access Latency Post-\textit{PoC}}
This chapter solely explores latencies due to software cache coherency operations. In practice, it may be equally important to explore the latency incurred by read/write accesses after the \textit{PoC} is reached, which is almost always relevant for any interoperation between the CPU and DMA engines.
Recall from section \ref{subsec:armv8a-swcoherency} that ARMv8-A defines \textit{Points} of Coherency/Unification within its coherency domains. In practice, this often implies an actual, physical \emph{point} to which cached data is evicted:
\begin{itemize}
\item {
Consider an ARMv8-A system design with a shared L2/lowest-level cache that is also snooped by the DMA engine. Here, the \textit{Point-of-Coherency} could be defined as the shared L2 cache, to which higher-level cache entries are cleaned or invalidated.
}
\item {
Alternatively, a DMA engine may be capable of snooping all processor caches. The \textit{Point-of-Coherency} could then be defined merely as the L1 cache, with some overhead depending on how the DMA engine accesses these caches.
}
\end{itemize}
Further studies are necessary to examine the access latency after coherency maintenance operations on various ARMv8 systems, including access from the DMA engine vs. access from the CPU.
\subsection{Reflection}
We identify the following weaknesses in our experimental setup that undermine the generalizability of our work.
\paragraph*{What About \texttt{dcache\_inval\_poc}?} Due to time constraints, we were unable to explore the latencies posed by \texttt{dcache\_inval\_poc}, which is called whenever the DMA driver prepares the CPU to access data modified by the DMA engine. Further studies that expose \texttt{dcache\_inval\_poc} for similar instrumentation should be straightforward, as the necessary steps mirror those for \texttt{dcache\_clean\_poc} listed above.
\paragraph*{Do Instrumented Statistics Reflect Real Latency?} It remains debatable whether the method portrayed in section \ref{sec:sw-coherency-method}, specifically exporting \texttt{dcache\_clean\_poc} into the driver namespace as a traceable target, is a good candidate for instrumenting the ``actual'' latencies incurred by software coherency operations.
For one, we specifically opt not to disable IRQs when running \texttt{\_\_dcache\_clean\_poc}. This mirrors the implementation of \texttt{arch\_sync\_dma\_for\_cpu}, which:
\begin{enumerate}
\item {
is (at least) called under process context.
}
\item {
does not disable IRQ downstream.
}
\end{enumerate}
A similar context is also observed for upstream function calls, for example \\ \texttt{dma\_sync\_single\_for\_device}. As a consequence, kernel routines running in IRQ/\textit{softirq} contexts are capable of preempting the cache coherency operations, thereby delaying their completion. The effect of this on tail latencies has been discussed in section \ref{subsec:long-tailed}.
% [XXX] that \\ has to be here, else texttt simply refuses to wrap
On the other hand, it may be argued that analyzing software coherency operation latency at the hardware level better reveals the ``real'' latency incurred by coherency maintenance operations during runtime. Indeed, latencies of the \texttt{clflush} family of instructions on x86 chipsets, measured in clock cycles \cites{Kim_Han_Baek.MARF.2023}{Fog.Instr-table-x86.2018}, amount to around 250 cycles -- significantly less than microsecond-scale function call latencies for any GHz-capable CPU. We argue that because an in-kernel implementation of a DSM system would more likely call into the exposed driver API functions rather than individual instructions -- i.e., not write inline assembly that ``reinvents the wheel'' -- instrumenting the relatively low-level, synchronous procedure calls is more crucial than instrumenting individual instructions.
\paragraph*{Lack of Hardware Diversity} The majority of data gathered throughout the experiments comes from a single, virtualized setup, which may not be reflective of the real latencies incurred by software coherency maintenance operations. While similar experiments have been conducted on bare-metal systems such as \texttt{rose}, we note that \texttt{rose}'s \textit{Ampere Altra} is certified \textit{SystemReady SR} by ARM \cite{ARM.SystemReady_SR.2024} and hence supports hardware-coherent DMA access (by virtue of the \textit{ARM Server Base System Architecture}, which stipulates hardware-coherent memory access as implemented via the MMU) \cite{ARM.SBSAv7.1.2022}; it may therefore not be reflective of the real latencies incurred via coherency maintenance either.
On the other hand, we note that a growing number of non-hardware-coherent ARM systems with DMA-capable interfaces (e.g., PCIe) are quickly becoming mainstream. Newer generations of embedded SoCs are starting to feature PCIe interfaces as part of their I/O provisions, for example \textit{Rockchip}'s \textit{RK3588} \cite{Rockchip.RK3588.2022} and \textit{Broadcom}'s \textit{BCM2712} \cite{Raspi.Rpi5-datasheet.2023}, both of which have been selected for use in embedded and single-board systems, though (at the time of writing) with incomplete kernel support. Moreover, desktop-grade ARM CPUs and SoCs are also becoming increasingly common, spearheaded by \textit{Apple}'s \textit{M}-series processors as well as \textit{Qualcomm}'s equivalent products, all of which, to the author's knowledge, \textbf{do not} implement hardware coherence with their PCIe peripherals. Consequently, it is of interest to evaluate the performance of software-initiated cache coherency operations commonly applied in CPU-DMA interoperation on such non-\textit{SystemReady SR} systems.
Orthogonally, even though the emulated \textit{virt} platform does not explicitly support hardware-based cache coherency operations, the underlying implementation of its emulation on x86 hosts is not explored in this study. Because (as established) the x86 ISA guarantees DMA cache coherence in hardware, if no other constraints exist, it may be possible for a ``loose'' emulation of the ARMv8-A ISA to implement \textit{PoC} and \textit{PoU} operations as no-ops instead, though this theory cannot be ascertained without cross-referencing \textit{virt}'s source code. Figure \ref{fig:coherency-op-multi-page-alloc} also strongly disputes this theory, as a mapping from ARMv8-A \textit{PoC} instructions to x86 no-op instructions would be unlikely to produce latencies of differing magnitude over variable-sized contiguous allocations.
\paragraph*{Inconsistent Latency Magnitudes Across Experiments} We recognize that latencies differ over similar experimental setups between the two subsections of section \ref{sec:sw-coherency-results}. We strongly suspect that this is due to an uncontrolled power supply to the host machine, allowing the \textit{System Management Unit} of the host system to downclock or otherwise alter the performance envelope of the host CPU. A similar correlation between power source and CPU performance has been observed across different \textit{Zen 2} laptops \cite{Salter.AMD-Zen2-Boost-Delay.2020}. Though the reduced performance envelope results in worse ARM64 emulation performance, the relative performance observed in figures \ref{fig:coherency-op-per-page-alloc} and \ref{fig:coherency-op-multi-page-alloc} should still hold, as this power management quirk should cause a performance reduction of similar order of magnitude across instructions (in terms of latency via reduced clock frequencies). Nevertheless, further studies should rely on a controlled power source to eliminate variance caused by system power management functionalities.
\chapter{Conclusion}
This thesis hence accomplishes the following:
\begin{itemize}
\item {
It provides a timeline of developments in software distributed shared memory systems, from the early (but still inspiring) \textit{Munin} to contemporary developments enabled by RDMA hardware -- \textit{MENPS}. Using this timeline, it introduces a novel approach to DSM systems that takes a heterogeneous-multiprocessing view of the traditional DSM system problem, which serves as the rationale/context behind the primary contributions of this thesis.
}
\item {
It underscores the interaction between the two coherence ``domains''\footnotemark[5] relevant to a DSM system -- the larger domain (between different nodes in a DSM abstraction) depends on the correct behaviors of the smaller domain (within each node, between RDMA NIC and the CPU) to exhibit correct consistency behaviors with regards to the entire DSM system. From here, it focuses on cache coherence in ARMv8 ISA systems after establishing that x86-64 systems already define DMA as transparently cache coherent.
}
\item {
It describes the implementation of software-initiated ARMv8-A cache coherence operations inside the contemporary Linux kernel, which the thesis (and its contextual project) focuses on due to it being open-source and popular across all computing contexts. Specifically, it pinpoints the exact procedures relevant to cache coherence maintenance due to DMA in Linux kernel and explains its interoperation with the upstream DMA-capable hardware drivers.
}
\item {
It establishes a method to re-export architecture-specific assembly routines inside the Linux kernel as dynamically-traceable C symbols and constructs a kernel module wrapper to conduct a series of experiments that explore the relationship between software coherence routine latency and allocation size. From this, it establishes that the latency of such routines grows with the size of the memory subspace to be made coherent, but at a non-linear growth rate.
}
\end{itemize}
\footnotetext[5]{Not to be confused with ARM's definition of \textit{coherence domain} -- though theoretically similar. Here, a \textit{domain} refers to a level of abstraction where, given a set of nodes, each constituent node is \emph{internally} coherent as a whole but not guaranteed to be coherent with the others. The term is reused for lack of a better descriptor.}
\section{Future \& Unfinished Work}
The main contribution of this thesis has shifted significantly since the beginning of semester 1\footnotemark[6]. During this thesis's incubation, the following directions were explored, which the author hopes may serve as pointers for future contributions to the in-kernel, RDMA-based DSM system:
\footnotetext[6]{Educationally speaking it's, well, educative, but it would be a lie to claim this did not hurt morale.}
\paragraph*{Cache/Page Replacement Policies wrt. DSM Systems} Much like how this thesis proposed that \emph{2 coherence domains exist for a DSM system -- inter-node and intra-node}, the cache replacement problem also exhibits a (theoretical) duality:
\begin{itemize}
\item {
\textbf{Intra-node} cache replacement problem -- i.e., \emph{page replacement problem} inside the running OS kernel -- is made complex by the existence of remote ramdisks as possible swap target:
Consider, for example, that \texttt{kswapd} scans some page for replacement. We may instead establish swap files over RDMA-reachable resources such that, at placement time, we have the following options:
\begin{enumerate}
\item {
intra-node \texttt{zram}\footnotemark[7]
}
\item {
inter-node \texttt{zram} over RDMA
}
\item {
intra-node swapfile on-disk
}
\end{enumerate}
Consequently, even swapped page placement becomes an optimization problem! To the author's knowledge, the Linux kernel currently does not support dynamic selection of swap target -- a static ordering is defined inside \texttt{/etc/fstab}, instead.
}
\item {
\textbf{Inter-node} cache replacement problem, which arises because we may as well bypass \texttt{kswapd} altogether when pages are already transferred over the \textit{HMM} mechanism. This leads to one additional placement option during page replacement:
\begin{enumerate}
\setcounter{enumi}{3}
\item inter-node page transfer via DSM-on-RDMA
\end{enumerate}
Because of significant overhead incurred by the Linux swap mechanism, this option may likely be the most lightweight for working set optimization. Interoperation between this mechanism and existing \texttt{kswapd}, however, is non-trivial.
}
\end{itemize}
\footnotetext[7]{A compressed ramdisk abstraction in Linux. See \url{https://docs.kernel.org/admin-guide/blockdev/zram.html}}
\paragraph*{RDMA-Optimized Coherence Protocol} A coherence protocol design (and corresponding consistency model target) was drafted during this thesis's creation. However, designing a correct and efficient coherence protocol that takes advantage of RDMA's one-sided communication proved to be non-trivial. We identify, however, that \emph{single-writer protocols} model the device-CPU dichotomy of memory access well and ease protocol design significantly, as is the case for \textit{Hotpot} \cite{Shan_Tsai_Zhang.DSPM.2017}.
% \bibliographystyle{plain}
% \bibliographystyle{plainnat}
% \bibliography{mybibfile}
\printbibliography[heading=bibintoc]
% You may delete everything from \appendix up to \end{document} if you don't need it.
\appendix
% \chapter{Terminologies}
% This chapter provides a listing of all terminologies used in this thesis that may be of interest or warrant a quick-reference entry during reading.
\chapter{More on The Linux Kernel}
This chapter provides some extra background information on the Linux kernel that may have been mentioned or implied but bears insufficient significance to be explained in the \hyperref[chapter:background]{Background} chapter of this thesis.
\section{Processor Context}\label{appendix:processor-context}
The Linux kernel defines three contexts in which the CPU can be running at any time:
\begin{itemize}
\item Hardware Interrupt (IRQ)
\item Softirq / tasklet
\item Process context (userspace or kernelspace)
\end{itemize}
The ordering between the contexts is top-to-bottom: hardware interrupt code can preempt softirq or process context code, and softirq code can preempt only process context code.
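For reference, kernel code can query the context it currently executes in via helpers from \texttt{linux/preempt.h}; the sketch below is illustrative only (the exact set of helpers has shifted across kernel versions).
\begin{minted}[linenos, bgcolor=code-bg]{c}
#include <linux/preempt.h>
#include <linux/printk.h>

/* Illustrative: report the current processor context */
static void report_context(void)
{
    if (in_hardirq())
        pr_debug("hardware interrupt (IRQ) context\n");
    else if (in_serving_softirq())
        pr_debug("softirq context\n");
    else
        pr_debug("process context\n");
}
\end{minted}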
\section{\texttt{enum dma\_data\_direction}}\label{appendix:enum_dma_data_direction}
The Linux kernel defines 4 direction \texttt{enum} values for fine-tuning synchronization behaviors:
\begin{minted}[linenos, bgcolor=code-bg]{c}
/* In include/linux/dma-direction.h */
enum dma_data_direction {
    DMA_BIDIRECTIONAL = 0, // data transfer direction uncertain.
DMA_TO_DEVICE = 1, // data from main memory to device.
DMA_FROM_DEVICE = 2, // data from device to main memory.
DMA_NONE = 3, // invalid repr for runtime errors.
};
\end{minted}
These values allow certain fast paths to be taken at runtime. For example, asserting \texttt{DMA\_TO\_DEVICE} implies that the device reads data from memory without modifying it, which precludes software coherence instructions from being run when synchronizing for the CPU after the DMA operation.
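For instance, a driver synchronizing a streaming mapping for CPU access after a device-to-memory transfer would pass \texttt{DMA\_FROM\_DEVICE}; the helper below is a generic illustration of the DMA API call, not code taken from \texttt{my\_shmem}.
\begin{minted}[linenos, bgcolor=code-bg]{c}
#include <linux/dma-mapping.h>

/* Illustrative: hand a streaming mapping back to the CPU after
 * a device-to-memory transfer; on ARM64 this may invalidate
 * stale cache lines up to the PoC. */
static void sync_rx_buffer(struct device *dev, dma_addr_t handle,
                           size_t len)
{
    dma_sync_single_for_cpu(dev, handle, len, DMA_FROM_DEVICE);
}
\end{minted}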
\chapter{Extra}
This chapter provides a brief summary of some work that was done during the writing of the thesis, but the author decided against formal inclusion of into the submitted work.
\section{Listing: \texttt{\_\_my\_shmem\_fault\_remap}}\label{appendix:__my_shmem_fault_remap}
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
static vm_fault_t __my_shmem_fault_remap(struct vm_fault *vmf) {
vm_fault_t ret = VM_FAULT_NOPAGE;
const ulong fault_addr = vmf->address;
ulong remap_addr = fault_addr;
    const pgoff_t vma_pgoff = vmf->vma->vm_pgoff;
pgoff_t vmf_pgoff = vma_pgoff + vmf->pgoff;
/* either remap all alloced or remap entire vma */
ulong remaining_remappable_pgs = min(
my_shmem_page_count - vmf_pgoff,
vma_pgoff + NR_PAGE_OF_VMA(vmf->vma) - vmf_pgoff
);
struct my_shmem_alloc *curr;
pgoff_t curr_pg_off = 0; // `curr` as page ID
pgoff_t next_pg_off; // next of `curr` as page ID
list_for_each_entry(curr, &my_shmem_allocs, list) {
next_pg_off =
curr_pg_off + ORDER_TO_PAGE_NR(curr->alloc_order);
if (next_pg_off > vmf_pgoff) { // curr remappable
get_page(curr->page);
/* Compute head offset */
pgoff_t off_from_alloc_head = vmf_pgoff - curr_pg_off;
/* Compute nr of pages from head to remap */
ulong remap_range_pgs = min(
next_pg_off - curr_pg_off - off_from_alloc_head,
remaining_remappable_pgs
);
ulong remap_range_bytes = remap_range_pgs * PAGE_SIZE;
ulong remap_pfn =
page_to_pfn(curr->page) + off_from_alloc_head;
/* Remap */
int remap_ret = remap_pfn_range(
vmf->vma,
remap_addr,
remap_pfn,
remap_range_bytes,
                vmf->vma->vm_page_prot
);
/* if (remap_ret) goto error... */
            /* Prepare for next iteration */
            vmf_pgoff = next_pg_off;
            curr_pg_off = next_pg_off;
            remaining_remappable_pgs -= remap_range_pgs;
            remap_addr += remap_range_bytes;
            if (remaining_remappable_pgs == 0)
                break; /* goto ok... */
} else { // curr not in remap range
curr_pg_off = next_pg_off;
}
}
/* ... */
}
\end{minted}
\section{Listing: Userspace}\label{appendix:userspace}
\begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

#define DEVICE_PATH "/dev/my_shmem" // device exposed by my_shmem

int main(int argc, char *argv[]) {
/* Set write & alloc amount */
size_t page_count;
/* parse_argument(argc, argv, &page_count); */
const long PAGE_SIZE = sysconf(_SC_PAGESIZE);
const size_t WRITE_AMNT = PAGE_SIZE * page_count;
/* Open device file w/ RW perms */
FILE *fp = fopen(DEVICE_PATH, "r+");
/* if (!fp) error... */
int fd = fileno(fp);
/* if (fd == -1) error... */
/* mmap device */
void *buf = mmap(
NULL, // addr to map to
WRITE_AMNT, // size_t len
PROT_READ | PROT_WRITE, // int prot
MAP_SHARED, // int flags
fd, // int fildes
0 // off_t off
);
    /* if (buf == MAP_FAILED) error... */
/* Write to mmap-ed device */
char *curr_buf = buf;
    const unsigned char to_write[4] = {0xca, 0xfe, 0xbe, 0xef};
while (curr_buf < (char *)buf + WRITE_AMNT) {
memcpy(curr_buf, to_write, 4);
curr_buf += 4;
}
/* Unmap device */
munmap(buf, WRITE_AMNT);
/* Close device */
fclose(fp);
exit(EXIT_SUCCESS);
}
\end{minted}
% Any appendices, including any required ethics information, should be included
% after the references.
% Markers do not have to consider appendices. Make sure that your contributions
% are made clear in the main body of the dissertation (within the page limit).
% \chapter{Participants' information sheet}
% If you had human participants, include key information that they were given in
% an appendix, and point to it from the ethics declaration.
% \chapter{Participants' consent form}
% If you had human participants, include information about how consent was
% gathered in an appendix, and point to it from the ethics declaration.
% This information is often a copy of a consent form.
\end{document}