diff --git a/tex/draft/mybibfile.bib b/tex/draft/mybibfile.bib
index 37bdcb7..1cfc66a 100644
--- a/tex/draft/mybibfile.bib
+++ b/tex/draft/mybibfile.bib
@@ -621,3 +621,10 @@
 pages = {1--11},
 year = {2014}
 }
+
+@book{Corbet_Rubini_K-Hartman.LDD3.2005,
+  title = {Linux Device Drivers},
+  author = {Corbet, Jonathan and Rubini, Alessandro and Kroah-Hartman, Greg},
+  year = {2005},
+  publisher = {O'Reilly Media, Inc.}
+}
diff --git a/tex/draft/skeleton.pdf b/tex/draft/skeleton.pdf
index 6742133..46c6239 100644
Binary files a/tex/draft/skeleton.pdf and b/tex/draft/skeleton.pdf differ
diff --git a/tex/draft/skeleton.tex b/tex/draft/skeleton.tex
index 368fb37..2fbd2c5 100644
--- a/tex/draft/skeleton.tex
+++ b/tex/draft/skeleton.tex
@@ -21,12 +21,12 @@
 \usepackage{biblatex}
 \addbibresource{mybibfile.bib} % <- biblatex
-% -> nice definition listings
+% -> definition & quotes
 \usepackage{csquotes}
 \usepackage{amsthm}
 \theoremstyle{definition}
 \newtheorem{definition}{Definition}
-% <- definition
+% <- definition & quotes
 % -> code listing
 % [!] Requires external program: pypi:pygments
 \usepackage{minted}
@@ -364,7 +364,7 @@ Notably, kernel (driver) programming warrants programmer attention to software-m
 However, it does not preclude CPU store reordering, so memory barriers remain necessary in a multiprocessing context.
 }
 }
-    \item {
+    \item {\label{def:streaming-dma-map}
 \textit{Streaming} DMA mappings:
 They provide no guarantee of coherency between concurrent CPU/DMA accesses.
 Programmers need to manually apply coherency maintenance subroutines for synchronization.
@@ -597,7 +597,7 @@ The primary source of experimental data come from a virtualized machine: a virtu
 \centering
 \begin{tabular}{|c|c|}
 \hline
-    Processors & AMD Ryzen 7 4800HS (8-core, 2-way SMT) \\
+    Processors & AMD Ryzen 7 4800HS (8 $\times$ 2-way SMT) \\
 \hline
     Frequency & 2.9 GHz (4.2 GHz Turbo) \\
 \hline
@@ -670,7 +670,7 @@ In order to convert \texttt{dcache\_clean\_poc} to a traceable equivalent, a wra
 void __dcache_clean_poc(ulong start, ulong end)
 {
-    dcache_clean_poc(start, end); // see $\ref{code:dcache_clean_poc}$
+    dcache_clean_poc(start, end); // see $\hyperref[code:dcache_clean_poc]{\texttt{arch/arm64/mm/cache.S}}$
 }
 EXPORT_SYMBOL(__dcache_clean_poc);
 \end{minted}
@@ -683,7 +683,144 @@ To simulate module-initiated cache coherence behavior over allocated kernel buff
 \subsubsection{\texttt{my\_shmem}: Design}
 The \texttt{my\_shmem} module is a utility for (lazily) allocating one or more kernel-space pages, re-mapping them into userspace for reading/writing operations, and invoking cache-coherency operations \emph{as if} accessed via DMA on unmap.
+To emulate \hyperref[def:streaming-dma-map]{streaming DMA mapping} allocation, the module is designed to allocate memory directly from the \textit{page allocator}, as required by the kernel documentation's guideline, \textit{What Memory is DMA'able?}\cite{Miller_Henderson_Jelinek.Kernelv6.7-DMA_guide.2024}:
+\begin{displayquote}
+    If you acquired your memory via the page allocator (i.e. \texttt{\_\_get\_free\_page*()}) or the generic memory allocators (i.e. \texttt{kmalloc()} or \texttt{kmem\_cache\_alloc()}) then you may DMA to/from that memory using the addresses returned from those routines.
+\end{displayquote}
+
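+For illustration, an allocation satisfying this guideline could be obtained directly from the page allocator along the following lines (a minimal sketch; the helper name and error handling are illustrative, not the module's actual interface):
+\begin{minted}[linenos, bgcolor=code-bg, mathescape]{c}
+#include <linux/gfp.h>
+#include <linux/mm.h>
+
+/* Illustrative helper: obtain $2^{order}$ physically contiguous
+ * pages straight from the page allocator; per the guideline
+ * quoted above, the returned memory is DMA'able. */
+static void *illustrative_page_alloc(unsigned int order)
+{
+    struct page *pg = alloc_pages(GFP_KERNEL, order);
+
+    if (!pg)
+        return NULL; // out of memory
+    return page_address(pg); // kernel virtual address of head page
+}
+\end{minted}
+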
+To enable page sharing between user-space processes, the module implements an allocation-accounting mechanism for re-mapping existing allocations into multiple user-space address spaces on demand.
+Specifically, it involves:
+\begin{itemize}
+    \item {
+        Allocation of contiguous pages at a user-specified order (i.e., $2^{order}$ pages).
+    }
+    \item {
+        Correct re-mapping of existing allocations, e.g., computing the correct offset when re-mapping a multi-page allocation during any given page fault, whose faulting address need not coincide with the first page of the allocation.
+    }
+    \item {
+        Software cache-coherency maintenance upon removal of a mapping from any user-space program, intended to simulate the behavior of the DMA API on a system without dedicated DMA hardware.
+    }
+\end{itemize}
+
+The module should hence allow userspace programs to proceed as follows (sketched in the listing after this list):
+\begin{enumerate}
+    \item {
+        Open the ``device'' file as exposed by the kernel module.
+    }
+    \item {
+        \texttt{mmap} on the opened file descriptor, as per the POSIX syscall API.
+    }
+    \item {
+        Trigger memory allocation through load/store accesses within the \texttt{mmap}-ed memory mapping.
+    }
+    \item {
+        Close the memory mapping, which initiates a simulated software cache-coherency maintenance operation.
+    }
+\end{enumerate}
+
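+A minimal userspace sketch of this flow (the mapping length and access pattern are illustrative):
+\begin{minted}[linenos, bgcolor=code-bg, mathescape]{c}
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+int main(void)
+{
+    /* 1. Open the "device" file exposed by the module */
+    int fd = open("/dev/my_shmem", O_RDWR);
+    if (fd < 0)
+        return EXIT_FAILURE;
+
+    /* 2. mmap on the opened file descriptor */
+    size_t len = 4096; // one page, illustrative
+    char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
+                     MAP_SHARED, fd, 0);
+    if (buf == MAP_FAILED) {
+        close(fd);
+        return EXIT_FAILURE;
+    }
+
+    /* 3. Stores fault in (and thereby allocate) backing pages */
+    memset(buf, 0xA5, len);
+
+    /* 4. Closing the mapping triggers the module's simulated
+     *    cache-coherency maintenance */
+    munmap(buf, len);
+    close(fd);
+    return EXIT_SUCCESS;
+}
+\end{minted}
+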
 \subsubsection{\texttt{my\_shmem}: Implementation}
+To implement the features as specified, \texttt{my\_shmem} exposes itself as a character device file \texttt{/dev/my\_shmem}; implements the \textit{file operations} \texttt{open}, \texttt{mmap}, and \texttt{release}; and implements the \textit{vm operations} \texttt{close} and \texttt{fault}.
+
+Additionally, \texttt{max\_contiguous\_alloc\_order} is exposed as a writable parameter file inside \textit{sysfs} to manually control the number of contiguous pages allocated per module allocation.
+
+\paragraph{Static Data} \dots
+
+\paragraph{File Operations}
+The Linux kernel defines \textit{file operations} as a series of module-specific callbacks invoked whenever userspace issues a corresponding syscall on the (character) device file. These callbacks may be declared inside a \texttt{file\_operations} struct\cite{Corbet_Rubini_K-Hartman.LDD3.2005}, which provides modules an interface to file-related syscalls:
+\begin{minted}[linenos, bgcolor=code-bg, mathescape]{c}
+/* In include/linux/fs.h */
+struct file_operations {
+    struct module *owner;
+    /* ... */
+    int (*mmap) (
+        struct file *,          // opened (device) file
+        struct vm_area_struct * // kernel repr of mapping
+    ); // Downstream of syscall: mmap
+    /* ... */
+    int (*open) (
+        struct inode *, // inode of file to be opened
+        struct file *   // opened (generic) file
+    ); // Downstream of libc: open
+    /* ... */
+    int (*release) (
+        struct inode *, // inode of file to be closed
+        struct file *   // to be closed
+    ); // Downstream of libc: close
+    /* ... */
+} __randomize_layout;
+\end{minted}
+
+The corresponding structure for this particular module is hence defined as follows:
+\begin{minted}[linenos, bgcolor=code-bg, mathescape]{c}
+/* In my_shmem.c */
+static const struct file_operations my_shmem_fops = {
+    .owner = THIS_MODULE,
+    .open = my_shmem_fops_open,
+    .mmap = my_shmem_fops_mmap,
+    .release = my_shmem_fops_release,
+};
+\end{minted}
+
+Implementation of \texttt{.open} is simple. It suffices to install the module-specific \texttt{struct file\_operations} (i.e., \texttt{my\_shmem\_fops}) into the \texttt{struct file} passed as argument, which is constructed downstream by the kernel's generic file-opening mechanism.
+Likewise for \texttt{.release}, which does nothing except print a debug message into the kernel ring buffer.
+
+To implement \texttt{.mmap}, the kernel module attempts to \emph{re-map as many existing allocations into the given \texttt{struct vm\_area\_struct} as possible, without making any new allocation}. This centralizes allocation logic into the page fault handler, which is described later in \textcolor{red}{???}:
+\begin{minted}[linenos, bgcolor=code-bg, mathescape]{c}
+static int my_shmem_fops_mmap(
+    struct file *filp,
+    struct vm_area_struct *vma
+) {
+    int ret = 0;
+    const ulong vma_pg_count =
+        (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+    struct page *pg;
+    ulong vma_pgoff = vma->vm_pgoff; // vma start, in pages, into allocs
+    ulong tgt_addr = vma->vm_start;  // Current remap target addr
+    ulong src_head_pfn;              // Current remap source: head PFN
+    ulong src_pg_nr;                 // Current remap source: length
+    ulong vma_remainder_count = vma_pg_count; // vma: remaining pgs
+
+    /* Lock mutex... */
+    /* Iterate over allocations, remap as much as possible */
+    struct my_shmem_alloc *curr;
+    list_for_each_entry(curr, &my_shmem_allocs, list) {
+        /* exit if all of vma is mapped */
+        if (tgt_addr >= vma->vm_end)
+            break;
+
+        /* decrement page offset until an alloc intersects */
+        if (vma_pgoff >= ORDER_TO_PAGE_NR(curr->alloc_order)) {
+            vma_pgoff -= ORDER_TO_PAGE_NR(curr->alloc_order);
+            continue;
+        }
+
+        /* intersects, hence compute PFN to remap */
+        pg = curr->page;
+        get_page(pg); // increment alloc. refcount
+        src_head_pfn = page_to_pfn(pg) + vma_pgoff;
+        src_pg_nr = min(
+            vma_remainder_count,
+            ORDER_TO_PAGE_NR(curr->alloc_order) - vma_pgoff
+        );
+        ret = remap_pfn_range(
+            vma,                   // remap target VM area
+            tgt_addr,              // page-aligned tgt addr
+            src_head_pfn,          // kernel PFN as source
+            src_pg_nr * PAGE_SIZE, // size of remap region
+            vma->vm_page_prot      // page protection flags
+        );
+        /* if (ret): goto error handling... */
+        /* Prepare for next iteration */
+        tgt_addr += src_pg_nr * PAGE_SIZE;
+        vma_remainder_count -= src_pg_nr;
+        vma_pgoff = 0; // subsequent allocs map from their head page
+    }
+
+    /* return or error handling... */
+}
+\end{minted}
+
+\paragraph{VM Operations} \dots
+
+\paragraph{\textit{sysfs} Parameter} \dots
 \subsection{Instrumentation: \texttt{ftrace} and \textit{eBPF}}
 \subsection{Userspace Programs}