diff --git a/src/aarch64-linux-flush-dcache/visualizer/ftrace2pyplot/arguments.py b/src/aarch64-linux-flush-dcache/visualizer/ftrace2pyplot/arguments.py index f9fc0a8..a1e626e 100644 --- a/src/aarch64-linux-flush-dcache/visualizer/ftrace2pyplot/arguments.py +++ b/src/aarch64-linux-flush-dcache/visualizer/ftrace2pyplot/arguments.py @@ -32,4 +32,15 @@ parser.add_argument( parser.add_argument( "-i", "--interactive", type=bool, help="show pyplot when finished drawing", default=True +) + +# Customizability +parser.add_argument( + "--x-label", type=str, help="X axis label", default="Runtime (μs)" +) +parser.add_argument( + "--y-label", type=str, help="Y axis label", default="Write amount (KiB)" +) +parser.add_argument( + "--gen-title", type=bool, help="Generate title (legacy)", default=True ) \ No newline at end of file diff --git a/src/aarch64-linux-flush-dcache/visualizer/ftrace2pyplot/core.py b/src/aarch64-linux-flush-dcache/visualizer/ftrace2pyplot/core.py index 1f4f890..d68b351 100644 --- a/src/aarch64-linux-flush-dcache/visualizer/ftrace2pyplot/core.py +++ b/src/aarch64-linux-flush-dcache/visualizer/ftrace2pyplot/core.py @@ -3,6 +3,7 @@ from argparse import Namespace import glob from typing import List, Tuple, Union, Optional import re +import sys # stderr import trappy import pandas as pd @@ -92,9 +93,17 @@ def run(args: Namespace): if fngraph_df is None: print("No entry -- pass") continue - size_in_kb = parse_basename_to_kibs( - str(path.basename(ftrace_dat_path).split(".")[:-1])) - ftrace_dfs.append((size_in_kb, fngraph_df)) + try: + label = parse_basename_to_kibs( + path.basename(ftrace_dat_path).split(".")[:-1]) + except Exception: + label = str(path.basename(ftrace_dat_path)) + print( + f"Cannot parse size in basename '{label}', using basename...", + file=sys.stderr + ) + + ftrace_dfs.append((label, fngraph_df)) ftrace_dfs.sort(key=lambda tp: tp[0]) @@ -109,16 +118,27 @@ def run(args: Namespace): plot_df = pd.concat(series, axis=1, keys=indices) # In 
long-form print("Plot shape: {}; head: \n{}" .format(plot_df.shape, plot_df.head())) + print("Quantiles | 25p: \n{} | 50p: \n{} | 75p: \n{} | 99p: \n{}".format( + plot_df.quantile(.25, interpolation="nearest"), + plot_df.quantile(.50, interpolation="nearest"), + plot_df.quantile(.75, interpolation="nearest"), + plot_df.quantile(.99, interpolation="nearest"), + )) # plot violin plot fig, ax = plt.subplots(figsize=(12, 8)) sns.violinplot( data=plot_df, orient="h", fill=False, log_scale=args.use_log_scale, inner="quart") - ax.set_xlabel("Runtime (μs)") - ax.set_ylabel("Write amount (KiB)") - ax.set_title("Percentile: {}{}" - .format(args.percentile, ", log scale" if args.use_log_scale else "")) + ax.set_xlabel(args.x_label) + ax.set_ylabel(args.y_label) + if args.gen_title: + ax.set_title("Percentile: {}{}" + .format( + args.percentile, + ", log scale" if args.use_log_scale else "" + ) + ) fig.savefig(args.output) if args.interactive: diff --git a/src/aarch64-linux-flush-dcache/visualizer/out.pdf b/src/aarch64-linux-flush-dcache/visualizer/out.pdf index 4770386..a385cef 100644 Binary files a/src/aarch64-linux-flush-dcache/visualizer/out.pdf and b/src/aarch64-linux-flush-dcache/visualizer/out.pdf differ diff --git a/tex/draft/skeleton.pdf b/tex/draft/skeleton.pdf index a12292b..90f1062 100644 Binary files a/tex/draft/skeleton.pdf and b/tex/draft/skeleton.pdf differ diff --git a/tex/draft/skeleton.tex b/tex/draft/skeleton.tex index 4a7c2d7..7d8857e 100644 --- a/tex/draft/skeleton.tex +++ b/tex/draft/skeleton.tex @@ -548,7 +548,7 @@ static void recv_done( \end{minted} \chapter{Software Coherency Latency} -Coherency must be maintained at software level when hardware cache coherency cannot be guaranteed for some specific ISA (as established in subsection \ref{subsec:armv8a-swcoherency}). 
There is, therefore, interest in knowing the latency of coherence-maintenance operations for performance engineering purposes, for example OS jitter analysis for scientific computing in heterogeneous clusters and, more pertinently, comparative analysis between software and hardware-backed DSM systems (e.g. \cites{Masouros_etal.Adrias.2023}{Wang_etal.Concordia.2021}). +Coherency must be maintained at software level when hardware cache coherency cannot be guaranteed for some specific ISA (as established in subsection \ref{subsec:armv8a-swcoherency}). There is, therefore, interest in knowing the latency of coherence-maintenance operations for performance engineering purposes, for example OS jitter analysis for scientific computing in heterogeneous clusters and, more pertinently, comparative analysis between software and hardware-backed DSM systems (e.g. \cites{Masouros_etal.Adrias.2023}{Wang_etal.Concordia.2021}). Such an analysis is crucial to being well-informed when designing a cross-architectural DSM system over RDMA. The purpose of this chapter is hence to provide a statistical analysis over software coherency latency in ARM64 systems by instrumenting hypothetical scenarios of software-initiated coherence maintenance in ARM64 test-benches. @@ -732,6 +732,8 @@ To implement the features as specified, \texttt{my\_shmem} exposes itself as a c Additionally, the parameter \texttt{max\_contiguous\_alloc\_order} is exposed as a writable parameter file inside \textit{sysfs} to manually control the number of contiguous pages allocated per module allocation. +The entire kernel module used for the experiment amounts to around 400 lines of kernel-space code. + \paragraph*{Data Structures} \label{para:data-structs} The primary functions of \texttt{my\_shmem} is to provide correct accounting of current allocations via the kernel module in addition to allocating on-demand. 
Hence, to represent a in-kernel allocation of multi-page contiguous buffer, define \texttt{struct my\_shmem\_alloc} as follows: \begin{minted}[linenos, mathescape, bgcolor=code-bg]{c} @@ -1067,6 +1069,57 @@ We also conduct experiments over software coherency operations latencies over fi \label{fig:coherency-op-multi-page-alloc} \end{figure} +\begin{table}[h] + \centering + \begin{tabular}{|c|c c c c|} + \hline + Order + & 25p + & 50p (Median) + & 75p + & 99p \\ + \hline + 0 + & 5.968 + & 9.808 + & 15.808 + & 58.464 \\ + \hline + 2 + & 8.960 + & 13.152 + & 17.776 + & 39.184 \\ + \hline + 4 + & 19.216 + & 21.120 + & 23.648 + & 123.984 \\ + \hline + 6 + & 67.376 + & 70.352 + & 74.304 + & 103.120 \\ + \hline + 8 + & 278.784 + & 303.136 + & 324.048 + & 1783.008 \\ + \hline + 10 + & 1050.752 + & 1141.312 + & 1912.576 + & 2325.104 \\ + \hline + \end{tabular} + \caption{Coherency op latency of Variable-order Contiguous Allocation. Time listed in $\mu$s. $N = 100$ across allocation orders.} + \label{table:coherency-op-multi-page-alloc} +\end{table} + \section{Discussion}\label{sec:sw-coherency-discuss} Figures \ref{fig:coherency-op-per-page-alloc}, \ref{fig:coherency-op-multi-page-alloc} exhibits that, in general, coherency maintenance operation is \textbf{unrelated with the size of the mapped memory area} and \textbf{correlated with how large a single contiguous allocation is made}. We especially note that the runtime of each software-initiated coherency maintenance operation \textbf{does not grow linearly with allocation size}. Given that both axis of figure \ref{fig:coherency-op-multi-page-alloc} is on a log-scale, with the ``order'' axis interpretable as a ${\log}_2$ scale of number of contiguous 4K pages, a perfect linear correlation between allocation size and latency would see a roughly linear interpolation between the data points. 
This is obviously not the case for figure \ref{fig:coherency-op-multi-page-alloc}, which sees software coherency operation latency increasing drastically once order $\ge$ 6 (i.e., 64 contiguous pages), but remain roughly comparable for smaller orders. @@ -1117,6 +1170,13 @@ Recall from section \ref{subsec:armv8a-swcoherency} that ARMv8-A defines \textit Further studies are necessary to examine the latency after coherency maintenance operations on ARMv8 architectures on various systems, including access from DMA engine vs. access from CPU, etc. \subsection{Reflection} +% Bad visualization work on 2, arguably more instructive to DSM design. This is because ex.2 is an afterthought to ex.1 and is conducted without sufficient time for proper data analysis -- ftrace takes time to analyze and visualize, notably. Maybe add a ftraced max-min etc. table! + +% Bad analysis on whether this really emulates anything. It may be of no significance right now (as we are solely concerned w/ software latency) + +% Should experiment over a variety of hardware. rose is system-ready which supports HW coherency, so prob. not reflective of anything real. Maybe take a raspberry pi now that they have PCIe. Regardless, ARM with PCIe without system-readiness is growing, so may be of more significance in future? + +% Note the difference in magnitudes in latency. This may be because of whether the laptop is plugged in or not. Admit your mistake and lament that you should really, really, really have used separate hardware with a reliable energy source for these data. Note on the other hand that the growth rate remains consistent whether plugged in or not. \chapter{Conclusion}