Expanded visualizer; filled table in report
commit 0c2c3a045a (parent 8e4c770ff6)
5 changed files with 99 additions and 8 deletions
@@ -33,3 +33,14 @@ parser.add_argument(
     "-i", "--interactive", type=bool, help="show pyplot when finished drawing",
     default=True
 )
+
+# Customizability
+parser.add_argument(
+    "--x-label", type=str, help="X axis label", default="Runtime (μs)"
+)
+parser.add_argument(
+    "--y-label", type=str, help="Y axis label", default="Write amount (KiB)"
+)
+parser.add_argument(
+    "--gen-title", type=bool, help="Generate title (legacy)", default=True
+)

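A side note on this hunk (an illustrative sketch, not part of the commit): argparse applies `type=bool` to the raw argument string, and `bool("False")` is `True`, so passing `--gen-title False` or `-i False` on the command line would still yield a truthy value. A common alternative is a real boolean flag, for example `BooleanOptionalAction` (Python 3.9+):

import argparse

parser = argparse.ArgumentParser()
# Hypothetical alternative to `type=bool`: this generates both --gen-title and
# --no-gen-title and parses them into a genuine boolean.
parser.add_argument(
    "--gen-title", action=argparse.BooleanOptionalAction, default=True,
    help="Generate title (legacy)",
)

args = parser.parse_args(["--no-gen-title"])
print(args.gen_title)  # False
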
@@ -3,6 +3,7 @@ from argparse import Namespace
 import glob
 from typing import List, Tuple, Union, Optional
 import re
+import sys  # stderr
 
 import trappy
 import pandas as pd

@@ -92,9 +93,17 @@ def run(args: Namespace):
         if fngraph_df is None:
             print("No entry -- pass")
             continue
-        size_in_kb = parse_basename_to_kibs(
-            str(path.basename(ftrace_dat_path).split(".")[:-1]))
-        ftrace_dfs.append((size_in_kb, fngraph_df))
+        try:
+            label = parse_basename_to_kibs(
+                path.basename(ftrace_dat_path).split(".")[:-1])
+        except Exception:
+            label = str(path.basename(ftrace_dat_path))
+            print(
+                f"Cannot parse size in basename '{label}', using basename...",
+                file=sys.stderr
+            )
+
+        ftrace_dfs.append((label, fngraph_df))
 
     ftrace_dfs.sort(key=lambda tp: tp[0])

@@ -109,16 +118,27 @@ def run(args: Namespace):
     plot_df = pd.concat(series, axis=1, keys=indices)  # In long-form
     print("Plot shape: {}; head: \n{}"
           .format(plot_df.shape, plot_df.head()))
+    print("Quantiles | 25p: \n{} | 50p: \n{} | 75p: \n{} | 99p: \n{}".format(
+        plot_df.quantile(.25, interpolation="nearest"),
+        plot_df.quantile(.50, interpolation="nearest"),
+        plot_df.quantile(.75, interpolation="nearest"),
+        plot_df.quantile(.99, interpolation="nearest"),
+    ))
 
     # plot violin plot
     fig, ax = plt.subplots(figsize=(12, 8))
     sns.violinplot(
         data=plot_df, orient="h", fill=False, log_scale=args.use_log_scale,
         inner="quart")
-    ax.set_xlabel("Runtime (μs)")
-    ax.set_ylabel("Write amount (KiB)")
-    ax.set_title("Percentile: {}{}"
-        .format(args.percentile, ", log scale" if args.use_log_scale else ""))
+    ax.set_xlabel(args.x_label)
+    ax.set_ylabel(args.y_label)
+    if args.gen_title:
+        ax.set_title("Percentile: {}{}"
+            .format(
+                args.percentile,
+                ", log scale" if args.use_log_scale else ""
+            )
+        )
     fig.savefig(args.output)
 
     if args.interactive:

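As a side note (an illustrative sketch, not part of the commit), the four quantile printouts in the hunk above could also be produced with a single DataFrame.quantile call taking a list of probabilities, which returns one row per percentile and is convenient for pasting into the report table. The column names and values below are dummies standing in for the script's plot_df:

import pandas as pd

# Hypothetical stand-in for plot_df; in the script this is the long-form
# DataFrame built via pd.concat(series, axis=1, keys=indices).
plot_df = pd.DataFrame({"order-0": [1.0, 2.0, 3.0, 4.0],
                        "order-2": [2.0, 4.0, 6.0, 8.0]})

# One call, one row per requested percentile.
summary = plot_df.quantile([.25, .50, .75, .99], interpolation="nearest")
print(summary.to_string())
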
Binary file not shown.

@@ -548,7 +548,7 @@ static void recv_done(
 \end{minted}
 
 \chapter{Software Coherency Latency}
-Coherency must be maintained at the software level when hardware cache coherency cannot be guaranteed for certain ISAs (as established in subsection \ref{subsec:armv8a-swcoherency}). There is, therefore, interest in knowing the latency of coherence-maintenance operations for performance engineering purposes, for example OS jitter analysis for scientific computing in heterogeneous clusters and, more pertinently, comparative analysis between software- and hardware-backed DSM systems (e.g. \cites{Masouros_etal.Adrias.2023}{Wang_etal.Concordia.2021}).
+Coherency must be maintained at the software level when hardware cache coherency cannot be guaranteed for certain ISAs (as established in subsection \ref{subsec:armv8a-swcoherency}). There is, therefore, interest in knowing the latency of coherence-maintenance operations for performance engineering purposes, for example OS jitter analysis for scientific computing in heterogeneous clusters and, more pertinently, comparative analysis between software- and hardware-backed DSM systems (e.g. \cites{Masouros_etal.Adrias.2023}{Wang_etal.Concordia.2021}). Such an analysis is crucial for making well-informed design decisions for a cross-architectural DSM system over RDMA.
 
 The purpose of this chapter is hence to provide a statistical analysis of software coherency latency on ARM64 systems by instrumenting hypothetical scenarios of software-initiated coherence maintenance on ARM64 test benches.
 

@@ -732,6 +732,8 @@ To implement the features as specified, \texttt{my\_shmem} exposes itself as a c
 
 Additionally, the parameter \texttt{max\_contiguous\_alloc\_order} is exposed as a writable parameter file inside \textit{sysfs} to manually control the number of contiguous pages allocated per module allocation.
 
+The entire kernel module used for the experiment amounts to around 400 lines of kernel-space code.
+
 \paragraph*{Data Structures} \label{para:data-structs}
 The primary function of \texttt{my\_shmem} is to provide correct accounting of current allocations via the kernel module, in addition to allocating on demand. Hence, to represent an in-kernel allocation of a multi-page contiguous buffer, define \texttt{struct my\_shmem\_alloc} as follows:
 \begin{minted}[linenos, mathescape, bgcolor=code-bg]{c}

@@ -1067,6 +1069,57 @@ We also conduct experiments over software coherency operations latencies over fi
 \label{fig:coherency-op-multi-page-alloc}
 \end{figure}
 
+\begin{table}[h]
+    \centering
+    \begin{tabular}{|c|c c c c|}
+        \hline
+        Order & 25p & 50p (Median) & 75p & 99p \\
+        \hline
+        0  & 5.968    & 9.808    & 15.808   & 58.464   \\
+        \hline
+        2  & 8.960    & 13.152   & 17.776   & 39.184   \\
+        \hline
+        4  & 19.216   & 21.120   & 23.648   & 123.984  \\
+        \hline
+        6  & 67.376   & 70.352   & 74.304   & 103.120  \\
+        \hline
+        8  & 278.784  & 303.136  & 324.048  & 1783.008 \\
+        \hline
+        10 & 1050.752 & 1141.312 & 1912.576 & 2325.104 \\
+        \hline
+    \end{tabular}
+    \caption{Coherency-op latency of variable-order contiguous allocations. Times listed in $\mu$s. $N = 100$ across allocation orders.}
+    \label{table:coherency-op-multi-page-alloc}
+\end{table}
+
 \section{Discussion}\label{sec:sw-coherency-discuss}
 Figures \ref{fig:coherency-op-per-page-alloc} and \ref{fig:coherency-op-multi-page-alloc} exhibit that, in general, coherency-maintenance operation latency is \textbf{unrelated to the size of the mapped memory area} and \textbf{correlated with how large a single contiguous allocation is}. We especially note that the runtime of each software-initiated coherency maintenance operation \textbf{does not grow linearly with allocation size}. Given that both axes of figure \ref{fig:coherency-op-multi-page-alloc} are on a log scale, with the ``order'' axis interpretable as a ${\log}_2$ scale of the number of contiguous 4K pages, a perfect linear correlation between allocation size and latency would appear as a roughly linear interpolation between the data points. This is clearly not the case in figure \ref{fig:coherency-op-multi-page-alloc}, which shows software coherency operation latency increasing drastically once order $\ge$ 6 (i.e., 64 contiguous pages) while remaining roughly comparable for smaller orders.
 

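A quick way to see the non-linearity claim quantitatively (an illustrative sketch based on the medians from the table in the hunk above; not part of the commit): if latency scaled linearly with the contiguous allocation size $s = 2^{\text{order}} \cdot 4\,\text{KiB}$, every increase of the order by 2 would quadruple the latency. The observed median ratios only approach that at the largest orders:

% Linear scaling t ∝ s = 2^{order} · 4 KiB would imply t(order+2) = 4 · t(order).
% Median ratios from table \ref{table:coherency-op-multi-page-alloc}:
%   order 0 -> 2 :   13.152 /   9.808 ≈ 1.34
%   order 2 -> 4 :   21.120 /  13.152 ≈ 1.61
%   order 4 -> 6 :   70.352 /  21.120 ≈ 3.33
%   order 6 -> 8 :  303.136 /  70.352 ≈ 4.31
%   order 8 -> 10: 1141.312 / 303.136 ≈ 3.77
% i.e. clearly sub-linear below order 6, and only approaching linear growth above it.
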
@@ -1117,6 +1170,13 @@ Recall from section \ref{subsec:armv8a-swcoherency} that ARMv8-A defines \textit
 Further studies are necessary to examine the latency after coherency maintenance operations on ARMv8 architectures on various systems, including access from DMA engine vs. access from CPU, etc.
 
 \subsection{Reflection}
+% Bad visualization work on 2, arguably more instructive to DSM design. This is because ex.2 is an afterthought to ex.1 and is conducted without sufficient time for proper data analysis -- ftrace takes time to analyze and visualize, notably. Maybe add a ftraced max-min etc. table!
+
+% Bad analysis on whether this really emulates anything. It may be of no significance right now (as we are solely concerned w/ software latency)
+
+% Should experiment over a variety of hardware. rose is system-ready which supports HW coherency, so prob. not reflective of anything real. Maybe take a raspberry pi now that they have PCIe. Regardless, ARM with PCIe without system-readiness is growing, so may be of more significance in future?
+
+% Note the difference in magnitudes in latency. This may be because of whether the laptop is plugged in or not. Admit your mistake and lament that you should really, really have used separate hardware with a reliable energy source for these data. Note on the other hand that the growth rate remains consistent whether plugged in or not.
 
 
 \chapter{Conclusion}