diff --git a/scratch/00-task_struct.md b/scratch/00-task_struct.md new file mode 100644 index 0000000..cf9c952 --- /dev/null +++ b/scratch/00-task_struct.md @@ -0,0 +1,873 @@ +# `#include ` +```c +struct task_struct { +#ifdef CONFIG_THREAD_INFO_IN_TASK + /* + * For reasons of header soup (see current_thread_info()), this + * must be the first element of task_struct. + */ + // Zk. + // Specifically this is because of a circular dependency (header file hell), + // The hack-around is to type-cast this struct directly into `struct thread_info` + // -- I know I know this is f**ked up. + struct thread_info thread_info; +#endif + unsigned int __state; + + /* saved state for "spinlock sleepers" */ + unsigned int saved_state; + + /* + * This begins the randomizable portion of task_struct. Only + * scheduling-critical items should be added above here. + */ + randomized_struct_fields_start + + void *stack; + refcount_t usage; + /* Per task flags (PF_*), defined further below: */ + unsigned int flags; + unsigned int ptrace; + +#ifdef CONFIG_SMP // This should beg no explanation... + int on_cpu; + struct __call_single_node wake_entry; + unsigned int wakee_flips; + unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; + + /* + * recent_used_cpu is initially set as the last CPU used by a task + * that wakes affine another task. Waker/wakee relationships can + * push tasks around a CPU where each wakeup moves to the next one. + * Tracking a recently used CPU allows a quick search for a recently + * used CPU that may be idle. + */ + int recent_used_cpu; + int wake_cpu; +#endif + + // Zk. + // "on runqueue" -- contains flag value. + int on_rq; + + // Zk. + // Priorities -- rt and normal priority. + int prio; + int static_prio; + int normal_prio; + unsigned int rt_priority; + + // Zk. + // Information of task as a scheduling entity. + struct sched_entity se; + struct sched_rt_entity rt; + struct sched_dl_entity dl; + const struct sched_class *sched_class; + +#ifdef CONFIG_SCHED_CORE +// Zk. +// Core Scheduling -- allows userspace to define tasks that CAN share cores. +// Interesting case is for "cannot"-ones -- +// remember SMT speculative exec. security issues? + struct rb_node core_node; + unsigned long core_cookie; + unsigned int core_occupation; +#endif + +#ifdef CONFIG_CGROUP_SCHED + // Zk. + // Cgroup v1 -- divides CPU time fairly among `cgroup`s (and other stuff) + // This doesn't seem to be referring to task group, paradoxically... + struct task_group *sched_task_group; +#endif + +#ifdef CONFIG_UCLAMP_TASK + // Zk. + // `UCLAMP` -- Utilization CLAMPing. + // Allows the userspace to hint the performance req. of tasks. + // Since v5.3. cgroup support since v5.4. + /* + * Clamp values requested for a scheduling entity. + * Must be updated with task_rq_lock() held. + */ + struct uclamp_se uclamp_req[UCLAMP_CNT]; + /* + * Effective clamp values used for a scheduling entity. + * Must be updated with task_rq_lock() held. + */ + struct uclamp_se uclamp[UCLAMP_CNT]; +#endif + + struct sched_statistics stats; + +#ifdef CONFIG_PREEMPT_NOTIFIERS // Zk. -- This should always be defined. + /* List of struct preempt_notifier: */ + struct hlist_head preempt_notifiers; // Zk. 
-- hlist -- Hashed List +#endif + +#ifdef CONFIG_BLK_DEV_IO_TRACE + unsigned int btrace_seq; +#endif + + unsigned int policy; + int nr_cpus_allowed; + const cpumask_t *cpus_ptr; + cpumask_t *user_cpus_ptr; + cpumask_t cpus_mask; + void *migration_pending; +#ifdef CONFIG_SMP + unsigned short migration_disabled; +#endif + unsigned short migration_flags; + +#ifdef CONFIG_PREEMPT_RCU + int rcu_read_lock_nesting; + union rcu_special rcu_read_unlock_special; + struct list_head rcu_node_entry; + struct rcu_node *rcu_blocked_node; +#endif /* #ifdef CONFIG_PREEMPT_RCU */ + +#ifdef CONFIG_TASKS_RCU + unsigned long rcu_tasks_nvcsw; + u8 rcu_tasks_holdout; + u8 rcu_tasks_idx; + int rcu_tasks_idle_cpu; + struct list_head rcu_tasks_holdout_list; +#endif /* #ifdef CONFIG_TASKS_RCU */ + +#ifdef CONFIG_TASKS_TRACE_RCU + int trc_reader_nesting; + int trc_ipi_to_cpu; + union rcu_special trc_reader_special; + struct list_head trc_holdout_list; + struct list_head trc_blkd_node; + int trc_blkd_cpu; +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ + + struct sched_info sched_info; + + struct list_head tasks; +#ifdef CONFIG_SMP + struct plist_node pushable_tasks; + struct rb_node pushable_dl_tasks; +#endif + // Zk. + // So -- we know there are kernel threads and userspace threads, right? + // Note that kernel address space is ALWAYS mapped in whichever userspace + // address space (mostly low-mem which paradoxically is when byte nr. is + // large) -- albeit with KASLR it's not identical. + // + // Anyways, consequently kernel threads DO NOT need to swap address space. + // That is, we can reuse the same `mm_struct` for kernel threads when sched. + // We still, however, need to differentiate between these 2 cases (addr. space + // is true & valid vs. addr. space is stale & nonsense). + // + // This field, `task_struct.mm`, is GUARANTEED to point to NULL (for anon. + // proc.) or the *real* address space of the thread. + struct mm_struct *mm; + + // Zk. + // Vice versa, `task_struct.active_mm` is GUARANTEED to point to the + // `mm_struct` currently in use, whether loaded or borrowed. + struct mm_struct *active_mm; + struct address_space *faults_disabled_mapping; + + int exit_state; + int exit_code; + int exit_signal; + /* The signal sent when the parent dies: */ + int pdeath_signal; + /* JOBCTL_*, siglock protected: */ + unsigned long jobctl; + + /* Used for emulating ABI behavior of previous Linux versions: */ + unsigned int personality; + + /* Scheduler bits, serialized by scheduler locks: */ + unsigned sched_reset_on_fork:1; + unsigned sched_contributes_to_load:1; + unsigned sched_migrated:1; + + /* Force alignment to the next boundary: */ + unsigned :0; + + /* Unserialized, strictly 'current' */ + + /* + * This field must not be in the scheduler word above due to wakelist + * queueing no longer being serialized by p->on_cpu. However: + * + * p->XXX = X; ttwu() + * schedule() if (p->on_rq && ..) // false + * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true + * deactivate_task() ttwu_queue_wakelist()) + * p->on_rq = 0; p->sched_remote_wakeup = Y; + * + * guarantees all stores of 'current' are visible before + * ->sched_remote_wakeup gets used, so it can be in this word. 
+ */ + unsigned sched_remote_wakeup:1; +#ifdef CONFIG_RT_MUTEXES + unsigned sched_rt_mutex:1; +#endif + + /* Bit to tell LSMs we're in execve(): */ + unsigned in_execve:1; + unsigned in_iowait:1; +#ifndef TIF_RESTORE_SIGMASK + unsigned restore_sigmask:1; +#endif +#ifdef CONFIG_MEMCG + unsigned in_user_fault:1; +#endif +#ifdef CONFIG_LRU_GEN + /* whether the LRU algorithm may apply to this access */ + unsigned in_lru_fault:1; +#endif +#ifdef CONFIG_COMPAT_BRK + unsigned brk_randomized:1; +#endif +#ifdef CONFIG_CGROUPS + /* disallow userland-initiated cgroup migration */ + unsigned no_cgroup_migration:1; + /* task is frozen/stopped (used by the cgroup freezer) */ + unsigned frozen:1; +#endif +#ifdef CONFIG_BLK_CGROUP + unsigned use_memdelay:1; +#endif +#ifdef CONFIG_PSI + /* Stalled due to lack of memory */ + unsigned in_memstall:1; +#endif +#ifdef CONFIG_PAGE_OWNER + /* Used by page_owner=on to detect recursion in page tracking. */ + unsigned in_page_owner:1; +#endif +#ifdef CONFIG_EVENTFD + /* Recursion prevention for eventfd_signal() */ + unsigned in_eventfd:1; +#endif +#ifdef CONFIG_IOMMU_SVA + unsigned pasid_activated:1; +#endif +#ifdef CONFIG_CPU_SUP_INTEL + unsigned reported_split_lock:1; +#endif +#ifdef CONFIG_TASK_DELAY_ACCT + /* delay due to memory thrashing */ + unsigned in_thrashing:1; +#endif + + unsigned long atomic_flags; /* Flags requiring atomic access. */ + + // Zk. + // Syscalls can be interrupted by signal delivery. For syscalls like + // `nanosleep()`, however, we don't have an easy way to know for how long + // to continue the sleep, as userspace simply re-calls the syscall with + // identical duration. + // + // The `restart_block` struct allows a restart handler to be used for some + // syscall. + struct restart_block restart_block; + + // Zk. -- ! + pid_t pid; + pid_t tgid; + +#ifdef CONFIG_STACKPROTECTOR + /* Canary value for the -fstack-protector GCC feature: */ + unsigned long stack_canary; +#endif + /* + * Pointers to the (original) parent process, youngest child, younger sibling, + * older sibling, respectively. (p->father can be replaced with + * p->real_parent->pid) + */ + + /* Real parent process: */ + struct task_struct __rcu *real_parent; + + /* Recipient of SIGCHLD, wait4() reports: */ + struct task_struct __rcu *parent; + + /* + * Children/sibling form the list of natural children: + */ + struct list_head children; + struct list_head sibling; + struct task_struct *group_leader; + + /* + * 'ptraced' is the list of tasks this task is using ptrace() on. + * + * This includes both natural children and PTRACE_ATTACH targets. + * 'ptrace_entry' is this task's link on the p->parent->ptraced list. + */ + struct list_head ptraced; + struct list_head ptrace_entry; + + /* PID/PID hash table linkage. 
*/ + struct pid *thread_pid; + struct hlist_node pid_links[PIDTYPE_MAX]; + struct list_head thread_node; + + struct completion *vfork_done; + + /* CLONE_CHILD_SETTID: */ + int __user *set_child_tid; + + /* CLONE_CHILD_CLEARTID: */ + int __user *clear_child_tid; + + /* PF_KTHREAD | PF_IO_WORKER */ + void *worker_private; + + u64 utime; + u64 stime; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + u64 utimescaled; + u64 stimescaled; +#endif + u64 gtime; + struct prev_cputime prev_cputime; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN + struct vtime vtime; +#endif + +#ifdef CONFIG_NO_HZ_FULL + atomic_t tick_dep_mask; +#endif + /* Context switch counts: */ + unsigned long nvcsw; + unsigned long nivcsw; + + /* Monotonic time in nsecs: */ + u64 start_time; + + /* Boot based time in nsecs: */ + u64 start_boottime; + + /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ + unsigned long min_flt; + unsigned long maj_flt; + + /* Empty if CONFIG_POSIX_CPUTIMERS=n */ + struct posix_cputimers posix_cputimers; + +#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK + struct posix_cputimers_work posix_cputimers_work; +#endif + + /* Process credentials: */ + + /* Tracer's credentials at attach: */ + const struct cred __rcu *ptracer_cred; + + /* Objective and real subjective task credentials (COW): */ + const struct cred __rcu *real_cred; + + /* Effective (overridable) subjective task credentials (COW): */ + const struct cred __rcu *cred; + +#ifdef CONFIG_KEYS + /* Cached requested key. */ + struct key *cached_requested_key; +#endif + + /* + * executable name, excluding path. + * + * - normally initialized setup_new_exec() + * - access it with [gs]et_task_comm() + * - lock it with task_lock() + */ + char comm[TASK_COMM_LEN]; + + struct nameidata *nameidata; + +#ifdef CONFIG_SYSVIPC + struct sysv_sem sysvsem; + struct sysv_shm sysvshm; +#endif +#ifdef CONFIG_DETECT_HUNG_TASK + unsigned long last_switch_count; + unsigned long last_switch_time; +#endif + /* Filesystem information: */ + struct fs_struct *fs; + + /* Open file information: */ + struct files_struct *files; + +#ifdef CONFIG_IO_URING + struct io_uring_task *io_uring; +#endif + + /* Namespaces: */ + struct nsproxy *nsproxy; + + /* Signal handlers: */ + struct signal_struct *signal; + struct sighand_struct __rcu *sighand; + sigset_t blocked; + sigset_t real_blocked; + /* Restored if set_restore_sigmask() was used: */ + sigset_t saved_sigmask; + struct sigpending pending; + unsigned long sas_ss_sp; + size_t sas_ss_size; + unsigned int sas_ss_flags; + + struct callback_head *task_works; + +#ifdef CONFIG_AUDIT +#ifdef CONFIG_AUDITSYSCALL + struct audit_context *audit_context; +#endif + kuid_t loginuid; + unsigned int sessionid; +#endif + struct seccomp seccomp; + + // Zk. + // wine, etc. want to emulate syscalls for a part of their process + // (the incompatible part) only. `syscall_user_dispatch` brings the ability + // to declare either to run syscalls natively or to emulate syscalls in user- + // space via SIGSYS in the userspace. 
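+ // A rough sketch of the userspace side, for context (prctl(2), since v5.11);
+ // `trampoline_start` / `trampoline_len` are placeholder names for the one
+ // region that stays allowed to issue syscalls natively (e.g. where the
+ // emulator's own syscall stubs live):
+ //
+ //   char sel = SYSCALL_DISPATCH_FILTER_ALLOW;
+ //   prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON,
+ //         trampoline_start, trampoline_len, &sel);
+ //   sel = SYSCALL_DISPATCH_FILTER_BLOCK;
+ //   /* from here on, syscalls issued outside the trampoline region are
+ //    * delivered to the process as SIGSYS instead of being executed */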
+ struct syscall_user_dispatch syscall_dispatch; + + /* Thread group tracking: */ + u64 parent_exec_id; + u64 self_exec_id; + + /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */ + spinlock_t alloc_lock; + + /* Protection of the PI data structures: */ + raw_spinlock_t pi_lock; + + struct wake_q_node wake_q; + +#ifdef CONFIG_RT_MUTEXES + /* PI waiters blocked on a rt_mutex held by this task: */ + struct rb_root_cached pi_waiters; + /* Updated under owner's pi_lock and rq lock */ + struct task_struct *pi_top_task; + /* Deadlock detection and priority inheritance handling: */ + struct rt_mutex_waiter *pi_blocked_on; +#endif + +#ifdef CONFIG_DEBUG_MUTEXES + /* Mutex deadlock detection: */ + struct mutex_waiter *blocked_on; +#endif + +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + int non_block_count; +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS + struct irqtrace_events irqtrace; + unsigned int hardirq_threaded; + u64 hardirq_chain_key; + int softirqs_enabled; + int softirq_context; + int irq_config; +#endif +#ifdef CONFIG_PREEMPT_RT + int softirq_disable_cnt; +#endif + +#ifdef CONFIG_LOCKDEP +# define MAX_LOCK_DEPTH 48UL + u64 curr_chain_key; + int lockdep_depth; + unsigned int lockdep_recursion; + struct held_lock held_locks[MAX_LOCK_DEPTH]; +#endif + +#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP) + unsigned int in_ubsan; +#endif + + /* Journalling filesystem info: */ + void *journal_info; + + /* Stacked block device info: */ + struct bio_list *bio_list; + + /* Stack plugging: */ + struct blk_plug *plug; + + /* VM state: */ + struct reclaim_state *reclaim_state; + + struct io_context *io_context; + +#ifdef CONFIG_COMPACTION + struct capture_control *capture_control; +#endif + /* Ptrace state: */ + unsigned long ptrace_message; + kernel_siginfo_t *last_siginfo; + + struct task_io_accounting ioac; +#ifdef CONFIG_PSI + /* Pressure stall state */ + unsigned int psi_flags; +#endif +#ifdef CONFIG_TASK_XACCT + /* Accumulated RSS usage: */ + u64 acct_rss_mem1; + /* Accumulated virtual memory usage: */ + u64 acct_vm_mem1; + /* stime + utime since last update: */ + u64 acct_timexpd; +#endif +#ifdef CONFIG_CPUSETS + /* Protected by ->alloc_lock: */ + nodemask_t mems_allowed; + /* Sequence number to catch updates: */ + seqcount_spinlock_t mems_allowed_seq; + int cpuset_mem_spread_rotor; + int cpuset_slab_spread_rotor; +#endif +#ifdef CONFIG_CGROUPS + /* Control Group info protected by css_set_lock: */ + struct css_set __rcu *cgroups; + /* cg_list protected by css_set_lock and tsk->alloc_lock: */ + struct list_head cg_list; +#endif +#ifdef CONFIG_X86_CPU_RESCTRL + u32 closid; + u32 rmid; +#endif +#ifdef CONFIG_FUTEX + struct robust_list_head __user *robust_list; +#ifdef CONFIG_COMPAT + struct compat_robust_list_head __user *compat_robust_list; +#endif + struct list_head pi_state_list; + struct futex_pi_state *pi_state_cache; + struct mutex futex_exit_mutex; + unsigned int futex_state; +#endif +#ifdef CONFIG_PERF_EVENTS + struct perf_event_context *perf_event_ctxp; + struct mutex perf_event_mutex; + struct list_head perf_event_list; +#endif +#ifdef CONFIG_DEBUG_PREEMPT + unsigned long preempt_disable_ip; +#endif +#ifdef CONFIG_NUMA + /* Protected by alloc_lock: */ + struct mempolicy *mempolicy; + short il_prev; + short pref_node_fork; +#endif +#ifdef CONFIG_NUMA_BALANCING + int numa_scan_seq; + unsigned int numa_scan_period; + unsigned int numa_scan_period_max; + int numa_preferred_nid; + unsigned long numa_migrate_retry; + /* Migration stamp: */ + u64 node_stamp; + 
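+ // Roughly: numa_scan_period above paces task_numa_work() (queued via
+ // numa_work below), which remaps ranges of the task's address space
+ // PROT_NONE; the resulting NUMA hinting faults populate numa_faults
+ // below and, together with mm_struct's numa_next_scan/numa_scan_offset,
+ // drive the page migration decisions.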
u64 last_task_numa_placement; + u64 last_sum_exec_runtime; + struct callback_head numa_work; + + /* + * This pointer is only modified for current in syscall and + * pagefault context (and for tasks being destroyed), so it can be read + * from any of the following contexts: + * - RCU read-side critical section + * - current->numa_group from everywhere + * - task's runqueue locked, task not running + */ + struct numa_group __rcu *numa_group; + + /* + * numa_faults is an array split into four regions: + * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer + * in this precise order. + * + * faults_memory: Exponential decaying average of faults on a per-node + * basis. Scheduling placement decisions are made based on these + * counts. The values remain static for the duration of a PTE scan. + * faults_cpu: Track the nodes the process was running on when a NUMA + * hinting fault was incurred. + * faults_memory_buffer and faults_cpu_buffer: Record faults per node + * during the current scan window. When the scan completes, the counts + * in faults_memory and faults_cpu decay and these values are copied. + */ + unsigned long *numa_faults; + unsigned long total_numa_faults; + + /* + * numa_faults_locality tracks if faults recorded during the last + * scan window were remote/local or failed to migrate. The task scan + * period is adapted based on the locality of the faults with different + * weights depending on whether they were shared or private faults + */ + unsigned long numa_faults_locality[3]; + + unsigned long numa_pages_migrated; +#endif /* CONFIG_NUMA_BALANCING */ + +#ifdef CONFIG_RSEQ + struct rseq __user *rseq; + u32 rseq_len; + u32 rseq_sig; + /* + * RmW on rseq_event_mask must be performed atomically + * with respect to preemption. + */ + unsigned long rseq_event_mask; +#endif + +#ifdef CONFIG_SCHED_MM_CID + int mm_cid; /* Current cid in mm */ + int last_mm_cid; /* Most recent cid in mm */ + int migrate_from_cpu; + int mm_cid_active; /* Whether cid bitmap is active */ + struct callback_head cid_work; +#endif + + struct tlbflush_unmap_batch tlb_ubc; + + /* Cache last used pipe for splice(): */ + struct pipe_inode_info *splice_pipe; + + struct page_frag task_frag; + +#ifdef CONFIG_TASK_DELAY_ACCT + struct task_delay_info *delays; +#endif + +#ifdef CONFIG_FAULT_INJECTION + int make_it_fail; + unsigned int fail_nth; +#endif + /* + * When (nr_dirtied >= nr_dirtied_pause), it's time to call + * balance_dirty_pages() for a dirty throttling pause: + */ + int nr_dirtied; + int nr_dirtied_pause; + /* Start of a write-and-pause period: */ + unsigned long dirty_paused_when; + +#ifdef CONFIG_LATENCYTOP + int latency_record_count; + struct latency_record latency_record[LT_SAVECOUNT]; +#endif + /* + * Time slack values; these are used to round up poll() and + * select() etc timeout values. These are in nanoseconds. 
+ */ + u64 timer_slack_ns; + u64 default_timer_slack_ns; + +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + unsigned int kasan_depth; +#endif + +#ifdef CONFIG_KCSAN + struct kcsan_ctx kcsan_ctx; +#ifdef CONFIG_TRACE_IRQFLAGS + struct irqtrace_events kcsan_save_irqtrace; +#endif +#ifdef CONFIG_KCSAN_WEAK_MEMORY + int kcsan_stack_depth; +#endif +#endif + +#ifdef CONFIG_KMSAN + struct kmsan_ctx kmsan_ctx; +#endif + +#if IS_ENABLED(CONFIG_KUNIT) + struct kunit *kunit_test; +#endif + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* Index of current stored address in ret_stack: */ + int curr_ret_stack; + int curr_ret_depth; + + /* Stack of return addresses for return function tracing: */ + struct ftrace_ret_stack *ret_stack; + + /* Timestamp for last schedule: */ + unsigned long long ftrace_timestamp; + + /* + * Number of functions that haven't been traced + * because of depth overrun: + */ + atomic_t trace_overrun; + + /* Pause tracing: */ + atomic_t tracing_graph_pause; +#endif + +#ifdef CONFIG_TRACING + /* Bitmask and counter of trace recursion: */ + unsigned long trace_recursion; +#endif /* CONFIG_TRACING */ + +#ifdef CONFIG_KCOV + /* See kernel/kcov.c for more details. */ + + /* Coverage collection mode enabled for this task (0 if disabled): */ + unsigned int kcov_mode; + + /* Size of the kcov_area: */ + unsigned int kcov_size; + + /* Buffer for coverage collection: */ + void *kcov_area; + + /* KCOV descriptor wired with this task or NULL: */ + struct kcov *kcov; + + /* KCOV common handle for remote coverage collection: */ + u64 kcov_handle; + + /* KCOV sequence number: */ + int kcov_sequence; + + /* Collect coverage from softirq context: */ + unsigned int kcov_softirq; +#endif + +#ifdef CONFIG_MEMCG + struct mem_cgroup *memcg_in_oom; + gfp_t memcg_oom_gfp_mask; + int memcg_oom_order; + + /* Number of pages to reclaim on returning to userland: */ + unsigned int memcg_nr_pages_over_high; + + /* Used by memcontrol for targeted memcg charge: */ + struct mem_cgroup *active_memcg; +#endif + +#ifdef CONFIG_MEMCG_KMEM + struct obj_cgroup *objcg; +#endif + +#ifdef CONFIG_BLK_CGROUP + struct gendisk *throttle_disk; +#endif + +#ifdef CONFIG_UPROBES + struct uprobe_task *utask; +#endif +#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) + unsigned int sequential_io; + unsigned int sequential_io_avg; +#endif + struct kmap_ctrl kmap_ctrl; +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + unsigned long task_state_change; +# ifdef CONFIG_PREEMPT_RT + unsigned long saved_state_change; +# endif +#endif + struct rcu_head rcu; + refcount_t rcu_users; + int pagefault_disabled; +#ifdef CONFIG_MMU + struct task_struct *oom_reaper_list; + struct timer_list oom_reaper_timer; +#endif +#ifdef CONFIG_VMAP_STACK + struct vm_struct *stack_vm_area; +#endif +#ifdef CONFIG_THREAD_INFO_IN_TASK + /* A live task holds one reference: */ + refcount_t stack_refcount; +#endif +#ifdef CONFIG_LIVEPATCH + int patch_state; +#endif +#ifdef CONFIG_SECURITY + /* Used by LSM modules for access restriction: */ + void *security; +#endif +#ifdef CONFIG_BPF_SYSCALL + /* Used by BPF task local storage */ + struct bpf_local_storage __rcu *bpf_storage; + /* Used for BPF run context */ + struct bpf_run_ctx *bpf_ctx; +#endif + +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK + unsigned long lowest_stack; + unsigned long prev_lowest_stack; +#endif + +#ifdef CONFIG_X86_MCE + void __user *mce_vaddr; + __u64 mce_kflags; + u64 mce_addr; + __u64 mce_ripv : 1, + mce_whole_page : 1, + __mce_reserved : 62; + struct callback_head mce_kill_me; + int mce_count; 
+#endif + +#ifdef CONFIG_KRETPROBES + struct llist_head kretprobe_instances; +#endif +#ifdef CONFIG_RETHOOK + struct llist_head rethooks; +#endif + +#ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH + /* + * If L1D flush is supported on mm context switch + * then we use this callback head to queue kill work + * to kill tasks that are not running on SMT disabled + * cores + */ + struct callback_head l1d_flush_kill; +#endif + +#ifdef CONFIG_RV + /* + * Per-task RV monitor. Nowadays fixed in RV_PER_TASK_MONITORS. + * If we find justification for more monitors, we can think + * about adding more or developing a dynamic method. So far, + * none of these are justified. + */ + union rv_task_monitor rv[RV_PER_TASK_MONITORS]; +#endif + +#ifdef CONFIG_USER_EVENTS + struct user_event_mm *user_event_mm; +#endif + + /* + * New fields for task_struct should be added above here, so that + * they are included in the randomized portion of task_struct. + */ + randomized_struct_fields_end + + /* CPU-specific state of this task: */ + struct thread_struct thread; + + /* + * WARNING: on x86, 'thread_struct' contains a variable-sized + * structure. It *MUST* be at the end of 'task_struct'. + * + * Do not put anything below here! + */ +}; +``` \ No newline at end of file diff --git a/scratch/01-mm_struct.md b/scratch/01-mm_struct.md new file mode 100644 index 0000000..edf7910 --- /dev/null +++ b/scratch/01-mm_struct.md @@ -0,0 +1,275 @@ +# `#include ` +> **TODO** +> - `do_user_addr_fault` +> - `handle_mm_fault` +```c +// From v6.7-rc6 +struct mm_struct { + struct { + /* + * Fields which are often written to are placed in a separate + * cache line. + */ + // Zk. + // -- that is, this is a commonly-modified field which is best stored + // within a separate cache line. + struct { + /** + * @mm_count: The number of references to &struct + * mm_struct (@mm_users count as 1). + * + * Use mmgrab()/mmdrop() to modify. When this drops to + * 0, the &struct mm_struct is freed. + */ + // Zk. + // mmgrab() -- Pin a &struct mm_struct for a longer/unbounded amnt. of time. + // mmdrop() -- Undo above. + atomic_t mm_count; + } ____cacheline_aligned_in_smp; // Zk. -- eq. to `__aligned__(64)` for x86 + + // Zk. + // Maple tree that stores VMA (Virtual Memory Area) -- of which each + // mm_struct will have multiple (see /proc/[0-9]*/maps). They corresp. + // to when userspace calls `mmap`. + struct maple_tree mm_mt; +#ifdef CONFIG_MMU + unsigned long (*get_unmapped_area) (struct file *filp, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags); +#endif + unsigned long mmap_base; /* base of mmap area */ + unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + /* Base addresses for compatible mmap() */ + unsigned long mmap_compat_base; + unsigned long mmap_compat_legacy_base; +#endif + unsigned long task_size; /* size of task vm space */ + + // Zk. + // Global page table + pgd_t * pgd; + +#ifdef CONFIG_MEMBARRIER + /** + * @membarrier_state: Flags controlling membarrier behavior. + * + * This field is close to @pgd to hopefully fit in the same + * cache-line, which needs to be touched by switch_mm(). + */ + atomic_t membarrier_state; +#endif + + /** + * @mm_users: The number of users including userspace. + * + * Use mmget()/mmget_not_zero()/mmput() to modify. When this + * drops to 0 (i.e. 
when the task exits and there are no other + * temporary reference holders), we also release a reference on + * @mm_count (which may then free the &struct mm_struct if + * @mm_count also drops to 0). + */ + atomic_t mm_users; + +#ifdef CONFIG_SCHED_MM_CID + /** + * @pcpu_cid: Per-cpu current cid. + * + * Keep track of the currently allocated mm_cid for each cpu. + * The per-cpu mm_cid values are serialized by their respective + * runqueue locks. + */ + struct mm_cid __percpu *pcpu_cid; + /* + * @mm_cid_next_scan: Next mm_cid scan (in jiffies). + * + * When the next mm_cid scan is due (in jiffies). + */ + unsigned long mm_cid_next_scan; +#endif +#ifdef CONFIG_MMU + atomic_long_t pgtables_bytes; /* size of all page tables */ +#endif + int map_count; /* number of VMAs */ + + spinlock_t page_table_lock; /* Protects page tables and some + * counters + */ + /* + * With some kernel config, the current mmap_lock's offset + * inside 'mm_struct' is at 0x120, which is very optimal, as + * its two hot fields 'count' and 'owner' sit in 2 different + * cachelines, and when mmap_lock is highly contended, both + * of the 2 fields will be accessed frequently, current layout + * will help to reduce cache bouncing. + * + * So please be careful with adding new fields before + * mmap_lock, which can easily push the 2 fields into one + * cacheline. + */ + struct rw_semaphore mmap_lock; + + struct list_head mmlist; /* List of maybe swapped mm's. These + * are globally strung together off + * init_mm.mmlist, and are protected + * by mmlist_lock + */ +#ifdef CONFIG_PER_VMA_LOCK + /* + * This field has lock-like semantics, meaning it is sometimes + * accessed with ACQUIRE/RELEASE semantics. + * Roughly speaking, incrementing the sequence number is + * equivalent to releasing locks on VMAs; reading the sequence + * number can be part of taking a read lock on a VMA. + * + * Can be modified under write mmap_lock using RELEASE + * semantics. + * Can be read with no other protection when holding write + * mmap_lock. + * Can be read with ACQUIRE semantics if not holding write + * mmap_lock. + */ + int mm_lock_seq; +#endif + + + unsigned long hiwater_rss; /* High-watermark of RSS usage */ + unsigned long hiwater_vm; /* High-water virtual memory usage */ + + unsigned long total_vm; /* Total pages mapped */ + unsigned long locked_vm; /* Pages that have PG_mlocked set */ + atomic64_t pinned_vm; /* Refcount permanently increased */ + unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ + unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ + unsigned long stack_vm; /* VM_STACK */ + unsigned long def_flags; + + /** + * @write_protect_seq: Locked when any thread is write + * protecting pages mapped by this mm to enforce a later COW, + * for instance during page table copying for fork(). 
+ */ + seqcount_t write_protect_seq; + + spinlock_t arg_lock; /* protect the below fields */ + + unsigned long start_code, end_code, start_data, end_data; + unsigned long start_brk, brk, start_stack; + unsigned long arg_start, arg_end, env_start, env_end; + + unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ + + struct percpu_counter rss_stat[NR_MM_COUNTERS]; + + struct linux_binfmt *binfmt; + + /* Architecture-specific MM context */ + mm_context_t context; + + unsigned long flags; /* Must use atomic bitops to access */ + +#ifdef CONFIG_AIO + spinlock_t ioctx_lock; + struct kioctx_table __rcu *ioctx_table; +#endif +#ifdef CONFIG_MEMCG + /* + * "owner" points to a task that is regarded as the canonical + * user/owner of this mm. All of the following must be true in + * order for it to be changed: + * + * current == mm->owner + * current->mm != mm + * new_owner->mm == mm + * new_owner->alloc_lock is held + */ + struct task_struct __rcu *owner; +#endif + struct user_namespace *user_ns; + + /* store ref to file /proc//exe symlink points to */ + struct file __rcu *exe_file; +#ifdef CONFIG_MMU_NOTIFIER + struct mmu_notifier_subscriptions *notifier_subscriptions; +#endif +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + pgtable_t pmd_huge_pte; /* protected by page_table_lock */ +#endif +#ifdef CONFIG_NUMA_BALANCING + /* + * numa_next_scan is the next time that PTEs will be remapped + * PROT_NONE to trigger NUMA hinting faults; such faults gather + * statistics and migrate pages to new nodes if necessary. + */ + unsigned long numa_next_scan; + + /* Restart point for scanning and remapping PTEs. */ + unsigned long numa_scan_offset; + + /* numa_scan_seq prevents two threads remapping PTEs. */ + int numa_scan_seq; +#endif + /* + * An operation with batched TLB flushing is going on. Anything + * that can move process memory needs to flush the TLB when + * moving a PROT_NONE mapped page. + */ + atomic_t tlb_flush_pending; +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + /* See flush_tlb_batched_pending() */ + atomic_t tlb_flush_batched; +#endif + struct uprobes_state uprobes_state; +#ifdef CONFIG_PREEMPT_RT + struct rcu_head delayed_drop; +#endif +#ifdef CONFIG_HUGETLB_PAGE + atomic_long_t hugetlb_usage; +#endif + struct work_struct async_put_work; + +#ifdef CONFIG_IOMMU_SVA + u32 pasid; +#endif +#ifdef CONFIG_KSM + /* + * Represent how many pages of this process are involved in KSM + * merging (not including ksm_zero_pages). + */ + unsigned long ksm_merging_pages; + /* + * Represent how many pages are checked for ksm merging + * including merged and not merged. + */ + unsigned long ksm_rmap_items; + /* + * Represent how many empty pages are merged with kernel zero + * pages when enabling KSM use_zero_pages. + */ + unsigned long ksm_zero_pages; +#endif /* CONFIG_KSM */ +#ifdef CONFIG_LRU_GEN + struct { + /* this mm_struct is on lru_gen_mm_list */ + struct list_head list; + /* + * Set when switching to this mm_struct, as a hint of + * whether it has been used since the last time per-node + * page table walkers cleared the corresponding bits. + */ + unsigned long bitmap; +#ifdef CONFIG_MEMCG + /* points to the memcg of "owner" above */ + struct mem_cgroup *memcg; +#endif + } lru_gen; +#endif /* CONFIG_LRU_GEN */ + } __randomize_layout; + + /* + * The mm_cpumask needs to be at the end of mm_struct, because it + * is dynamically sized based on nr_cpu_ids. 
+ */ + unsigned long cpu_bitmap[]; +}; +``` diff --git a/scratch/consistency-coherence.md b/scratch/consistency-coherence.md index 5c5ce8c..cf5a830 100644 --- a/scratch/consistency-coherence.md +++ b/scratch/consistency-coherence.md @@ -19,7 +19,7 @@ In floating home protocols, some orchestration is needed for a stale node to be A scope-consistency protocol is one where each sharing is associated with a scope -- only loads and stores within the scope's participants is guaranteed to respect the memory model. -Many newer systems don't even expose writable data movement (e.g., Grappa) -- to write, you have to move compute towards data. Reading is no-problem though. +Many newer systems don't even expose writable data-to-compute movement (e.g., Grappa) -- to write, you have to move compute towards data. Reading is no problem, though. [7]'s case for acquire-invalidate holds only for MRSW (I think?). diff --git a/scratch/consistency-coherence.pdf b/scratch/consistency-coherence.pdf new file mode 100644 index 0000000..d462df7 Binary files /dev/null and b/scratch/consistency-coherence.pdf differ