This commit is contained in:
Zhengyi Chen 2023-12-24 16:13:56 +00:00
parent f059736e2d
commit 905f90200e
4 changed files with 1149 additions and 1 deletions

275
scratch/01-mm_struct.md Normal file
View file

@ -0,0 +1,275 @@
# `#include <linux/mm_types.h>`
> **TODO**
> - `do_user_addr_fault`
> - `handle_mm_fault`
```c
// From v6.7-rc6
struct mm_struct {
struct {
/*
* Fields which are often written to are placed in a separate
* cache line.
*/
// Zk.
// -- that is, this is a commonly-modified field which is best stored
// within a separate cache line.
struct {
/**
* @mm_count: The number of references to &struct
* mm_struct (@mm_users count as 1).
*
* Use mmgrab()/mmdrop() to modify. When this drops to
* 0, the &struct mm_struct is freed.
*/
// Zk.
// mmgrab() -- Pin a &struct mm_struct for a longer/unbounded amnt. of time.
// mmdrop() -- Undo above.
atomic_t mm_count;
} ____cacheline_aligned_in_smp; // Zk. -- eq. to `__aligned__(64)` for x86
// Zk.
// Maple tree that stores VMA (Virtual Memory Area) -- of which each
// mm_struct will have multiple (see /proc/[0-9]*/maps). They corresp.
// to when userspace calls `mmap`.
struct maple_tree mm_mt;
#ifdef CONFIG_MMU
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);
#endif
unsigned long mmap_base; /* base of mmap area */
unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
/* Base addresses for compatible mmap() */
unsigned long mmap_compat_base;
unsigned long mmap_compat_legacy_base;
#endif
unsigned long task_size; /* size of task vm space */
// Zk.
// Global page table
pgd_t * pgd;
#ifdef CONFIG_MEMBARRIER
/**
* @membarrier_state: Flags controlling membarrier behavior.
*
* This field is close to @pgd to hopefully fit in the same
* cache-line, which needs to be touched by switch_mm().
*/
atomic_t membarrier_state;
#endif
/**
* @mm_users: The number of users including userspace.
*
* Use mmget()/mmget_not_zero()/mmput() to modify. When this
* drops to 0 (i.e. when the task exits and there are no other
* temporary reference holders), we also release a reference on
* @mm_count (which may then free the &struct mm_struct if
* @mm_count also drops to 0).
*/
atomic_t mm_users;
#ifdef CONFIG_SCHED_MM_CID
/**
* @pcpu_cid: Per-cpu current cid.
*
* Keep track of the currently allocated mm_cid for each cpu.
* The per-cpu mm_cid values are serialized by their respective
* runqueue locks.
*/
struct mm_cid __percpu *pcpu_cid;
/*
* @mm_cid_next_scan: Next mm_cid scan (in jiffies).
*
* When the next mm_cid scan is due (in jiffies).
*/
unsigned long mm_cid_next_scan;
#endif
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes; /* size of all page tables */
#endif
int map_count; /* number of VMAs */
spinlock_t page_table_lock; /* Protects page tables and some
* counters
*/
/*
* With some kernel config, the current mmap_lock's offset
* inside 'mm_struct' is at 0x120, which is very optimal, as
* its two hot fields 'count' and 'owner' sit in 2 different
* cachelines, and when mmap_lock is highly contended, both
* of the 2 fields will be accessed frequently, current layout
* will help to reduce cache bouncing.
*
* So please be careful with adding new fields before
* mmap_lock, which can easily push the 2 fields into one
* cacheline.
*/
struct rw_semaphore mmap_lock;
struct list_head mmlist; /* List of maybe swapped mm's. These
* are globally strung together off
* init_mm.mmlist, and are protected
* by mmlist_lock
*/
#ifdef CONFIG_PER_VMA_LOCK
/*
* This field has lock-like semantics, meaning it is sometimes
* accessed with ACQUIRE/RELEASE semantics.
* Roughly speaking, incrementing the sequence number is
* equivalent to releasing locks on VMAs; reading the sequence
* number can be part of taking a read lock on a VMA.
*
* Can be modified under write mmap_lock using RELEASE
* semantics.
* Can be read with no other protection when holding write
* mmap_lock.
* Can be read with ACQUIRE semantics if not holding write
* mmap_lock.
*/
int mm_lock_seq;
#endif
unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
unsigned long total_vm; /* Total pages mapped */
unsigned long locked_vm; /* Pages that have PG_mlocked set */
atomic64_t pinned_vm; /* Refcount permanently increased */
unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
unsigned long stack_vm; /* VM_STACK */
unsigned long def_flags;
/**
* @write_protect_seq: Locked when any thread is write
* protecting pages mapped by this mm to enforce a later COW,
* for instance during page table copying for fork().
*/
seqcount_t write_protect_seq;
spinlock_t arg_lock; /* protect the below fields */
unsigned long start_code, end_code, start_data, end_data;
unsigned long start_brk, brk, start_stack;
unsigned long arg_start, arg_end, env_start, env_end;
unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
struct percpu_counter rss_stat[NR_MM_COUNTERS];
struct linux_binfmt *binfmt;
/* Architecture-specific MM context */
mm_context_t context;
unsigned long flags; /* Must use atomic bitops to access */
#ifdef CONFIG_AIO
spinlock_t ioctx_lock;
struct kioctx_table __rcu *ioctx_table;
#endif
#ifdef CONFIG_MEMCG
/*
* "owner" points to a task that is regarded as the canonical
* user/owner of this mm. All of the following must be true in
* order for it to be changed:
*
* current == mm->owner
* current->mm != mm
* new_owner->mm == mm
* new_owner->alloc_lock is held
*/
struct task_struct __rcu *owner;
#endif
struct user_namespace *user_ns;
/* store ref to file /proc/<pid>/exe symlink points to */
struct file __rcu *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
struct mmu_notifier_subscriptions *notifier_subscriptions;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
pgtable_t pmd_huge_pte; /* protected by page_table_lock */
#endif
#ifdef CONFIG_NUMA_BALANCING
/*
* numa_next_scan is the next time that PTEs will be remapped
* PROT_NONE to trigger NUMA hinting faults; such faults gather
* statistics and migrate pages to new nodes if necessary.
*/
unsigned long numa_next_scan;
/* Restart point for scanning and remapping PTEs. */
unsigned long numa_scan_offset;
/* numa_scan_seq prevents two threads remapping PTEs. */
int numa_scan_seq;
#endif
/*
* An operation with batched TLB flushing is going on. Anything
* that can move process memory needs to flush the TLB when
* moving a PROT_NONE mapped page.
*/
atomic_t tlb_flush_pending;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/* See flush_tlb_batched_pending() */
atomic_t tlb_flush_batched;
#endif
struct uprobes_state uprobes_state;
#ifdef CONFIG_PREEMPT_RT
struct rcu_head delayed_drop;
#endif
#ifdef CONFIG_HUGETLB_PAGE
atomic_long_t hugetlb_usage;
#endif
struct work_struct async_put_work;
#ifdef CONFIG_IOMMU_SVA
u32 pasid;
#endif
#ifdef CONFIG_KSM
/*
* Represent how many pages of this process are involved in KSM
* merging (not including ksm_zero_pages).
*/
unsigned long ksm_merging_pages;
/*
* Represent how many pages are checked for ksm merging
* including merged and not merged.
*/
unsigned long ksm_rmap_items;
/*
* Represent how many empty pages are merged with kernel zero
* pages when enabling KSM use_zero_pages.
*/
unsigned long ksm_zero_pages;
#endif /* CONFIG_KSM */
#ifdef CONFIG_LRU_GEN
struct {
/* this mm_struct is on lru_gen_mm_list */
struct list_head list;
/*
* Set when switching to this mm_struct, as a hint of
* whether it has been used since the last time per-node
* page table walkers cleared the corresponding bits.
*/
unsigned long bitmap;
#ifdef CONFIG_MEMCG
/* points to the memcg of "owner" above */
struct mem_cgroup *memcg;
#endif
} lru_gen;
#endif /* CONFIG_LRU_GEN */
} __randomize_layout;
/*
* The mm_cpumask needs to be at the end of mm_struct, because it
* is dynamically sized based on nr_cpu_ids.
*/
unsigned long cpu_bitmap[];
};
```