This module provides an in-depth exploration of the Linux page fault handling mechanism. You will trace the complete path from CPU exception to page allocation.
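A page fault occurs when the CPU tries to access a virtual address that has no present page-table entry (the page is unmapped or not yet resident in RAM), or whose page-table entry forbids the attempted access (for example, a write to a read-only page, a user-mode access to a kernel page, or an instruction fetch from a no-execute page):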
Program executes: MOV RAX, [0x7FFE12345678]
│
▼
┌───────────┐
│ MMU Check │
└─────┬─────┘
│
┌───────────────┴───────────────┐
│ │
PTE Present? PTE Absent?
│ │
▼ ▼
Access OK CPU raises exception #14
│ │
▼ ▼
Return data Jump to page fault handler
When a page fault occurs, the CPU:
1. Push SS (stack segment)
2. Push RSP (stack pointer)
3. Push RFLAGS (flags)
4. Push CS (code segment)
5. Push RIP (instruction pointer)
6. Push error_code (fault information)
7. Load CR2 with faulting address
8. Jump to IDT[14] (page fault handler)
Error Code (32 bits):
┌────┬────┬────┬────┬────┬────┬─────────────────────┐
│ 31 │ ...│ 4 │ 3 │ 2 │ 1 │ 0 │
├────┼────┼────┼────┼────┼────┼─────────────────────┤
│ 0 │ 0 │ I │RSVD│ U │ W │ P │
└────┴────┴────┴────┴────┴────┴─────────────────────┘
P (bit 0): 1 = protection violation, 0 = page not present
W (bit 1): 1 = write access, 0 = read access
U (bit 2): 1 = user mode, 0 = kernel mode
RSVD (bit 3): 1 = reserved bit set in PTE
I (bit 4): 1 = instruction fetch
malloc(4096) returns 0x5555555AA000
ptr[0] = 'A'; // First write
Error code = 0x6 = 0b0110
P=0: Page was NOT present (demand paging)
W=1: Write access
U=1: User mode
RSVD=0: No reserved bit violation
I=0: Not instruction fetch
CR2 = 0x5555555AA000 (faulting address)
exc_page_fault() [arch/x86/mm/fault.c]
│
▼
do_user_addr_fault() [arch/x86/mm/fault.c]
│
│ Lock mm->mmap_lock
│ Find VMA containing address
▼
handle_mm_fault() [mm/memory.c]
│
│ Walk/allocate page tables
▼
__handle_mm_fault()
│
│ Get PMD, handle huge pages
▼
handle_pte_fault() [mm/memory.c]
│
├──► do_anonymous_page() [First access to anon mem]
├──► do_fault() [File-backed page]
├──► do_swap_page() [Page in swap]
└──► do_wp_page() [Copy-on-write]
// arch/x86/mm/fault.c
DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
{
    /* The CPU left the faulting address in CR2; read it into a local. */
    unsigned long fault_addr = read_cr2();

    // ... error handling ...

    /* Pass the saved registers, error code and address to the dispatcher. */
    handle_page_fault(regs, error_code, fault_addr);
}
/*
 * Dispatch a page fault according to which half of the address space the
 * faulting address belongs to.
 *
 * @regs:       register state saved when the fault was taken
 * @error_code: hardware page-fault error code
 * @address:    faulting virtual address
 */
static void handle_page_fault(struct pt_regs *regs,
                              unsigned long error_code,
                              unsigned long address)
{
    if (unlikely(fault_in_kernel_space(address))) {
        /* Kernel addresses take their own, separate path. */
        do_kern_addr_fault(regs, error_code, address);
    } else {
        /* Everything else is treated as a user-address fault. */
        do_user_addr_fault(regs, error_code, address);
    }
}
// arch/x86/mm/fault.c
static void do_user_addr_fault(struct pt_regs *regs,
unsigned long error_code,
unsigned long address)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
vm_fault_t fault;
// Lock the address space
mmap_read_lock(mm);
// Find VMA containing the faulting address
vma = find_vma(mm, address);
if (!vma || address < vma->vm_start) {
// No VMA found = SIGSEGV
bad_area(regs, error_code, address);
return;
}
// Check permissions
if (!access_allowed(vma, error_code)) {
bad_area_access_error(regs, error_code, address);
return;
}
// Handle the fault
fault = handle_mm_fault(vma, address, flags, regs);
mmap_read_unlock(mm);
}
// mm/memory.c
vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
unsigned long address,
unsigned int flags,
struct pt_regs *regs)
{
struct vm_fault vmf = {
.vma = vma,
.address = address & PAGE_MASK, // Page-aligned
.flags = flags,
.pgoff = linear_page_index(vma, address),
};
return __handle_mm_fault(vma, address, flags);
}
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
unsigned long address,
unsigned int flags)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
// Get or allocate each level
pgd = pgd_offset(mm, address);
p4d = p4d_alloc(mm, pgd, address);
pud = pud_alloc(mm, p4d, address);
pmd = pmd_alloc(mm, pud, address);
// Handle PTE-level fault
return handle_pte_fault(&vmf);
}
malloc(4096) → returns ptr
ptr[0] = 'A' → PAGE FAULT
do_anonymous_page():
1. Allocate physical page via alloc_page()
2. Clear the page (zero-fill)
3. Create PTE pointing to new page
4. Return to userspace
// mm/memory.c
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
struct page *page;
pte_t entry;
// Allocate a new zeroed page
page = alloc_zeroed_user_highpage(vma, vmf->address);
if (!page)
return VM_FAULT_OOM;
// Increment page reference count
get_page(page);
// Create PTE entry
entry = mk_pte(page, vma->vm_page_prot);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(entry);
// Install PTE
set_pte_at(mm, vmf->address, vmf->pte, entry);
return 0;
}
fork() creates child with same page tables
Parent has PTE: PA=0x12345000, read-only after fork (R/W bit cleared for COW), refcount=2
Child writes to page → PAGE FAULT (write to read-only)
do_wp_page():
1. Check if page is shared (refcount > 1)
2. Allocate new physical page
3. Copy contents from old page
4. Update child's PTE to point to new page
5. Mark new PTE as writable
// mm/memory.c
static vm_fault_t do_wp_page(struct vm_fault *vmf)
{
struct page *old_page = vmf->page;
struct page *new_page;
// Is page shared?
if (page_count(old_page) > 1) {
// Must copy
new_page = alloc_page(GFP_HIGHUSER);
copy_user_highpage(new_page, old_page, vmf->address);
// Update PTE to new page
entry = mk_pte(new_page, vma->vm_page_prot);
entry = pte_mkwrite(entry);
set_pte_at(mm, vmf->address, vmf->pte, entry);
// Release reference to old page
put_page(old_page);
} else {
// Exclusive access, just make writable
entry = pte_mkwrite(vmf->orig_pte);
set_pte_at(mm, vmf->address, vmf->pte, entry);
}
return 0;
}
// fault_trace.c
#include <linux/module.h>
#include <linux/kprobes.h>
// Kprobe descriptor. The probe point is given by symbol name
// ("handle_mm_fault"); pre_handler is filled in by fault_trace_init().
static struct kprobe kp = {
.symbol_name = "handle_mm_fault",
};
/*
 * Kprobe pre-handler for handle_mm_fault().
 *
 * Reads the first three arguments of the probed function from the saved
 * registers (di, si, dx) and logs them for the task whose comm is
 * "my_program". Always returns 0.
 */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
    struct vm_area_struct *vma;
    unsigned long address;
    unsigned int flags;

    // Only the task being traced is of interest
    if (strcmp(current->comm, "my_program") != 0)
        return 0;

    vma = (void *)regs->di;   // arg1: vma
    address = regs->si;       // arg2: faulting address
    flags = regs->dx;         // arg3: fault flags

    pr_info("[FAULT] PID=%d addr=0x%lx flags=0x%x "
            "vma=[0x%lx-0x%lx]\n",
            current->pid, address, flags,
            vma->vm_start, vma->vm_end);
    return 0;
}
/*
 * Module load: attach handler_pre to the kprobe and register it.
 * Returns the result of register_kprobe() unchanged.
 */
static int __init fault_trace_init(void)
{
    int ret;

    kp.pre_handler = handler_pre;
    ret = register_kprobe(&kp);
    return ret;
}
module_init(fault_trace_init);
// Module unload: remove the kprobe that fault_trace_init() registered.
static void __exit fault_trace_exit(void)
{
unregister_kprobe(&kp);
}
module_exit(fault_trace_exit);
// Declare the module license as GPL.
MODULE_LICENSE("GPL");
[FAULT] PID=1234 addr=0x7f8a12340000 flags=0x255 vma=[0x7f8a12340000-0x7f8a12440000]
[FAULT] PID=1234 addr=0x7ffd12345000 flags=0x255 vma=[0x7ffd12300000-0x7ffd12400000]
Given these error codes, determine fault cause:
Write a C program that triggers:
Create a kprobe that counts page faults per process.
GIVEN: error_code = 0x17
TASK: Decode each bit
1. 0x17 in binary = ___ ___ ___ ___ ___ (5 bits)
2. bit[0] (P) = ___ → page present? ___
3. bit[1] (W) = ___ → write access? ___
4. bit[2] (U) = ___ → user mode? ___
5. bit[3] (RSVD) = ___ → reserved bit violation? ___
6. bit[4] (I) = ___ → instruction fetch? ___
DESCRIBE FAULT: ________________________________
GIVEN:
CR2 = 0x7FFE_1234_5678
VMA: vm_start=0x7FFE_0000_0000, vm_end=0x7FFF_0000_0000
TASK:
1. Is address in VMA? vm_start ≤ CR2 < vm_end → ___ ≤ ___ < ___ → YES/NO
2. Offset into VMA = CR2 - vm_start = ___ - ___ = ___
3. Page offset = CR2 & 0xFFF = ___
4. Page number within VMA = offset / 4096 = ___
GIVEN: User writes to address 0x5555_5678_9000, first access, anonymous VMA
TASK: Fill call chain
1. CPU exception → exc_page_fault(regs, error_code=___)
2. error_code bits: P=___ W=___ U=___ → demand paging / COW / protection?
3. → do_user_addr_fault() → find_vma(mm, 0x5555_5678_9000) → VMA found?
4. → handle_mm_fault(vma, addr, flags) → __handle_mm_fault()
5. → handle_pte_fault() → PTE present? NO → do_anonymous_page() / do_fault()?
6. → alloc_page(GFP_HIGHUSER) → returns struct page at ___
7. → mk_pte(page, vm_page_prot) → creates PTE = ___
8. → set_pte_at() → installs PTE in page table
GIVEN:
Parent PTE[100] = 0x12345_003 (PA=0x12345000, flags=003=present+write)
fork() creates child, marks PTEs read-only
Child writes to page
TASK:
1. After fork, Parent PTE[100] = 0x12345_001 (write bit cleared) ✓
2. After fork, Child PTE[100] = 0x12345_001 (same PA, read-only) ✓
3. page->_refcount = 2 (shared between parent and child)
4. Child writes → error_code = ___ (P=1, W=1, U=1) = 0x___
5. do_wp_page() checks: page_count(page) = ___ → must copy? YES/NO
6. New page allocated at PA = 0xABCDE000
7. copy_user_highpage(new, old) → copies 4096 bytes
8. Child PTE[100] = 0xABCDE_003 (new PA, writable)
9. Old page->_refcount = ___ (decremented)
GIVEN: kprobe on handle_mm_fault
handle_mm_fault(struct vm_area_struct *vma, unsigned long addr, unsigned int flags, struct pt_regs *regs)
x86_64 ABI:
arg1 = RDI, arg2 = RSI, arg3 = RDX, arg4 = RCX
TASK: Map registers to arguments
1. vma pointer = regs->___ = (struct vm_area_struct *)regs->___
2. address = regs->___ = ___
3. flags = regs->___ = ___
4. pt_regs = regs->___ = ___
GIVEN: regs->di = 0xFFFF8881_12340000, regs->si = 0x7FFE_5678_9000
5. vma = ___
6. faulting address = ___
FAILURE 1: error_code bit order wrong → misidentify fault type
FAILURE 2: Forgetting vm_end is exclusive → incorrectly say address not in VMA
FAILURE 3: Confusing P=0 (not present) with P=1 (protection fault)
FAILURE 4: x86_64 ABI: arg order RDI,RSI,RDX,RCX,R8,R9 → not RAX,RBX,RCX
FAILURE 5: COW page shared → refcount > 1 → must copy, not just make writable
FAILURE 6: After fork, PTEs point to SAME physical page, not copied
error_code = 0x7 = 0b00111
bit[0]=1 → page present (protection fault, not absent)
bit[1]=1 → write access attempted
bit[2]=1 → user mode
∴ User tried to write to present read-only page → COW fault
malloc(1GB) → returns VA 0x7F0000000000
Pages allocated at malloc? 0 pages
First write triggers fault → 1 page allocated
1GB / 4KB = 262144 page faults if fully used
Lazy allocation saves: 262144 × 4KB = 1GB RAM if never touched
IDT[14] → exc_page_fault at 0xFFFFFFFF812A0000
CR2 loaded with faulting address by CPU
Kernel stack at 0xFFFF888100001000
Handler reads CR2: asm("mov %%cr2, %0" : "=r"(addr))
Process PID=1234 with mm→pgd at 0x12340000
VMA at [0x7F0000000000, 0x7F0000100000)
Instruction at RIP=0x401234 does: MOV [0x7F0000050000], RAX
PTE for 0x7F0000050000 = 0 (not present)
→ CPU raises fault, kernel handles for PID 1234
T₁: malloc(4096), ptr=0x555555555000, no fault
T₂: ptr[0] = 'A' → fault, error_code=0x6 (P=0,W=1,U=1), do_anonymous_page()
T₃: fork(), child PTE marked read-only
T₄: child writes ptr[0] = 'B' → fault, error_code=0x7 (P=1,W=1,U=1), do_wp_page()
Process needs 1GB heap
Without demand paging: allocate 1GB immediately
= 262144 pages × alloc_page() = 262144 calls
= 262144 × 4096 bytes zeroed
Time: 262144 × 500ns = 131ms at startup
With demand paging: 0 pages at malloc, fault as needed
Startup time: ~0ms
Only pay for pages actually touched
error_code & 1 = 0 → not present → do_anonymous_page() OR do_fault()
error_code & 1 = 1 → present → do_wp_page() (COW)
error_code & 2 = 0 → read fault
error_code & 2 = 2 → write fault
error_code & 4 = 0 → kernel mode
error_code & 4 = 4 → user mode
error_code = 0x15 = 0b10101
bit0 = 1 → P=1 (present)
bit1 = 0 → W=0 (read)
bit2 = 1 → U=1 (user)
bit3 = 0 → RSVD=0
bit4 = 1 → I=1 (instruction fetch)
∴ User tried to execute from present non-executable page
CR2 = 0x7FFE_FFFF_FFFF
VMA: vm_start=0x7FFE_0000_0000, vm_end=0x7FFF_0000_0000
Check: 0x7FFE_0000_0000 ≤ 0x7FFE_FFFF_FFFF < 0x7FFF_0000_0000
0x7FFE_0000_0000 ≤ 0x7FFE_FFFF_FFFF ✓
0x7FFE_FFFF_FFFF < 0x7FFF_0000_0000 ✓
∴ Address IS in VMA
CR2 = 0x7FFE_1234_5678
VMA starts at 0x7FFE_0000_0000
Offset = 0x7FFE_1234_5678 - 0x7FFE_0000_0000 = 0x1234_5678 = 305419896 bytes
Page number = 305419896 / 4096 = 74565 (floor)
Page offset = 305419896 % 4096 = 1656 bytes into page
Before fork: page refcount = 1
After fork: parent refs + child refs = 1 + 1 = 2
mapcount: parent PTE + child PTE = 2 mappings
Child COW write:
- new page refcount = 1
- old page refcount = 2 - 1 = 1
1. Decode error_code bit-by-bit: bit0=P, bit1=W, bit2=U, bit3=RSVD, bit4=I
2. Check VMA containment: start ≤ addr < end (end exclusive!)
3. Calculate page index within the VMA: (addr - vm_start) / 4096; offset within the page: addr & 0xFFF
4. On fork: refcount → refcount+1, mark PTEs read-only
5. On COW: allocate new page, copy 4096 bytes, update child PTE
FAILURE 7: P=1 means protection fault, NOT that page allocation is needed
FAILURE 8: error_code=0x6 vs 0x7 → one bit difference changes entire path
FAILURE 9: VMA end is exclusive → addr=vm_end is OUTSIDE VMA
FAILURE 10: Must copy page data, not just update PTE → 4096 bytes moved
# Count page faults for a process
/usr/bin/time -v cat /dev/null 2>&1 | grep "Minor\|Major"
# WHAT: Minor = page in RAM but not in page table, Major = page on disk
# WHY: minor fault = just update PTE, major = disk I/O
# WHERE: exc_page_fault → do_user_addr_fault → handle_mm_fault → do_anonymous_page
# WHO: CPU raises exception, kernel handles, process waits
# WHEN: first access to each new page
# WITHOUT: all pages would be allocated at malloc() → 1GB malloc = 262144 page allocations up front
# WHICH: error_code bits determine handler path
# CALCULATION:
# Program uses 10MB heap:
# Pages = 10MB / 4KB = 2560 pages
# First access to each = 2560 minor faults
# Time per fault = ~1μs
# Total fault time = 2560 × 1μs = 2.56ms
#
# SCALE:
# Small: 1 page = 1 fault = 1μs
# Mid: 1000 pages = 1000 faults = 1ms
# Large: 1GB = 262144 pages = 262144 faults = 262ms = 0.26 seconds
# Edge: 0 pages accessed = 0 faults
# Read page fault error codes from ftrace
sudo sh -c 'echo 1 > /sys/kernel/debug/tracing/events/exceptions/page_fault_user/enable'
cat /tmp/testfile & # Trigger some faults
sudo cat /sys/kernel/debug/tracing/trace | tail -5
# ERROR CODE BREAKDOWN:
# error_code = 0x6 = 0b0110
# bit[0] = 0 → page NOT present (demand paging)
# bit[1] = 1 → WRITE access
# bit[2] = 1 → USER mode
# bit[3] = 0 → no reserved bit violation
# bit[4] = 0 → not instruction fetch
# ∴ User process wrote to non-present page
#
# error_code = 0x7 = 0b0111
# bit[0] = 1 → page IS present (COW fault)
# Rest same → write to read-only page after fork
#
# MEMORY STATE:
# ┌──────────────────────────────────────────────────────────────┐
# │ Before fault: │
# │ PTE[addr] = 0x0000000000000000 (not present) │
# │ │
# │ After do_anonymous_page(): │
# │ PTE[addr] = 0x8000000012345_067 │
# │ │ │││││ │
# │ │ ││││└─ bit0=1 present │
# │ │ │││└── bit1=1 writable │
# │ │ ││└─── bit2=1 user │
# │ │ │└──── bit5=1 accessed │
# │ │ └───── bit6=1 dirty │
# │ └───────────────────── PFN=0x12345 │
# └──────────────────────────────────────────────────────────────┘
# Create parent-child to observe COW
cat << 'EOF' > /tmp/cow_test.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>
// Demonstrates copy-on-write: after fork() the child writes to a heap page
// it shares with the parent; the parent must still see its own 'P'.
// Returns EXIT_FAILURE if malloc() or fork() fails, EXIT_SUCCESS otherwise.
int main(void) {
    char *buf = malloc(4096);
    if (buf == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }

    buf[0] = 'P'; // Parent writes
    printf("Parent: buf=%p, buf[0]='%c'\n", (void *)buf, buf[0]);
    fflush(stdout); // Keep unflushed parent output out of the child's stdio buffer

    pid_t pid = fork();
    if (pid < 0) {
        perror("fork");
        free(buf);
        return EXIT_FAILURE;
    }

    if (pid == 0) {
        printf("Child before write: buf[0]='%c'\n", buf[0]);
        buf[0] = 'C'; // This triggers COW!
        printf("Child after write: buf[0]='%c'\n", buf[0]);
        fflush(stdout); // _exit() does not flush stdio buffers
        _exit(0);
    }

    if (waitpid(pid, NULL, 0) < 0)
        perror("waitpid");
    printf("Parent after child: buf[0]='%c'\n", buf[0]);

    free(buf);
    return EXIT_SUCCESS;
}
EOF
gcc /tmp/cow_test.c -o /tmp/cow_test && /tmp/cow_test
# MEMORY DIAGRAM:
# ┌───────────────────────────────────────────────────────────────────┐
# │ BEFORE FORK: │
# │ Parent PTE → PA 0x12345000, refcount=1 │
# │ │
# │ AFTER FORK: │
# │ Parent PTE → PA 0x12345000 (read-only now!) │
# │ Child PTE → PA 0x12345000 (same page, read-only) │
# │ page->_refcount = 2 │
# │ │
# │ CHILD WRITES buf[0] = 'C': │
# │ Fault! error_code = 0x7 (present + write + user) │
# │ do_wp_page() allocates NEW page at PA 0xABCDE000 │
# │ copy_user_highpage() copies 4096 bytes │
# │ Child PTE → PA 0xABCDE000 (writable) │
# │ old page->_refcount = 2 - 1 = 1 │
# │ Parent still points to old PA 0x12345000 │
# └───────────────────────────────────────────────────────────────────┘
#
# CALCULATION:
# Parent has 1000 pages of data
# fork() creates child with 1000 PTEs pointing to SAME pages
# Memory used = still ~1000 pages (4MB), not 2000
# Child modifies 100 pages → 100 COW faults → 100 new pages
# Total after COW = 1000 + 100 = 1100 pages (4.4MB)
# Without COW: 2 × 1000 = 2000 pages (8MB) from start
# Use perf to measure page fault latency
sudo perf stat -e page-faults,minor-faults,major-faults -- \
dd if=/dev/zero of=/tmp/test bs=4k count=1000 2>&1
# CALCULATION:
# 1000 pages × 4KB = 4MB written
# First write to each page = 1000 minor faults
# Each fault: ~1μs kernel time
# Total fault overhead = 1000 × 1μs = 1ms
# DD total time ≈ 10ms
# Fault overhead = 1ms / 10ms = 10% of time
#
# SCALE:
# 1GB file = 262144 pages = 262144 faults = 262ms just for faults
# If disk write = 1GB @ 500MB/s = 2 seconds
# Fault overhead = 262ms / 2000ms = 13%
#
# PARADOX: Why not prefault all pages?
# Answer: Most pages may never be written! Lazy is better.
Q1: Why does fork() NOT copy 1GB of parent memory immediately?
CALCULATION:
Parent has 1GB mapped
copy 1GB at 10GB/s = 100ms
But fork() returns in < 1ms
∴ fork() must NOT copy data, only PTEs
PTE count = 262144 (for 1GB)
PTE size = 262144 × 8 = 2MB
Copy 2MB at 10GB/s = 0.2ms ✓
Q2: If page fault handling takes 1μs, why is malloc(1GB) fast?
ANSWER: malloc() only reserves VA, doesn't fault
Faults happen on FIRST ACCESS
If you never access page N, page N never faults
Q3: COW fault copies 4KB even if you write 1 byte. Why?
ANSWER:
Page is unit of protection (PTE granularity = 4KB)
Cannot have byte-level PTEs → one 4KB page would need 4096 PTEs × 8 bytes = 32KB of page-table entries
Compromise: copy whole page, waste up to 4095 bytes
START: CPU_EXC → Vector=14 → ERR=0x4(User) → CR2=0x400000
S1. REGS_EXTRACT: CR2=0x400000 → ARG1 ERR=0x4 → ARG2 REGS->IP=0x00401234 → ARG3
S2. VMA_LOOKUP: MM->MM_RB root=0xFFFF88801000 SEARCH(0x400000): NODE=0xFFFF88801000(RANGE=0x300000-0x500000) 0x300000 <= 0x400000 < 0x500000 ? YES FOUND_VMA = 0xFFFF88801000 VMA->VM_FLAGS = 0x100073 (READ|WRITE|MAYREAD|MAYWRITE|MAYEXEC|ACCOUNT)
S3. PERM_CHECK: ERR&2(WRITE)=0 ? YES (Read fault) VMA->VM_FLAGS&1(READ)=1 ? YES ACCESS_OK ✓
S4. PGD_WALK: MM->PGD = 0x1000_0000 INDEX = 0x400000 » 39 = 0 PGD[0] = 0x2000_0067 (PRESENT)
S5. PTE_WALK_FAIL: … PTE_ENTRY = 0x5000_0000 + (0x400000»12 & 0x1FF)*8 MEM[PTE_ENTRY] = 0 (NOT PRESENT) ∴ PAGE_FAULT_HANDLED_BY_KERNEL
S6. ALLOC_PAGE: BUDDY_ALLOC(ORDER=0) → PFN=0x99000 CLEAR_PAGE(0x99000) MK_PTE(0x99000, PROT_READ|PROT_WRITE) = 0x8000000099000067
S7. INSTALL_PTE: LOCK(PT) MEM[PTE_ENTRY] = 0x8000000099000067 UNLOCK(PT) RETURN_FROM_EXC
S8. RETRY: IRETQ → POP RIP → MOV RAX, [0x400000] (read, matching ERR=0x4) TLB_MISS → HARDWARE_WALK → FOUND_PTE EXECUTION_CONTINUES ✓
| ← Previous Lesson | Course Index | Next Lesson → |