Skip to the content.

Page Table Flags

一共都有那些 flags ?

pte ,还可以 huge 吗?

static inline pte_t pte_mkhuge(pte_t pte)
{
	return pte_set_flags(pte, _PAGE_PSE);
}

为什么要这么实现?

pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHADOW_STACK) // 先不看这个 security 的东西了
		return pte_mkwrite_shstk(pte);

	pte = pte_mkwrite_novma(pte); // 什么叫做 novma ?

	return pte_clear_saveddirty(pte); // 多此一举吗?
}

看看 arch/x86/include/asm/pgtable.h

static inline pte_t pte_mksaveddirty(pte_t pte)
{
	pteval_t v = native_pte_val(pte);

	v = mksaveddirty_shift(v);
	return native_make_pte(v);
}

pte_mkdirty

这个容易理解:

/*
 * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
 * servicing faults for write access.  In the normal case, do always want
 * pte_mkwrite.  But get_user_pages can cause write faults for mappings
 * that do not have writing enabled, when used by access_process_vm.
 */
static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pte = pte_mkwrite(pte, vma);
	return pte;
}

pte_mkdirty 的调用位置主要是在 page fault 当分配新的 page 的时候, 立刻标记上 dirty

static inline pte_t pte_mkdirty(pte_t pte)
{
	pte = pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);

	return pte_mksaveddirty(pte);
}

要记住,之所以可以 pte_mkdirty ,是由于

@[
    pte_mkdirty+101
    do_swap_page+1740
    handle_mm_fault+2151
    do_user_addr_fault+780
    exc_page_fault+117
    asm_exc_page_fault+38
    copy_fpstate_to_sigframe+426
    get_sigframe+442
    x64_setup_rt_frame+121
    arch_do_signal_or_restart+306
    syscall_exit_to_user_mode+85
    do_syscall_64+250
    entry_SYSCALL_64_after_hwframe+119
]: 1

@[
    wp_page_reuse+203
    do_wp_page+1672
    handle_mm_fault+2657
    do_user_addr_fault+780
    exc_page_fault+117
    asm_exc_page_fault+38
    filldir64+208
    ext4_readdir+2428
    iterate_dir+123
    __se_sys_getdents64+105
    do_syscall_64+237
    entry_SYSCALL_64_after_hwframe+119
]: 2

pte_mkclean : clean 的位置

任何时候,都不会给 anonymous page 来 mkclean 吧 应该是的,使用 stress-ng 制作 swap ,只能观测到

🧀  sudo bpftrace -e "tracepoint:kmem:hi { @[kstack] = count(); }"
Attaching 1 probe...
^C

@[
    xueshi+73
    page_vma_mkclean_one+231
    page_mkclean_one+142
    rmap_walk_file+307
    folio_mkclean+184
    folio_clear_dirty_for_io+93
    mpage_prepare_extent_to_map+704
    ext4_do_writepages+865
    ext4_normal_submit_inode_data_buffers+238
    jbd2_journal_commit_transaction+1242
    kjournald2+176
    kthread+248
    ret_from_fork+55
    ret_from_fork_asm+26
]: 8

folio_mark_dirty 对比分析

这调用位置,只能说,只能说,这些调用位置都是预期之外的:

@[
    folio_mark_dirty+5
    fault_dirty_shared_page+74 // 被 do_shared_fault 和 wp_page_shared 无条件调用
    do_pte_missing+1014
    handle_mm_fault+2183
    __get_user_pages+1100
    get_user_pages_unlocked+264
    hva_to_pfn+264
    gfn_to_page+14
    kvm_alloc_apic_access_page+101
    vmx_vcpu_create+565
    kvm_arch_vcpu_create+471
    kvm_vm_ioctl_create_vcpu+348
    kvm_vm_ioctl+1804
    __se_sys_ioctl+110
    do_syscall_64+237
    entry_SYSCALL_64_after_hwframe+119
]: 1
@[
    folio_mark_dirty+5
    filemap_page_mkwrite+349
    do_wp_page+2066
    handle_mm_fault+2657
    do_user_addr_fault+504
    exc_page_fault+117
    asm_exc_page_fault+38
]: 1
@[
    folio_mark_dirty+5 // 为什么 unmap 的也需要 ?
    unmap_page_range+2762
    unmap_vmas+241
    exit_mmap+498
    __mmput+67
    exit_mm+223
    do_exit+512
    do_group_exit+140
    __x64_sys_exit_group+23
    __pfx_ia32_emulation_override_cmdline+0
    do_syscall_64+237
    entry_SYSCALL_64_after_hwframe+119
]: 2
@[
    folio_mark_dirty+5
    unmap_page_range+4535
    unmap_vmas+241
    unmap_region+338
    do_vmi_align_munmap+946
    do_vmi_munmap+263
    __vm_munmap+199
    __x64_sys_munmap+27
    do_syscall_64+237
    entry_SYSCALL_64_after_hwframe+119
]: 3
@[
    folio_mark_dirty+5
    shmem_swapin_folio+1047
    shmem_get_folio_gfp+151
    shmem_fault+134
    __do_fault+67
    do_pte_missing+354
    handle_mm_fault+2183
    do_user_addr_fault+504
    exc_page_fault+117
    asm_exc_page_fault+38
]: 4
@[
    folio_mark_dirty+5
    fault_dirty_shared_page+74
    do_pte_missing+1014
    handle_mm_fault+2183
    do_user_addr_fault+780
    exc_page_fault+117
    asm_exc_page_fault+38
]: 5
@[
    folio_mark_dirty+5 // .. swap 进来,那么立刻标记下
    shmem_swapin_folio+1047
    shmem_get_folio_gfp+151
    shmem_fault+134
    __do_fault+67
    do_pte_missing+831
    handle_mm_fault+2183
    do_user_addr_fault+504
    exc_page_fault+117
    asm_exc_page_fault+38
]: 7
@[
    folio_mark_dirty+5
    unmap_page_range+2762
    unmap_vmas+241
    exit_mmap+498
    __mmput+67
    exit_mm+223
    do_exit+290
    do_group_exit+140
    __x64_sys_exit_group+23
    __pfx_ia32_emulation_override_cmdline+0
    do_syscall_64+237
    entry_SYSCALL_64_after_hwframe+119
]: 8
@[
    folio_mark_dirty+5
    unmap_page_range+2762
    unmap_vmas+241
    exit_mmap+498
    __mmput+67
    exit_mm+223
    do_exit+512
    do_group_exit+120
    get_signal+1710
    arch_do_signal_or_restart+142
    irqentry_exit_to_user_mode+75
    asm_sysvec_reschedule_ipi+26
]: 16
@[
    folio_mark_dirty+5
    shmem_write_end+367
    generic_perform_write+425
    shmem_file_write_iter+107
    vfs_write+935
    ksys_write+114
    do_syscall_64+237
    entry_SYSCALL_64_after_hwframe+119
]: 18
@[
    folio_mark_dirty+5
    unmap_page_range+2762
    unmap_vmas+241
    exit_mmap+498
    __mmput+67
    exit_mm+223
    do_exit+512
    vhost_task_fn+285
    ret_from_fork+55
    ret_from_fork_asm+26
]: 31
@[
    folio_mark_dirty+5
    unmap_page_range+2762
    unmap_vmas+241
    unmap_region+338
    do_vmi_align_munmap+946
    do_vmi_munmap+263
    __vm_munmap+199
    __x64_sys_munmap+27
    do_syscall_64+237
    entry_SYSCALL_64_after_hwframe+119
]: 37
@[
    folio_mark_dirty+5
    bio_set_pages_dirty+224
    iomap_dio_bio_iter+786
    __iomap_dio_rw+680
    iomap_dio_rw+18
    ext4_file_read_iter+234
    vfs_read+750
    __x64_sys_pread64+109
    do_syscall_64+237
    entry_SYSCALL_64_after_hwframe+119
]: 37
@[
    folio_mark_dirty+5
    fault_dirty_shared_page+74
    do_pte_missing+1014
    handle_mm_fault+2183
    do_user_addr_fault+504
    exc_page_fault+117
    asm_exc_page_fault+38
]: 49
@[
    folio_mark_dirty+5
    unmap_page_range+2762
    unmap_vmas+241
    exit_mmap+498
    __mmput+67
    exit_mm+223
    do_exit+512
    do_group_exit+120
    get_signal+1710
    arch_do_signal_or_restart+142
    syscall_exit_to_user_mode+85
    do_syscall_64+250
    entry_SYSCALL_64_after_hwframe+119
]: 82
@[
    folio_mark_dirty+5
    block_page_mkwrite+344
    ext4_page_mkwrite+851
    do_wp_page+2066
    handle_mm_fault+2657
    do_user_addr_fault+504
    exc_page_fault+117
    asm_exc_page_fault+38
]: 132
@[
    folio_mark_dirty+5
    fault_dirty_shared_page+74
    do_wp_page+2124
    handle_mm_fault+2657
    do_user_addr_fault+504
    exc_page_fault+117
    asm_exc_page_fault+38
]: 133
@[
    folio_mark_dirty+5
    try_to_migrate_one+906
    rmap_walk_anon+319
    try_to_migrate+214
    migrate_pages_batch+944
    migrate_pages+1911
    compact_zone+3555
    compact_node+195
    kcompactd+1397
    kthread+248
    ret_from_fork+55
    ret_from_fork_asm+26
]: 1945
@[
    folio_mark_dirty+5 // 都 dio ,为什么还需要 mark ? 看看 bio_set_pages_dirty 的 comment
    bio_set_pages_dirty+224
    iomap_dio_bio_iter+786
    __iomap_dio_rw+680
    iomap_dio_rw+18
    ext4_file_read_iter+234
    aio_read+456
    io_submit_one+1608
    __se_sys_io_submit+189
    do_syscall_64+237
    entry_SYSCALL_64_after_hwframe+119
]: 11920

从 pte dirty 到 folio dirty 的传播

几乎都发生到 rmap 中,非常合理

		/* Set the dirty flag on the folio now the pte is gone. */
		if (pte_dirty(pteval))
			folio_mark_dirty(folio);

PG_dirty 的作用

案例 1 : 从 block_dirty_folio 中可以看到,检查了有这个 flag ,如果是 newly dirty ,那么才去 到 address space 中去标记这些东西:

	if (newly_dirty)
		__folio_mark_dirty(folio, mapping, 1);

本站所有文章转发 CSDN 将按侵权追究法律责任,其它情况随意。