Skip to the content.

Workingset

参考内容:

/proc/vmstat

workingset_nodes 146606
workingset_refault_anon 4079
workingset_refault_file 5127
workingset_activate_anon 0
workingset_activate_file 0
workingset_restore_anon 3594
workingset_restore_file 839
workingset_nodereclaim 51530

PG_workingset

PAGEFLAG(Workingset, workingset, PF_HEAD)
	TESTCLEARFLAG(Workingset, workingset, PF_HEAD)

可以展开为:

/*
 * Folio-based test: returns whether the PG_workingset bit is set in the
 * flags word obtained from folio_flags(folio, FOLIO_PF_HEAD).
 */
static __always_inline bool folio_test_workingset(struct folio *folio) {
  return test_bit(PG_workingset, folio_flags(folio, FOLIO_PF_HEAD));
}
/*
 * Page-based test: returns the result of test_bit() for PG_workingset on
 * the flags of the page that PF_HEAD(page, 0) resolves to.
 * NOTE(review): the second PF_HEAD argument is 0 here but 1 in the
 * set/clear variants -- presumably the policy's "enforce" switch; confirm
 * against include/linux/page-flags.h.
 */
static __always_inline int PageWorkingset(struct page *page) {
  return test_bit(PG_workingset, &PF_HEAD(page, 0)->flags);
}
/*
 * Folio-based set: sets the PG_workingset bit in the flags word obtained
 * from folio_flags(folio, FOLIO_PF_HEAD) using set_bit().
 */
static __always_inline void folio_set_workingset(struct folio *folio) {
  set_bit(PG_workingset, folio_flags(folio, FOLIO_PF_HEAD));
}
/*
 * Page-based set: sets PG_workingset on the flags of the page that
 * PF_HEAD(page, 1) resolves to.
 * NOTE(review): the trailing 1 is presumably the policy's "enforce"
 * argument for modifying accessors -- confirm in page-flags.h.
 */
static __always_inline void SetPageWorkingset(struct page *page) {
  set_bit(PG_workingset, &PF_HEAD(page, 1)->flags);
}
/*
 * Folio-based clear: clears the PG_workingset bit in the flags word
 * obtained from folio_flags(folio, FOLIO_PF_HEAD) using clear_bit().
 */
static __always_inline void folio_clear_workingset(struct folio *folio) {
  clear_bit(PG_workingset, folio_flags(folio, FOLIO_PF_HEAD));
}
/*
 * Page-based clear: clears PG_workingset on the flags of the page that
 * PF_HEAD(page, 1) resolves to.
 */
static __always_inline void ClearPageWorkingset(struct page *page) {
  clear_bit(PG_workingset, &PF_HEAD(page, 1)->flags);
}

/*
 * Folio-based test-and-clear (generated by TESTCLEARFLAG): clears
 * PG_workingset via test_and_clear_bit() and returns that call's result
 * as a bool.
 */
static __always_inline bool folio_test_clear_workingset(struct folio *folio) {
  return test_and_clear_bit(PG_workingset, folio_flags(folio, FOLIO_PF_HEAD));
}
/*
 * Page-based test-and-clear: applies test_and_clear_bit() for
 * PG_workingset to the flags of the page that PF_HEAD(page, 1) resolves
 * to and returns the result as an int.
 */
static __always_inline int TestClearPageWorkingset(struct page *page) {
  return test_and_clear_bit(PG_workingset, &PF_HEAD(page, 1)->flags);
}

working set

snapshot_refaults

prepare_scan_count 会检测是否出现了新的 refault(或者 inactive 链表是否过小),并据此设置 scan_control::may_deactivate;如果对应的 DEACTIVATE_ANON / DEACTIVATE_FILE 位被置上,那么 shrink_list 在处理 active 链表时会调用 shrink_active_list,否则只会对 inactive 链表调用 shrink_inactive_list:

	/*
	 * Target desirable inactive:active list ratios for the anon
	 * and file LRU lists.
	 */
	if (!sc->force_deactivate) {
		unsigned long refaults;

		/*
		 * When refaults are being observed, it means a new
		 * workingset is being established. Deactivate to get
		 * rid of any stale active pages quickly.
		 */
		refaults = lruvec_page_state(target_lruvec,
				WORKINGSET_ACTIVATE_ANON);
		if (refaults != target_lruvec->refaults[WORKINGSET_ANON] ||
			inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
			sc->may_deactivate |= DEACTIVATE_ANON;
		else
			sc->may_deactivate &= ~DEACTIVATE_ANON;

		refaults = lruvec_page_state(target_lruvec,
				WORKINGSET_ACTIVATE_FILE);
		if (refaults != target_lruvec->refaults[WORKINGSET_FILE] ||
		    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
			sc->may_deactivate |= DEACTIVATE_FILE;
		else
			sc->may_deactivate &= ~DEACTIVATE_FILE;
	} else
		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;

———–> 理解一下 lruvec_page_state 的含义是什么

之所以需要 flush,是因为统计数据先记录在 per-cpu 的 memcg_vmstats_percpu 中,需要向上汇总到 memcg_vmstats 之后才能读到完整的值,调用栈如下:

#0  mem_cgroup_css_rstat_flush (css=0xffff88816a1ab000, cpu=0) at include/linux/memcontrol.h:852
#1  0xffffffff811d1d92 in cgroup_rstat_flush_locked (cgrp=cgrp@entry=0xffffffff82b54d90 <cgrp_dfl_root+16>, may_sleep=may_sleep@entry=false) at kernel/cgroup/rstat.c:205
#2  0xffffffff811d1e9f in cgroup_rstat_flush_irqsafe (cgrp=0xffffffff82b54d90 <cgrp_dfl_root+16>) at kernel/cgroup/rstat.c:254
#3  0xffffffff813441e6 in __mem_cgroup_flush_stats () at mm/memcontrol.c:635
#4  0xffffffff81347d9a in mem_cgroup_flush_stats () at mm/memcontrol.c:643
#5  0xffffffff812afa64 in prepare_scan_count (sc=0xffffc900017efc68, pgdat=0xffff88823fff9000) at mm/vmscan.c:2787
#6  shrink_node (pgdat=pgdat@entry=0xffff88823fff9000, sc=sc@entry=0xffffc900017efc68) at mm/vmscan.c:6103
#7  0xffffffff812b0e10 in shrink_zones (sc=0xffffc900017efc68, zonelist=<optimized out>) at mm/vmscan.c:6343
#8  do_try_to_free_pages (zonelist=zonelist@entry=0xffff88823fffaa00, sc=sc@entry=0xffffc900017efc68) at mm/vmscan.c:6405

通过如下函数来更新 memcg_vmstats 和 memcg_vmstats_percpu 中的统计数据:

workingset_refault

#0  workingset_refault (folio=folio@entry=0xffffea00060c3b40, shadow=0x245001101) at arch/x86/include/asm/bitops.h:206
#1  0xffffffff8130886e in __read_swap_cache_async (entry=entry@entry=..., gfp_mask=gfp_mask@entry=1051850, vma=vma@entry=0xffff88816fd49000, addr=addr@entry=139841821999104, new_page_allocated=new_page_allocated@entry=0xffffc900017efd06) at mm/swap_state.c:496
#2  0xffffffff81308a86 in swap_cluster_readahead (entry=..., gfp_mask=gfp_mask@entry=1051850, vmf=vmf@entry=0xffffc900017efdf8) at mm/swap_state.c:641
#3  0xffffffff81308fcf in swapin_readahead (entry=..., entry@entry=..., gfp_mask=gfp_mask@entry=1051850, vmf=vmf@entry=0xffffc900017efdf8) at mm/swap_state.c:855
#4  0xffffffff812d834c in do_swap_page (vmf=vmf@entry=0xffffc900017efdf8) at mm/memory.c:3822
#5  0xffffffff812dcb2c in handle_pte_fault (vmf=0xffffc900017efdf8) at mm/memory.c:4959
#6  __handle_mm_fault (vma=vma@entry=0xffff88816fd49000, address=address@entry=139841821999104, flags=flags@entry=596) at mm/memory.c:5097
#7  0xffffffff812dd600 in handle_mm_fault (vma=0xffff88816fd49000, address=address@entry=139841821999104, flags=flags@entry=596, regs=regs@entry=0xffffc900017eff58) at mm/memory.c:5218
#8  0xffffffff810f3ca3 in do_user_addr_fault (regs=regs@entry=0xffffc900017eff58, error_code=error_code@entry=4, address=address@entry=139841821999104) at arch/x86/mm/fault.c:1428
#9  0xffffffff81fa7e22 in handle_page_fault (address=139841821999104, error_code=4, regs=0xffffc900017eff58) at arch/x86/mm/fault.c:1519
#10 exc_page_fault (regs=0xffffc900017eff58, error_code=4) at arch/x86/mm/fault.c:1575
#11 0xffffffff82000b62 in asm_exc_page_fault () at ./arch/x86/include/asm/idtentry.h:570
#12 0x0000000000000008 in fixed_percpu_data ()

其中会调用:

mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);

workingset_eviction

#0  workingset_eviction (folio=folio@entry=0xffffea00060c7fc0, target_memcg=target_memcg@entry=0xffff88816a1ab000) at mm/workingset.c:353
#1  0xffffffff812ab66d in __remove_mapping (mapping=0xffff8881229ac000, folio=folio@entry=0xffffea00060c7fc0, reclaimed=reclaimed@entry=true, target_memcg=0xffff88816a1ab000) at mm/vmscan.c:1351
#2  0xffffffff812adb6a in shrink_folio_list (folio_list=folio_list@entry=0xffffc900017ef928, pgdat=pgdat@entry=0xffff88823fff9000, sc=sc@entry=0xffffc900017efb00, stat=stat@entry=0xffffc900017ef9b0, ignore_references=ignore_references@entry=false) at mm/vmscan.c:2016
#3  0xffffffff812af3b8 in shrink_inactive_list (lru=LRU_INACTIVE_ANON, sc=0xffffc900017efb00, lruvec=0xffff88816a18d800, nr_to_scan=<optimized out>) at mm/vmscan.c:2489
#4  shrink_list (sc=0xffffc900017efb00, lruvec=0xffff88816a18d800, nr_to_scan=<optimized out>, lru=LRU_INACTIVE_ANON) at mm/vmscan.c:2716

总结:大致了解了流程,但不理解的是:shadow entry 对应的 page 之前已经被释放,之后重新读入的也不是同一个 page,为什么 shadow 仍然有意义?下面是 add_to_swap_cache 中获取 shadow 的逻辑:

	do {
		xas_lock_irq(&xas);
		xas_create_range(&xas);
		if (xas_error(&xas))
			goto unlock;
		for (i = 0; i < nr; i++) {
			VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio);
			old = xas_load(&xas);
			if (xa_is_value(old)) {
				if (shadowp)
					*shadowp = old;
			}
			set_page_private(folio_page(folio, i), entry.val + i);
			xas_store(&xas, folio);
			xas_next(&xas);
		}
		address_space->nrpages += nr;
		__node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
		__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

这里到底在查询什么?

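问了下 ChatGPT,回答如下(注意:该回答有多处不准确——shadow entry 存放在 page cache / swap cache 的 xarray 中,而不是进程页表里;workingset_nodes 统计的是当前只包含 shadow entry 的 xarray 节点数量,而不是开机以来的累计值;内核中也不存在 CONFIG_WORKINGSET 选项):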

/proc/vmstat is a file in the Linux proc filesystem that provides various statistics about the virtual memory subsystem. It includes counters related to memory management, such as page allocation, swapping, and page faults.

workingset_nodes is one of the counters in /proc/vmstat, and it refers to the number of shadow nodes created for the working set management mechanism. In the context of Linux kernel memory management, the working set refers to the set of pages that are actively in use by a process or the system at a given time.

The Linux kernel uses a technique called "shadow nodes" to track the working set of processes more efficiently. A shadow node represents a part of a process's address space that has been evicted from memory but is still being tracked. The kernel maintains these nodes to make better decisions about which pages to evict from memory when under memory pressure.

When a page is evicted, its corresponding entry in the process's page table is replaced with a "shadow entry," and a shadow node is created to store information about the evicted page. If the process later tries to access the evicted page, a page fault occurs, and the kernel can use the information in the shadow node to decide whether to bring the page back into memory or to evict another page.

The workingset_nodes counter in /proc/vmstat provides the total number of shadow nodes created since the system was booted. This number can be useful to understand how often pages are being evicted and whether the system is under memory pressure.

Keep in mind that the workingset_nodes counter may not be present on all systems, as it depends on the kernel configuration and version. In some cases, you may need to enable the CONFIG_WORKINGSET kernel configuration option to see this counter.

例如这个

snapshot_refaults 做啥的

本站所有文章转发 CSDN 将按侵权追究法律责任,其它情况随意。