Skip to the content.

shmem

KeyNote

[ ] 验证一下这个说法

[ ] 观察下 tmpfs 的 mount option

  1. 增加上 noswap 之后,结果会如何?

inode operations

shmem_inode_operations 应该是给 file 使用的,所以创建文件之类的操作都是没有的:

#0  shmem_create (mnt_userns=0xffffffff82a61920 <init_user_ns>, dir=0xffff8881212a8390, dentry=0xffff888122a2c780, mode=33206, excl=false) at mm/shmem.c:2952
#1  0xffffffff8135a878 in lookup_open (op=0xffffc900017bbedc, op=0xffffc900017bbedc, got_write=true, file=0xffff8881262d0300, nd=0xffffc900017bbdc0) at fs/namei.c:3413
#2  open_last_lookups (op=0xffffc900017bbedc, file=0xffff8881262d0300, nd=0xffffc900017bbdc0) at fs/namei.c:3481
#3  path_openat (nd=nd@entry=0xffffc900017bbdc0, op=op@entry=0xffffc900017bbedc, flags=flags@entry=65) at fs/namei.c:3688
#4  0xffffffff8135b9ed in do_filp_open (dfd=dfd@entry=-100, pathname=pathname@entry=0xffff888100f3c000, op=op@entry=0xffffc900017bbedc) at fs/namei.c:3718
#5  0xffffffff813455b5 in do_sys_openat2 (dfd=-100, filename=<optimized out>, how=how@entry=0xffffc900017bbf18) at fs/open.c:1311
#6  0xffffffff81345aae in do_sys_open (mode=<optimized out>, flags=<optimized out>, filename=<optimized out>, dfd=<optimized out>) at fs/open.c:1327
#7  __do_sys_openat (mode=<optimized out>, flags=<optimized out>, filename=<optimized out>, dfd=<optimized out>) at fs/open.c:1343
#8  __se_sys_openat (mode=<optimized out>, flags=<optimized out>, filename=<optimized out>, dfd=<optimized out>) at fs/open.c:1338
#9  __x64_sys_openat (regs=<optimized out>) at fs/open.c:1338
#10 0xffffffff81edbcf8 in do_syscall_x64 (nr=<optimized out>, regs=0xffffc900017bbf58) at arch/x86/entry/common.c:50
#0  shmem_mkdir (mnt_userns=0xffffffff82a61920 <init_user_ns>, dir=0xffff8881212a8390, dentry=0xffff888121a36000, mode=511) at mm/shmem.c:2942
#1  0xffffffff81356eec in vfs_mkdir (mnt_userns=0xffffffff82a61920 <init_user_ns>, dir=0xffff8881212a8390, dentry=dentry@entry=0xffff888121a36000, mode=<optimized out>, mode@entry=511) at fs/namei.c:4013
#2  0xffffffff8135bcf1 in do_mkdirat (dfd=dfd@entry=-100, name=0xffff88822120d000, mode=mode@entry=511) at fs/namei.c:4038
#3  0xffffffff8135bee3 in __do_sys_mkdir (mode=<optimized out>, pathname=<optimized out>) at fs/namei.c:4058
#4  __se_sys_mkdir (mode=<optimized out>, pathname=<optimized out>) at fs/namei.c:4056
#5  __x64_sys_mkdir (regs=<optimized out>) at fs/namei.c:4056
#6  0xffffffff81edbcf8 in do_syscall_x64 (nr=<optimized out>, regs=0xffffc900017cbf58) at arch/x86/entry/common.c:50
#7  do_syscall_64 (regs=0xffffc900017cbf58, nr=<optimized out>) at arch/x86/entry/common.c:80

因为 tmpfs 的 inode 信息都是不能写回的,所以必然需要额外的一个机制来处理 inode 的。

unsigned long shmem_get_unmapped_area(struct file *file, // todo 并不知道其作用是什么 ?
				      unsigned long uaddr, unsigned long len,
				      unsigned long pgoff, unsigned long flags)
	get_area = current->mm->get_unmapped_area; // 实际上的工作被 mm_struct 的 get_unmapped_area

static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) // todo 取决于 whence 调用下面两个函数
    1. loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof) : whence != SEEK_DATA && whence != SEEK_HOLE
    2. shmem_seek_hole_data : whence == SEEK_DATA || whence == SEEK_HOLE // todo 感觉可以 man 到 SEEK_HOLE 和 SEEK_DATA

file operations

和 huge 的关系

只是和 transparent huge page 有关:

如何利用 shmem 实现 tmpfs 的

需要额外支持:

其实,主要需要构建文件的操作,如果不需要 tmpfs 的结构,那就是 shm 的行为,所有数据都在一起。

struct address_space_operations shmem_aops

static const struct address_space_operations shmem_aops = {
	.writepage	= shmem_writepage,
	.set_page_dirty	= __set_page_dirty_no_writeback,
#ifdef CONFIG_TMPFS
	.write_begin	= shmem_write_begin,
	.write_end	= shmem_write_end,
#endif
#ifdef CONFIG_MIGRATION
	.migratepage	= migrate_page, // 通用函数
#endif
	.error_remove_page = generic_error_remove_page, // 通用函数
};
  1. shmem_writepage : used to write to the swap cache ! In fact, the swap cache looks like a regular file
  2. shmem_write_begin, shmem_write_end : they work with generic_file_write_iter, which is assigned to shmem_file_operations::write_iter
    1. shmem_write_begin : shmem_getpage
    2. shmem_write_end : do something clean up :
    3. in generic_perform_write, between shmem_write_begin and shmem_write_end, iov_iter_copy_from_user_atomic should be mentioned ! @todo

具体的读写并不是在此处

  1. shmem_write_begin 检查 flag
  2. shmem_write_end
    1. set_page_dirty
    2. put_page @todo 似乎是 swap.c 中间修改的 page 状态,和 lruvec 有关的

swap

和 sysv share memory 的关系

#0  shmem_get_inode (sb=0xffff888100058800, dir=dir@entry=0x0 <fixed_percpu_data>, mode=mode@entry=33279, dev=dev@entry=0, flags=flags@entry=0) at mm/shmem.c:156
#1  0xffffffff8129a59f in __shmem_file_setup (i_flags=512, flags=0, size=4096, name=0xffffc90001843e4b "SYSV00000000", mnt=0xffff8882000b4020) at mm/shmem.c:4162
#2  __shmem_file_setup (mnt=0xffff8882000b4020, name=0xffffc90001843e4b "SYSV00000000", size=4096, flags=0, i_flags=512) at mm/shmem.c:4147
#3  0xffffffff815a8b6a in newseg (ns=0xffffffff82bef240 <init_ipc_ns>, params=<optimized out>) at ipc/shm.c:751
#4  0xffffffff815a092d in ipcget_new (params=0xffffc90001843f20, ops=0xffffffff82474120 <shm_ops>, ids=0xffffffff82bef3f0 <init_ipc_ns+432>, ns=0xffffffff82bef240 <init_ipc_ns>) at ipc/util.c:345
#5  ipcget (ns=0xffffffff82bef240 <init_ipc_ns>, ids=0xffffffff82bef3f0 <init_ipc_ns+432>, ops=ops@entry=0xffffffff82474120 <shm_ops>, params=params@entry=0xffffc90001843f20) at ipc/util.c:677
#6  0xffffffff815a85f7 in ksys_shmget (shmflg=<optimized out>, size=<optimized out>, key=<optimized out>) at ipc/shm.c:831
#7  __do_sys_shmget (shmflg=<optimized out>, size=<optimized out>, key=<optimized out>) at ipc/shm.c:836
#8  __se_sys_shmget (shmflg=<optimized out>, size=<optimized out>, key=<optimized out>) at ipc/shm.c:834
#9  __x64_sys_shmget (regs=<optimized out>) at ipc/shm.c:834
#10 0xffffffff81edbcf8 in do_syscall_x64 (nr=<optimized out>, regs=0xffffc90001843f58) at arch/x86/entry/common.c:50
#0  shmem_kernel_file_setup (name=0xffffc90001c3be4b "SYSV00000000", size=4096, flags=0) at mm/shmem.c:4192
#1  0xffffffff815a8b6a in newseg (ns=0xffffffff82bef240 <init_ipc_ns>, params=<optimized out>) at ipc/shm.c:751
#2  0xffffffff815a092d in ipcget_new (params=0xffffc90001c3bf20, ops=0xffffffff82474120 <shm_ops>, ids=0xffffffff82bef3f0 <init_ipc_ns+432>, ns=0xffffffff82bef240 <init_ipc_ns>) at ipc/util.c:345
#3  ipcget (ns=0xffffffff82bef240 <init_ipc_ns>, ids=0xffffffff82bef3f0 <init_ipc_ns+432>, ops=ops@entry=0xffffffff82474120 <shm_ops>, params=params@entry=0xffffc90001c3bf20) at ipc/util.c:677
#4  0xffffffff815a85f7 in ksys_shmget (shmflg=<optimized out>, size=<optimized out>, key=<optimized out>) at ipc/shm.c:831
#5  __do_sys_shmget (shmflg=<optimized out>, size=<optimized out>, key=<optimized out>) at ipc/shm.c:836
#6  __se_sys_shmget (shmflg=<optimized out>, size=<optimized out>, key=<optimized out>) at ipc/shm.c:834
#7  __x64_sys_shmget (regs=<optimized out>) at ipc/shm.c:834
#8  0xffffffff81edbcf8 in do_syscall_x64 (nr=<optimized out>, regs=0xffffc90001c3bf58) at arch/x86/entry/common.c:50
#9  do_syscall_64 (regs=0xffffc90001c3bf58, nr=<optimized out>) at arch/x86/entry/common.c:80
#10 0xffffffff8200009b in entry_SYSCALL_64 () at arch/x86/entry/entry_64.S:120

shmem

1 When pages within a VMA are backed by a file on disk, the interface used is straight-forward. To read a page during a page fault, the required nopage() function is found in vm_area_struct->vm_ops. To write a page to backing storage, the appropriate writepage() function is found in the address_space_operations via inode->i_mapping->a_ops or alternatively via page->mapping->a_ops. When normal file operations are taking place such as mmap(), read() and write(), the struct file_operations with the appropriate functions is found via inode->i_fop and so on. These relationships were illustrated in Figure 4.2.

This is a very clean interface that is conceptually easy to understand but it does not help anonymous pages as there is no file backing. To keep this nice interface, Linux creates an artificial file-backing for anonymous pages using a RAM-based filesystem where each VMA is backed by a “file” in this filesystem.

总结:

  1. 为了 tmpfs 建立的配套机制
  2. fallocate : hole
  3. 和 swap 的紧密联系
  4. transparent huge page

问题 1: shmem 和 swap 的联系有哪些 ?

  1. shmem_swapin_page : 如果 lookup_swap_cache 找不到,那么 shmem_swapin,找到 shmem_add_to_page_cache + delete_from_swap_cache

问题 2: shmem 上是如何构建 /tmp 的 ? 问题 3: shmem 定义了大量齐全的文件系统的接口,为什么是这样的 ?

问题 4: 为什么 minfs 和 myfs 都是没有注册 vm_operations_struct 的,但是依旧可以正常的工作 ? 是不是因为 vm_operations_struct 仅仅限于 mmap 以及其延伸的 page fault ?

并不是,使用的是 generic_file_mmap,所以整个机制都是采用

问题 6: sysv 和 posix 如何利用 shmem 实现的 ? 问题 7: 是不是 ramfs 和 shmem 的唯一区别在于,ramfs 不会将其数据备份到 swap 中间 ? 比较一下 ramfs 的 getpage 和 shmem_getpage !

问题分析 1: pgfault 的流程,由于 page fault 不需要访问磁盘,所以其过程只是需要分配 page 物理页面即可。

  1. shmem_falloc // TODO 有点难以理解
  2. shmem_getpage_gfp - find page in cache, or get from swap, or allocate.

问题分析 2: 为什么 shmem 依旧需要 page cache ? 因为一般来说,page cache 用于加速访问磁盘,可是 shmem 是基于内存的呀 ? 需要 page cache 提供的基础设施,比如两个进程的 vma 映射了同一个 /tmp/a.md 的内容,那么第一个 page fault 创建了文件,第二个就可以从 page cache 提供的 radix tree 中间找到需要的 page。如果不需要加速访问,那么提供一个蛇皮的 file_operations 和 address_space_operations,不进行 page writeback 操作即可。 shmem_writepage : get_swap_page 获取 swp_entry_t,将 page 和 swp_entry_t 添加到 add_to_swap_cache,并且调用 swap_writepage 将其写入到 swap 中间。(这么说,swap 可以实现 /tmp 的内容永久存在)

static const struct address_space_operations shmem_aops = {
    // TODO 为什么没有 readpage ?
  .writepage  = shmem_writepage,
  .set_page_dirty = __set_page_dirty_no_writeback,
#ifdef CONFIG_TMPFS
  // 为了实现 generic_file_write_iter,在进行拷贝前后使用,用于从 page cache 中间找到正确的 page
  .write_begin  = shmem_write_begin, // shmem_getpage
  .write_end  = shmem_write_end, // SetPageUptodate set_page_dirty
#endif
#ifdef CONFIG_MIGRATION
  .migratepage  = migrate_page,
#endif
  .error_remove_page = generic_error_remove_page,
};

// shmem 的基础配置可以实现什么功能 ?
// posix 以及 sysv  的 shmem,但是它们是靠什么函数进行 IO 的 ?
// 难道 /tmp 和 ramfs 的功能不是重复的吗 ? line 4085 的 CONFIG_SHMEM 似乎说明了很多东西
static const struct file_operations shmem_file_operations = {
  .mmap   = shmem_mmap,
  .get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
  .llseek   = shmem_file_llseek,
  .read_iter  = shmem_file_read_iter,// shmem_getpage + copy_page_to_iter
  .write_iter = generic_file_write_iter, // 就是通用的写操作
  .fsync    = noop_fsync,
  .splice_read  = generic_file_splice_read,
  .splice_write = iter_file_splice_write,
  .fallocate  = shmem_fallocate, // TODO 又是这个
#endif
};

总结,shmem_getpage 是核心,read 使用从 page cache 或者 swap cache ,甚至 swap 中间找。

问题分析 3: tmpfs 的文件操作,看上去和 ext2 没有什么区别啊!

Huge page is represented by HPAGE_PMD_NR entries in radix-tree.

Understanding the Linux Virtual Memory Management

ch12 Shared Memory Virtual Filesystem

Sharing a region of memory backed by a file or device is simply a case of calling mmap() with the MAP_SHARED flag. However, there are two important cases where an anonymous region needs to be shared between processes. The first is when mmap() is called with MAP_SHARED but no file backing. These regions will be shared between a parent and child process after a fork() is executed. The second is when a region is explicitly set up with shmget() and attached to the virtual address space with shmat().

@todo shmat and shmget, learn it with tlpi !

Every inode in the filesystem is placed on a linked list called shmem_inodes so that they may always be easily located. This allows the same file-based interface to be used without treating anonymous pages as a special case.

tmpfs is cool : same interface with regular file, and share files

The filesystem comes in two variations called shm and tmpfs. They both share core functionality and mainly differ in what they are used for. shm is for use by the kernel for creating file backings for anonymous pages and for backing regions created by shmget(). This filesystem is mounted by kern_mount() so that it is mounted internally and not visible to users. tmpfs is a temporary filesystem that may be optionally mounted on /tmp/ to have a fast RAM-based temporary filesystem. A secondary use for tmpfs is to mount it on /dev/shm/. Processes that mmap() files in the tmpfs filesystem will be able to share information between them as an alternative to System V IPC mechanisms. Regardless of the type of use, tmpfs must be explicitly mounted by the system administrator.

shm and tmpfs : shm : shmget(), kern_mount, not visible to users tmpfs : mounted on /tmp and /dev/shm

12.1 Initialising the Virtual Filesystem

  1. init_tmpfs : replaced by shmem_init in current kernels
  2. struct shmem_inode_info

12.2 Using shmem Functions

introduce all the operations

Different structs contain pointers for shmem specific functions. In all cases, tmpfs and shm share the same structs.

@todo where is the evidence of tmpfs and shm

For faulting in pages and writing them to backing storage, two structs called shmem_aops and shmem_vm_ops of type struct address_space_operations and struct vm_operations_struct respectively are declared.

12.3 Creating Files in tmpfs

12.4 Page Faulting within a Virtual File

12.4.1 Locating Swapped Pages
12.4.2 Writing Pages to Swap

what’s framework : shmem_getpage 1. find_lock_entry 2. shmem_swapin_page 3. shmem_alloc_and_acct_page

  1. shmem_aops doesn’t contain a readpage, but there is a shmem_writepage; is it asymmetric ?
    1. pgfault happens for mapped area
    2. even a file in /tmp may be mmap(2)-ed into user space, that’s why we need shmem_pg
    3. shmem_writepage is needed because of page reclaim
    4. what if a page is reclaimed and swapped out into the cache ? the swap in will be processed by the shmem_getpage !

12.5 File Operations in tmpfs

easy

12.6 Inode Operations in tmpfs

inode_operations doesn’t contain shmem_truncate()

12.7 Setting up Shared Regions

验证一下这个项目

https://unix.stackexchange.com/questions/348464/if-i-mmap-a-file-from-tmpfs-will-it-double-the-memory-usage

shmem_mmap

static const struct file_operations shmem_file_operations = {
	.mmap		= shmem_mmap,

根据 nlink ,采取分别注册如下不同的 hook

static const struct vm_operations_struct shmem_vm_ops = {
	.fault		= shmem_fault,
	.map_pages	= filemap_map_pages,
#ifdef CONFIG_NUMA
	.set_policy     = shmem_set_policy,
	.get_policy     = shmem_get_policy,
#endif
};

static const struct vm_operations_struct shmem_anon_vm_ops = {
	.fault		= shmem_fault,
	.map_pages	= filemap_map_pages,
#ifdef CONFIG_NUMA
	.set_policy     = shmem_set_policy,
	.get_policy     = shmem_get_policy,
#endif
};

内容完全相同,只是为了名称

bool vma_is_anon_shmem(struct vm_area_struct *vma)
{
	return vma->vm_ops == &shmem_anon_vm_ops;
}
History:        #0
Commit:         d09e8ca6cb93bb4b97517a18fbbf7eccb0e9ff43
Author:         Pasha Tatashin <pasha.tatashin@soleen.com>
Committer:      Andrew Morton <akpm@linux-foundation.org>
Author Date:    2022年11月15日 星期二 10时06分01秒
Committer Date: 2022年12月01日 星期四 07时58分55秒

mm: anonymous shared memory naming

Since commit 9a10064f5625 ("mm: add a field to store names for private
anonymous memory"), name for private anonymous memory, but not shared
anonymous, can be set.  However, naming shared anonymous memory just as
useful for tracking purposes.

Extend the functionality to be able to set names for shared anon.

There are two ways to create anonymous shared memory, using memfd or
directly via mmap():
1. fd = memfd_create(...)
   mem = mmap(..., MAP_SHARED, fd, ...)
2. mem = mmap(..., MAP_SHARED | MAP_ANONYMOUS, -1, ...)

In both cases the anonymous shared memory is created the same way by
mapping an unlinked file on tmpfs.

The memfd way allows to give a name for anonymous shared memory, but
not useful when parts of shared memory require to have distinct names.

Example use case: The VMM maps VM memory as anonymous shared memory (not
private because VMM is sandboxed and drivers are running in their own
processes).  However, the VM tells back to the VMM how parts of the memory
are actually used by the guest, how each of the segments should be backed
(i.e.  4K pages, 2M pages), and some other information about the segments.
The naming allows us to monitor the effective memory footprint for each
of these segments from the host without looking inside the guest.

Sample output:
  /* Create shared anonymous segmenet */
  anon_shmem = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
                    MAP_SHARED | MAP_ANONYMOUS, -1, 0);
  /* Name the segment: "MY-NAME" */
  rv = prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
             anon_shmem, SIZE, "MY-NAME");

cat /proc/<pid>/maps (and smaps):
7fc8e2b4c000-7fc8f2b4c000 rw-s 00000000 00:01 1024 [anon_shmem:MY-NAME]

If the segment is not named, the output is:
7fc8e2b4c000-7fc8f2b4c000 rw-s 00000000 00:01 1024 /dev/zero (deleted)

Link: https://lkml.kernel.org/r/20221115020602.804224-1-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Acked-by: David Hildenbrand <david@redhat.com>

[x] 为什么 shmem 需要注册 noop_dirty_folio

基本的疑问是,shmem 应该是上的所有的

因为,当 shmem 放到 swap cache 中的时候,他是可以不用 dirty 的

证据是,write_cache_pages 的时候,首先会清理掉 dirty ,然后

当 shmem_writepage 失败之后,那么重新将 page 设置为 dirty 的

什么时候会调用 shmem_writepage

既然明明知道 shmem 是无法调用下去的,为什么还是在存在写

跟踪 shmem_writepage 的调用者

  1. vmscan 中, pageout 的时候,调用 folio_clear_dirty_for_io ,这是唯一有用的地方
  2. migration 中
    • fallback_migrate_folio
    • writeout
      • address_space_operations::writepage 但是这里不会调用到 shmem_writepage ,因为这里是如果没有注册 migrate_folio, 可以不用搬移,直接写回去,对于文件系统而言,这个非常常见
  3. 为什么 fsync 的系统调用会放过 shmem ?
  1. 为什么周期性的 writeback 会放过 shmem ?
    • 那是根据 disk 来的,但是 shmem 没有 disk 的

shmem_writepage 和 shmem_write_end 的区别

#0  shmem_write_end (file=0xffff88810e4a7b00, mapping=0xffff88810e97e100, pos=0, len=230, copied=230, page=0xffffea000447bd80, fsdata=0x0 <fixed_percpu_data>) at mm/shmem.c:2593
#1  0xffffffff81346243 in generic_perform_write (iocb=0xffffc90004fffe98, i=0xffffc90004fffe70) at mm/filemap.c:3934
#2  0xffffffff8134c9cd in __generic_file_write_iter (iocb=iocb@entry=0xffffc90004fffe98, from=from@entry=0xffffc90004fffe70) at mm/filemap.c:4018
#3  0xffffffff8134ca45 in generic_file_write_iter (iocb=0xffffc90004fffe98, from=0xffffc90004fffe70) at mm/filemap.c:4044
#4  0xffffffff81442bfa in call_write_iter (iter=0xffff88810e97e100, kio=0xffffc90004fffe98, file=0xffff88810e4a7b00) at ./include/linux/fs.h:1877

参考 Documentation/filesystems/vfs.rst

一个细节问题

在 collapse_file() 中

	if (is_shmem)
		folio_mark_dirty(folio);

最开始是:commit 99cb0dbd47a1 (“mm,thp: add read-only THP support for (non-shmem) FS”)

这个路径真的经典啊:

[2080382.104706] Call Trace:
[2080382.104709]  <TASK>
[2080382.104711]  dump_stack_lvl+0x91/0xb0
[2080382.104716]  dump_header+0x44/0x1b0
[2080382.104719]  oom_kill_process+0xfa/0x200
[2080382.104721]  out_of_memory+0x248/0x580
[2080382.104724]  mem_cgroup_out_of_memory+0x12a/0x140
[2080382.104727]  try_charge_memcg+0x4ba/0x640
[2080382.104729]  __mem_cgroup_charge+0x42/0xb0
[2080382.104731]  shmem_alloc_and_add_folio+0xe7/0x500
[2080382.104733]  ? filemap_get_entry+0x10f/0x190
[2080382.104735]  shmem_get_folio_gfp+0x3a9/0x660
[2080382.104737]  shmem_fault+0x88/0x2f0
[2080382.104739]  ? inode_needs_update_time+0x4b/0xc0
[2080382.104741]  ? balance_dirty_pages_ratelimited_flags+0x21/0x3a0
[2080382.104743]  __do_fault+0x32/0x120
[2080382.104745]  do_fault+0xb7/0x430
[2080382.104747]  __handle_mm_fault+0x7dc/0x1050
[2080382.104749]  ? __pte_offset_map_lock+0x9e/0x110
[2080382.104751]  handle_mm_fault+0x17f/0x2e0
[2080382.104753]  __get_user_pages+0x1bb/0x780
[2080382.104756]  get_user_pages_unlocked+0xe6/0x380
[2080382.104759]  hva_to_pfn+0xbc/0x580 [kvm]
[2080382.104801]  kvm_faultin_pfn+0x10e/0x610 [kvm]
[2080382.104841]  kvm_tdp_page_fault+0x91/0xe0 [kvm]
[2080382.104880]  kvm_mmu_do_page_fault+0x1e6/0x230 [kvm]
[2080382.104918]  kvm_mmu_page_fault+0x82/0x620 [kvm]
[2080382.104955]  ? irqentry_enter+0x23/0x60
[2080382.104957]  ? sysvec_call_function+0xe/0x80
[2080382.104961]  vmx_handle_exit+0x12c/0x960 [kvm_intel]
[2080382.104970]  kvm_arch_vcpu_ioctl_run+0x6e5/0x1470 [kvm]
[2080382.105010]  ? preempt_count_sub+0x4b/0x60
[2080382.105012]  ? _raw_spin_unlock_irq+0x24/0x40
[2080382.105015]  kvm_vcpu_ioctl+0x22e/0x980 [kvm]
[2080382.105043]  __x64_sys_ioctl+0x94/0xd0
[2080382.105046]  do_syscall_64+0xc1/0x210
[2080382.105048]  entry_SYSCALL_64_after_hwframe+0x77/0x7f

kvm -> shmem -> cgroup -> oom

原来 memfd 也是 shmem 的一部分

[<0>] handle_userfault+0x447/0x8f0
[<0>] shmem_get_folio_gfp+0x3b3/0x610
[<0>] shmem_fault+0x86/0x300
[<0>] __do_fault+0x30/0x180
[<0>] do_fault+0xbe/0x4d0
[<0>] __handle_mm_fault+0x7d1/0xfe0
[<0>] handle_mm_fault+0x17f/0x2e0
[<0>] __get_user_pages+0x23d/0x1410
[<0>] get_user_pages_unlocked+0xe6/0x390
[<0>] hva_to_pfn+0x2bd/0x400 [kvm]
[<0>] __kvm_faultin_pfn+0x62/0xa0 [kvm]
[<0>] kvm_mmu_faultin_pfn+0x27b/0x6b0 [kvm]
[<0>] kvm_tdp_page_fault+0x97/0xf0 [kvm]
[<0>] kvm_mmu_do_page_fault+0x1ec/0x240 [kvm]
[<0>] kvm_mmu_page_fault+0x82/0x6f0 [kvm]
[<0>] vmx_handle_exit+0x21a/0x880 [kvm_intel]
[<0>] vcpu_enter_guest.constprop.0+0x64d/0x1270 [kvm]
[<0>] kvm_arch_vcpu_ioctl_run+0x357/0x6d0 [kvm]
[<0>] kvm_vcpu_ioctl+0x122/0xa20 [kvm]
[<0>] __x64_sys_ioctl+0xa0/0xe0
[<0>] do_syscall_64+0xc1/0x220
[<0>] entry_SYSCALL_64_after_hwframe+0x77/0x7f

本站所有文章转发 CSDN 将按侵权追究法律责任,其它情况随意。

  1. https://www.kernel.org/doc/gorman/html/understand/understand015.html  2