Page Writeback
- page cache only
- echo 1 > /proc/sys/vm/drop_caches
- inode / dentry
- echo 2 > /proc/sys/vm/drop_caches
- all
- echo 3 > /proc/sys/vm/drop_caches
ref && doc
[x] fs-writeback 和 page-writeback 的侧重
fs-writeback 侧重整个文件系统的写回。 page-writeback
此处不处理具体的写回操作,谢谢
- 那么具体的写回放到哪里的?
domain node global
对于 CPU 含有 domain,对于 memory 也是含有 domain 的概念。
laptop mode
policy
/**
* balance_dirty_pages_ratelimited - balance dirty memory state
* @mapping: address_space which was dirtied
*
* Processes which are dirtying memory should call in here once for each page
* which was newly dirtied. The function will periodically check the system's
* dirty state and will initiate writeback if needed.
*
* On really big machines, get_writeback_state is expensive, so try to avoid
* calling it too often (ratelimiting). But once we're over the dirty memory
* limit we decrease the ratelimiting by a lot, to prevent individual processes
* from overshooting the limit by (ratelimit_pages) each.
*/
void balance_dirty_pages_ratelimited(struct address_space *mapping)
// 1. balance_dirty_pages : 300 行的大函数
// 2.
ratelimit_pages
两个 ref 位置 :
/*
* If ratelimit_pages is too high then we can get into dirty-data overload
* if a large number of processes all perform writes at the same time.
* If it is too low then SMP machines will call the (expensive)
* get_writeback_state too often.
*
* Here we set ratelimit_pages to a level which ensures that when all CPUs are
* dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
* thresholds.
*/
void writeback_set_ratelimit(void)
{
	unsigned long bg_limit;
	unsigned long hard_limit;

	/* Fetch the current global thresholds and cache the hard one in the global domain. */
	global_dirty_limits(&bg_limit, &hard_limit);
	global_wb_domain.dirty_limit = hard_limit;

	/*
	 * Per-CPU share of 1/32 of the hard threshold, with a floor of
	 * 16 pages.
	 */
	ratelimit_pages = hard_limit / (num_online_cpus() * 32);
	if (ratelimit_pages < 16)
		ratelimit_pages = 16;
}
/**
* tag_pages_for_writeback - tag pages to be written by write_cache_pages
* @mapping: address space structure to write
* @start: starting page index
* @end: ending page index (inclusive)
*
* This function scans the page range from @start to @end (inclusive) and tags
* all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
* that write_cache_pages (or whoever calls this function) will then use
* TOWRITE tag to identify pages eligible for writeback. This mechanism is
* used to avoid livelocking of writeback by a process steadily creating new
* dirty pages in the file (thus it is important for this function to be quick
* so that it can tag pages faster than a dirtying process can create them).
*/
/*
* We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock
* latency.
*/
void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end)
// write_cache_pages ?
/*
* Dirty a page.
*
* For pages with a mapping this should be done under the page lock
* for the benefit of asynchronous memory errors who prefer a consistent
* dirty state. This rule can be broken in some special cases,
* but should be better not to.
*
* If the mapping doesn't provide a set_page_dirty a_op, then
* just fall through and assume that it wants buffer_heads.
*/
int set_page_dirty(struct page *page)
// 似乎这是全部的内容了
和外部交互的分析
/*
 * Write back pages of @mapping under the control of @wbc.
 *
 * Uses the mapping's own ->writepages when one is provided, otherwise
 * generic_writepages(). Returns 0 immediately when wbc->nr_to_write is not
 * positive; otherwise returns the result of the last write attempt.
 */
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	int err;

	if (wbc->nr_to_write <= 0)
		return 0;

	for (;;) {
		err = mapping->a_ops->writepages ?
			mapping->a_ops->writepages(mapping, wbc) :
			generic_writepages(mapping, wbc);

		/* Retry only for a WB_SYNC_ALL request that failed with -ENOMEM. */
		if (err != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL)
			return err;

		/* Yield and back off briefly before trying again. */
		cond_resched();
		congestion_wait(BLK_RW_ASYNC, HZ/50);
	}
}
int generic_writepages(struct address_space *mapping,
struct writeback_control *wbc)
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
void *data)
分析各种注册到 address_space 上的 writepage !
// ext2 : 依赖于公共框架
/* Read one page for ext2 by handing it to mpage_readpage() with ext2_get_block. */
static int ext2_readpage(struct file *file, struct page *page)
{
// Note: only the read side goes through mpage; the write side lives in buffer.c.
return mpage_readpage(page, ext2_get_block);
}
/* Write one page for ext2 by handing it to block_write_full_page() with ext2_get_block. */
static int ext2_writepage(struct page *page, struct writeback_control *wbc)
{
// This looks rather odd.
// TODO: the page cache is wired to ext2_writepage, yet the write still relies on the buffer mechanism.
return block_write_full_page(page, ext2_get_block, wbc);
}
// ext4 : 直接进入 block 层,自己实现。
/*
 * Read one page for ext4.
 *
 * Inodes with inline data are tried through ext4_readpage_inline() first;
 * whenever the result is still -EAGAIN (including when the inline path was
 * not taken at all) the page is read via ext4_mpage_readpages().
 */
static int ext4_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	int ret = -EAGAIN;

	trace_ext4_readpage(page);

	if (ext4_has_inline_data(inode))
		ret = ext4_readpage_inline(inode, page);

	/* Anything other than -EAGAIN is the final answer. */
	if (ret != -EAGAIN)
		return ret;

	return ext4_mpage_readpages(page->mapping, NULL, page, 1, false);
}
int ext4_mpage_readpages(struct address_space *mapping,
struct list_head *pages, struct page *page,
unsigned nr_pages, bool is_readahead)
// 一个非常长的函数,直接进入到 block layer 层次
static int ext4_writepage(struct page *page,
struct writeback_control *wbc)
// 也是 ext4 自己的负责的内容
// 进一步分析 ext4_writepages 和 ext4_readpages,前者其实依赖 generic_writepages 然后反复调用自己的 writepage 函数
// 但是后者没有 generic 这一部分,因为写回依赖于 page cache 中间的 radix tree 的功能,但是 read 不需要。
// swap : page_io.c 中间实现。
// 总体来说,ext2 很奇怪啊 !
分析一下各种
// 所以,其实奇怪的地方不在于 address_space 而是在于为什么ext4 需要同时注册
// address_space(没有问题) 和 file_operations(有点问题, 似乎过于high level了,
// 越过了page cache)
// 如果 ext4 中间不持有 file_operations 我猜测 vfs 的设计如何进行 ?
// 显然不是为了 dax 机制,否则必定含有更简单的方法 ?
// ext2 : 依赖于filemap.c
/*
 * ext2 read path: when CONFIG_FS_DAX is enabled and the file's inode is DAX,
 * the read is served by ext2_dax_read_iter(); in every other case it is
 * passed to generic_file_read_iter().
 */
static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
#ifdef CONFIG_FS_DAX
if (IS_DAX(iocb->ki_filp->f_mapping->host))
return ext2_dax_read_iter(iocb, to);
#endif
return generic_file_read_iter(iocb, to);
}
/*
 * ext2 write path: when CONFIG_FS_DAX is enabled and the file's inode is DAX,
 * the write is served by ext2_dax_write_iter(); in every other case it is
 * passed to generic_file_write_iter().
 */
static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
#ifdef CONFIG_FS_DAX
if (IS_DAX(iocb->ki_filp->f_mapping->host))
return ext2_dax_write_iter(iocb, from);
#endif
return generic_file_write_iter(iocb, from);
}
// ext4 : 还是依赖于filemap.c
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb))))
return -EIO;
if (!iov_iter_count(to))
return 0; /* skip atime */
#ifdef CONFIG_FS_DAX
if (IS_DAX(file_inode(iocb->ki_filp)))
return ext4_dax_read_iter(iocb, to);
#endif
return generic_file_read_iter(iocb, to);
}
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
// 提供了更多的检查之类的操作,然后调用 __generic_file_write_iter
// 就读写而言,一般就是将工作给page cache 或者 绕过 page cache,已经足够说明需要使用为file_operation 提供fs specific 的file io 了。
// 而且file 的操作不限于io 还有其他的和文件系统相关,所以fs 提供 file_operations 的接口是有必要的。
page_writeback_init
/*
* Called early on to tune the page writeback dirty limits.
*
* We used to scale dirty pages according to how total memory
* related to pages that could be allocated for buffers (by
* comparing nr_free_buffer_pages() to vm_total_pages.
*
* However, that was when we used "dirty_ratio" to scale with
* all memory, and we don't do that any more. "dirty_ratio"
* is now applied to total non-HIGHPAGE memory (by subtracting
* totalhigh_pages from vm_total_pages), and as such we can't
* get into the old insane situation any more where we had
* large amounts of dirty pages compared to a small amount of
* non-HIGHMEM memory.
*
* But we might still want to scale the dirty_ratio by how
* much memory the box has..
*/
void __init page_writeback_init(void)
{
/* Set up the global writeback domain; a non-zero return trips BUG_ON(). */
BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
/*
 * Register page_writeback_cpu_online for two hotplug states: as the third
 * argument for "mm/writeback:online" and as the fourth argument for
 * "mm/writeback:dead".
 * NOTE(review): presumably these are the startup and teardown callbacks
 * respectively - confirm against the cpuhp_setup_state() documentation.
 */
cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online",
page_writeback_cpu_online, NULL);
cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL,
page_writeback_cpu_online);
}
set_page_dirty : 不是设置一个标志位的事情,而是
- 一般来说调用
__set_page_dirty_buffers完成工作, 同时标记两个内容
set_page_dirty allows an address space to provide a specific method of marking a page as dirty. However, this option is rarely used. In this case, the kernel automatically uses __set_page_dirty_buffers to simultaneously mark the page as dirty on the buffer level and to add it to the dirty_pages list of the current mapping.
/*
* Dirty a page.
*
* For pages with a mapping this should be done under the page lock
* for the benefit of asynchronous memory errors who prefer a consistent
* dirty state. This rule can be broken in some special cases,
* but should be better not to.
*
* If the mapping doesn't provide a set_page_dirty a_op, then
* just fall through and assume that it wants buffer_heads.
*/
int set_page_dirty(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	int (*dirty_op)(struct page *);

	page = compound_head(page);

	if (!likely(mapping)) {
		/*
		 * No mapping: only the page flag is touched. 1 is returned when
		 * TestSetPageDirty() reports 0 (presumably: the flag was clear
		 * before this call); otherwise 0.
		 */
		if (!PageDirty(page) && !TestSetPageDirty(page))
			return 1;
		return 0;
	}

	dirty_op = mapping->a_ops->set_page_dirty;

	/*
	 * readahead/lru_deactivate_page can leave PG_readahead/PG_reclaim
	 * behind after racing with end_page_writeback.
	 *  - readahead: the flags are reset once the page is written, so
	 *    this is harmless.
	 *  - lru_deactivate_page: the flag is reset when the page is
	 *    redirtied, so this is harmless too. If readahead uses the page
	 *    it gets confused and restarts its size ramp-up, which is only a
	 *    trivial problem.
	 */
	if (PageReclaim(page))
		ClearPageReclaim(page);
#ifdef CONFIG_BLOCK
	/* Mappings without their own a_op fall back to buffer_heads. */
	if (!dirty_op)
		dirty_op = __set_page_dirty_buffers;
#endif
	return dirty_op(page);
}
EXPORT_SYMBOL(set_page_dirty);
test_clear_page_writeback && __test_set_page_writeback
- 实际上的并没有办法区分这三个 flags 的含义是什么 ?
- 实际上,这一个 TAG 是可以选择的,当没有取消掉这一个选项的时候,内容立刻变得非常简单了。
mm: don't use radix tree writeback tags for pages in swap cache
File pages use a set of radix tree tags (DIRTY, TOWRITE, WRITEBACK,
etc.) to accelerate finding the pages with a specific tag in the radix
tree during inode writeback. But for anonymous pages in the swap cache,
there is no inode writeback. So there is no need to find the pages with
some writeback tags in the radix tree. It is not necessary to touch
radix tree writeback tags for pages in the swap cache.
Per Rik van Riel's suggestion, a new flag AS_NO_WRITEBACK_TAGS is
introduced for address spaces which don't need to update the writeback
tags. The flag is set for swap caches. It may be used for DAX file
systems, etc.
With this patch, the swap out bandwidth improved 22.3% (from ~1.2GB/s to
~1.48GBps) in the vm-scalability swap-w-seq test case with 8 processes.
The test is done on a Xeon E5 v3 system. The swap device used is a RAM
simulated PMEM (persistent memory) device. The improvement comes from
the reduced contention on the swap cache radix tree lock. To test
sequential swapping out, the test case uses 8 processes, which
sequentially allocate and write to the anonymous pages until RAM and
part of the swap device is used up.
Link: http://lkml.kernel.org/r/1472578089-5560-1-git-send-email-ying.huang@intel.com
- @todo 所以什么叫做 inode writeback ? radix tree 利用 tag 为什么可以加速查找的速度 ?
/*
 * Returns non-zero unless AS_NO_WRITEBACK_TAGS is set in mapping->flags,
 * i.e. unless the mapping has opted out of the writeback tags.
 */
static inline int mapping_use_writeback_tags(struct address_space *mapping)
{
return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
}
/* XArray tags, for tagging dirty and writeback pages in the pagecache. */
#define PAGECACHE_TAG_DIRTY XA_MARK_0
#define PAGECACHE_TAG_WRITEBACK XA_MARK_1
#define PAGECACHE_TAG_TOWRITE XA_MARK_2
/*
* Bits in mapping->flags.
*/
/*
 * Each enumerator is a bit number inside mapping->flags; for example
 * AS_NO_WRITEBACK_TAGS is checked with test_bit() in
 * mapping_use_writeback_tags().
 */
enum mapping_flags {
AS_EIO = 0, /* IO error on async write */
AS_ENOSPC = 1, /* ENOSPC on async write */
AS_MM_ALL_LOCKS = 2, /* under mm_take_all_locks() */
AS_UNEVICTABLE = 3, /* e.g., ramdisk, SHM_LOCK */
AS_EXITING = 4, /* final truncate in progress */
/* writeback related tags are not used */
AS_NO_WRITEBACK_TAGS = 5,
};
/*
 * Clear the writeback flag of @page and update the associated bookkeeping.
 *
 * For a mapping that uses writeback tags, the flag is cleared under the
 * i_pages lock together with the PAGECACHE_TAG_WRITEBACK mark and the
 * per-wb statistics. Otherwise only the page flag is cleared.
 *
 * Returns the result of TestClearPageWriteback() (presumably non-zero when
 * the flag had been set).
 */
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
struct mem_cgroup *memcg;
struct lruvec *lruvec;
int ret;
/* memcg and lruvec are looked up before the flag is cleared; see the NOTE below. */
memcg = lock_page_memcg(page);
lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
if (mapping && mapping_use_writeback_tags(mapping)) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
xa_lock_irqsave(&mapping->i_pages, flags);
ret = TestClearPageWriteback(page);
if (ret) {
__xa_clear_mark(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi)) {
struct bdi_writeback *wb = inode_to_wb(inode);
dec_wb_stat(wb, WB_WRITEBACK);
__wb_writeout_inc(wb);
}
}
/* Once no entry carries the WRITEBACK tag, clear the inode's writeback state as well. */
if (mapping->host && !mapping_tagged(mapping,
PAGECACHE_TAG_WRITEBACK))
sb_clear_inode_writeback(mapping->host);
xa_unlock_irqrestore(&mapping->i_pages, flags);
} else {
ret = TestClearPageWriteback(page); // tag-less path: only the page flag is cleared
}
/*
* NOTE: Page might be free now! Writeback doesn't hold a page
* reference on its own, it relies on truncation to wait for
* the clearing of PG_writeback. The below can only access
* page state that is static across allocation cycles.
*/
if (ret) { // TODO: statistics are updated here; the reason is not yet understood
dec_lruvec_state(lruvec, NR_WRITEBACK);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
inc_node_page_state(page, NR_WRITTEN);
}
__unlock_page_memcg(memcg);
return ret;
}
/*
 * Set the writeback flag of @page and update the associated bookkeeping.
 *
 * For a mapping that uses writeback tags, the flag is set under the xarray
 * lock and the marks are adjusted: WRITEBACK is set, DIRTY is cleared when
 * the page is no longer dirty, and TOWRITE is cleared unless @keep_write is
 * true. Otherwise only the page flag is set.
 *
 * Returns the result of TestSetPageWriteback() (presumably non-zero when the
 * flag was already set).
 */
int __test_set_page_writeback(struct page *page, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
int ret;
lock_page_memcg(page);
if (mapping && mapping_use_writeback_tags(mapping)) {
XA_STATE(xas, &mapping->i_pages, page_index(page));
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
xas_lock_irqsave(&xas, flags);
xas_load(&xas);
ret = TestSetPageWriteback(page);
if (!ret) {
bool on_wblist;
/* Sample the mapping-wide WRITEBACK tag before this page's mark is set. */
on_wblist = mapping_tagged(mapping,
PAGECACHE_TAG_WRITEBACK);
xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi))
inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
/*
* We can come through here when swapping anonymous
* pages, so we don't necessarily have an inode to track
* for sync.
*/
if (mapping->host && !on_wblist)
sb_mark_inode_writeback(mapping->host);
}
if (!PageDirty(page))
xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
if (!keep_write)
xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
xas_unlock_irqrestore(&xas, flags);
} else {
ret = TestSetPageWriteback(page); // tag-less path: only the page flag is set
}
if (!ret) { // statistics accounting as well
inc_lruvec_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
}
unlock_page_memcg(page);
return ret;
}
EXPORT_SYMBOL(__test_set_page_writeback);
do_writepages : 将整个 inode 中间的 dirty page 全部写回
/*
 * Write back pages of @mapping under the control of @wbc.
 *
 * Uses the mapping's own ->writepages when one is provided, otherwise
 * generic_writepages(). Returns 0 immediately when wbc->nr_to_write is not
 * positive; otherwise returns the result of the last write attempt.
 */
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	int err;

	if (wbc->nr_to_write <= 0)
		return 0;

	for (;;) {
		if (mapping->a_ops->writepages)
			err = mapping->a_ops->writepages(mapping, wbc);
		else
			err = generic_writepages(mapping, wbc);

		/* Retry only for a WB_SYNC_ALL request that failed with -ENOMEM. */
		if (err != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL)
			break;

		/* Yield and back off briefly before trying again. */
		cond_resched();
		congestion_wait(BLK_RW_ASYNC, HZ/50);
	}

	return err;
}
/**
* generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
* @mapping: address space structure to write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
*
* This is a library function, which implements the writepages()
* address_space_operation.
*
* Return: %0 on success, negative error code otherwise
*/
int generic_writepages(struct address_space *mapping,
		       struct writeback_control *wbc)
{
	struct blk_plug plug;
	int ret = 0;

	/*
	 * Mappings without a ->writepage (chardevs and other special files)
	 * have nothing to write; report success.
	 */
	if (mapping->a_ops->writepage) {
		/* Run the whole page walk between blk_start_plug() and blk_finish_plug(). */
		blk_start_plug(&plug);
		ret = write_cache_pages(mapping, wbc, __writepage, mapping);
		blk_finish_plug(&plug);
	}

	return ret;
}
难道,generic_writepages 是专用的吗?
#0 generic_writepages (wbc=0xffffc9003f807ca0, mapping=0xffff888103d80508) at mm/page-writeback.c:2559
#1 do_writepages (mapping=mapping@entry=0xffff888103d80508, wbc=wbc@entry=0xffffc9003f807ca0) at mm/page-writeback.c:2583
#2 0xffffffff81404b4c in __writeback_single_inode (inode=inode@entry=0xffff888103d80390, wbc=wbc@entry=0xffffc9003f807ca0) at fs/fs-writeback.c:1598
#3 0xffffffff81405304 in writeback_sb_inodes (sb=sb@entry=0xffff888100059800, wb=wb@entry=0xffff88811ce51000, work=work@entry=0xffffc9003f807e30) at fs/fs-writeback.c:1889
#4 0xffffffff814055f7 in __writeback_inodes_wb (wb=wb@entry=0xffff88811ce51000, work=work@entry=0xffffc9003f807e30) at fs/fs-writeback.c:1960
#5 0xffffffff81405872 in wb_writeback (wb=wb@entry=0xffff88811ce51000, work=work@entry=0xffffc9003f807e30) at fs/fs-writeback.c:2065
#6 0xffffffff81406be9 in wb_check_background_flush (wb=0xffff88811ce51000) at fs/fs-writeback.c:2131
#7 wb_do_writeback (wb=0xffff88811ce51000) at fs/fs-writeback.c:2219
#8 wb_workfn (work=0xffff88811ce51188) at fs/fs-writeback.c:2246
#9 0xffffffff8114cd57 in process_one_work (worker=worker@entry=0xffff8881487da3c0, work=0xffff88811ce51188) at kernel/workqueue.c:2289
#10 0xffffffff8114cf7c in worker_thread (__worker=0xffff8881487da3c0) at kernel/workqueue.c:2436
#11 0xffffffff811556c7 in kthread (_create=0xffff8881037f3640) at kernel/kthread.c:376
#12 0xffffffff8100265c in ret_from_fork () at arch/x86/entry/entry_64.S:308
记录一些 backtrace
为什么这里也是存在时间的周期性的 writeback 的操作:
#0 writeout_period (t=0xffffffff838afb18 <global_wb_domain+56>) at mm/page-writeback.c:608
#1 0xffffffff811d7c82 in call_timer_fn (timer=timer@entry=0xffffffff838afb18 <global_wb_domain+56>, fn=fn@entry=0xffffffff812e9980 <writeout_period>, baseclk=baseclk@entry=4296110016) at kernel/time/timer.c:1700
#2 0xffffffff811d7f8e in expire_timers (head=0xffffc90000124f10, base=0xffff888237c9e040) at kernel/time/timer.c:1751
#3 __run_timers (base=0xffff888237c9e040) at kernel/time/timer.c:2022
#4 0xffffffff82197e57 in __do_softirq () at kernel/softirq.c:571
#5 0xffffffff811328aa in invoke_softirq () at kernel/softirq.c:445
#6 __irq_exit_rcu () at kernel/softirq.c:650
#7 0xffffffff82184fc6 in sysvec_apic_timer_interrupt (regs=0xffffc9000009be38) at arch/x86/kernel/apic/apic.c:1107
/proc/sys/vm/ 下的一些接口
DEVICE_ATTR_RW(max_bytes);
static DEVICE_ATTR_RW(strict_limit);
/*
 * Sysctl table for the page-writeback tunables. Each entry ties a file name
 * (.procname) to the variable behind it (.data / .maxlen), its access mode
 * and the handler that parses writes. The empty entry at the end terminates
 * the table.
 * NOTE(review): .extra1/.extra2 presumably carry the lower/upper bounds the
 * handler enforces - confirm against each proc_handler.
 */
static struct ctl_table vm_page_writeback_sysctls[] = {
{
.procname = "dirty_background_ratio",
.data = &dirty_background_ratio,
.maxlen = sizeof(dirty_background_ratio),
.mode = 0644,
.proc_handler = dirty_background_ratio_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE_HUNDRED,
},
{
.procname = "dirty_background_bytes",
.data = &dirty_background_bytes,
.maxlen = sizeof(dirty_background_bytes),
.mode = 0644,
.proc_handler = dirty_background_bytes_handler,
.extra1 = SYSCTL_LONG_ONE,
},
{
.procname = "dirty_ratio",
.data = &vm_dirty_ratio,
.maxlen = sizeof(vm_dirty_ratio),
.mode = 0644,
.proc_handler = dirty_ratio_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE_HUNDRED,
},
{
.procname = "dirty_bytes",
.data = &vm_dirty_bytes,
.maxlen = sizeof(vm_dirty_bytes),
.mode = 0644,
.proc_handler = dirty_bytes_handler,
.extra1 = (void *)&dirty_bytes_min,
},
{
.procname = "dirty_writeback_centisecs",
.data = &dirty_writeback_interval,
.maxlen = sizeof(dirty_writeback_interval),
.mode = 0644,
.proc_handler = dirty_writeback_centisecs_handler,
},
{
.procname = "dirty_expire_centisecs",
.data = &dirty_expire_interval,
.maxlen = sizeof(dirty_expire_interval),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
{
.procname = "laptop_mode",
.data = &laptop_mode,
.maxlen = sizeof(laptop_mode),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
/* Table terminator. */
{}
};
systeroid --tui 中搜索 dirty 看看
似乎是存在 background 和非 background 的两个部分
sync 命令和 echo 3 | sudo tee /proc/sys/vm/drop_caches 的区别
- https://stackoverflow.com/questions/17500134/is-running-sync-before-drop-caches-necessary
drop cache 删除掉 clean cache sync 是将 dirty page 写下去
- 分别检查下代码实现吧
[ ] cat /proc/meminfo 中这两个调查下
Writeback: 0 kB
Dirty: 728 kB
WritebackTmp: 0 kB
为什么有的用户可以完全不注册 noop_dirty_folio
在 commit b82a96c92533 (“fs: remove noop_set_page_dirty()”) 中 commit log 中:
It will have no effect on actually writing the page back, as the pages are not on any LRU lists.
简单的使用这个方法
ag -A 10 "const struct address_space_operations.*\{" > a.txt
- romfs_aops
- squashfs_symlink_aops
- squashfs_symlink_inode_ops
- squashfs_aops
- udf_symlink_aops
- efs_aops
- fuse_symlink_aops
- efs_aops
大致的样子:
/* Minimal address_space_operations example: only .direct_IO is filled in. */
static const struct address_space_operations ovl_aops = {
/* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */
.direct_IO = noop_direct_IO,
};
/*
 * Minimal address_space_operations example: only .free_folio is filled in;
 * no dirty_folio hook is registered.
 */
const struct address_space_operations nfs_dir_aops = {
.free_folio = nfs_readdir_clear_array,
};
看来,如果是 readonly 的,symlink 这种根本不会管理具体的 page 的都是没必要
[x] aops 可以一并删除吗
- fb_deferred_io_aops
- anon_aops
- dax
是可以的,因为会自动 fallback 到 empty_aops 中了。
background 的行为到底在哪里? 在 page-writeback.c 还是 fs-writeback.c
应该是在 page-writeback.c 中,但是刷新的时候
-
balance_dirty_pages 会启动
-
wb_workfn 中是根据盘的来进行刷的,而不是经过文件系统的
- wb_start_background_writeback
还是分析下 patch 吧
patch 的修改方法,然后再去删除掉几个代码:
rg -l "\.dirty_folio\s+=\s+noop_dirty_folio" | xargs sed -i '/\.dirty_folio.*noop_dirty_folio/d'
-
fb_deferred_io_mmap 恐怕是不可以直接删除的
-
进一步的测试,dax 的代码使用 ext4 的
看来之前是犯过错误的: commit 0b78f8bcf495 (“Revert “fb_defio: Remove custom address_space_operations””)
Revert "fb_defio: Remove custom address_space_operations"
Commit ccf953d8f3d6 makes framebuffers which use deferred I/O stop
displaying updates after the first one. This is because the pages
handled by fb_defio no longer have a page_mapping(). That prevents
page_mkclean() from marking the PTEs as clean, and so writes are only
noticed the first time.
fb_defio: Remove custom address_space_operations
There's no need to give the page an address_space. Leaving the
page->mapping as NULL will cause the VM to handle set_page_dirty()
the same way that it's handled now, and that was the only reason to
set the address_space in the first place.
去掉了
- fb_open 中
buffer write 不能无限的产生 dirty page 的
sudo dd if=/dev/zero of=hugefile bs=100M
@[
balance_dirty_pages+1
balance_dirty_pages_ratelimited_flags+676
fault_dirty_shared_page+150
do_wp_page+299
__handle_mm_fault+2066
handle_mm_fault+341
do_user_addr_fault+515
exc_page_fault+109
asm_exc_page_fault+38
]: 1
@[
balance_dirty_pages+1
balance_dirty_pages_ratelimited_flags+676
generic_perform_write+338
ext4_buffered_write_iter+132
vfs_write+555
ksys_write+111
do_syscall_64+59
entry_SYSCALL_64_after_hwframe+114
]: 2
@[
balance_dirty_pages+1
balance_dirty_pages_ratelimited_flags+676
iomap_file_buffered_write+313
xfs_file_buffered_write+177
vfs_write+555
ksys_write+111
do_syscall_64+59
entry_SYSCALL_64_after_hwframe+114
]: 3333
从 PG_dirty 的角度分析一下
调用主要是 buffer head 以及 iomap 的位置
基本执行路径 : 从 folio_mark_dirty 到 __folio_mark_dirty 的路径
-
folio_mark_dirty : 外部调用接口,通过 a_ops::dirty_folio 维持生活
- block_dirty_folio : buffer head 的用户注册 a_ops::dirty_folio 的 hook
- mark_buffer_dirty : buffer head
- filemap_dirty_folio : 非 buffer head 用户的 a_ops::dirty_folio 的
- __folio_mark_dirty
- folio_account_dirtied
- __xa_set_mark(&mapping->i_pages, folio_index(folio), PAGECACHE_TAG_DIRTY); : 设置 aops 的 array
- folio_account_dirtied
其实这个问题挺复杂的
folio_mark_dirty 被唯一调用 mapping->a_ops->dirty_folio
之所以进行标记,是为了 writeback 知道哪些 page 需要写回
可以注册的 hook
- filemap_dirty_folio : Mark a folio dirty for filesystems which do not use buffer_heads.
主要做两个工作:
- __folio_mark_dirty : 给 as 标记
- __mark_inode_dirty : 给 fs 标记
bcachefs 就是使用的这个
xfs 在 filemap_dirty_folio 的基础上稍微的修改了一些,主要是 dirty 的粒度问题。
-
ext4_dirty_folio : buffer cache 的 old school 需要一些特殊的操作
-
noop_dirty_folio : 由于这个 page 不会写回,也只是设置一下 flag 即可。
如何知道哪些 page 是 dirty 的
@[
writeback_iter+5
write_cache_pages+78
blkdev_writepages+86
do_writepages+118
__writeback_single_inode+61
writeback_sb_inodes+540
__writeback_inodes_wb+76
wb_writeback+403
wb_workfn+689
process_one_work+399
worker_thread+543
kthread+220
ret_from_fork+49
ret_from_fork_asm+26
]: 62
简单分析之后,可以看到是: filemap_get_folios_tag 来根据 PAGECACHE_TAG_DIRTY 来测试。
[ ] folio_mark_dirty 可以直接触发 wb 的 thread 吧
甚至不需要直接触发。
用 fio 测试一下看看主要的路径。
为什么 swap 是标记为 dirty 的
经典 backtrace
写完之后,所有的 page 都需要标记 clean 的,标记过程中,都是需要反向映射的:
@[
kvm_flush_tlb_multi+5
flush_tlb_mm_range+287
ptep_clear_flush+65
page_vma_mkclean_one+229
page_mkclean_one+142
rmap_walk_file+307
folio_mkclean+182
folio_clear_dirty_for_io+93
mpage_process_page_bufs+320
mpage_prepare_extent_to_map+796
ext4_do_writepages+883
ext4_writepages+281
do_writepages+243
__writeback_single_inode+62
writeback_sb_inodes+674
__writeback_inodes_wb+149
wb_writeback+375
wb_workfn+945
process_scheduled_works+444
worker_thread+712
kthread+248
ret_from_fork+55
ret_from_fork_asm+26
]: 3430640
本站所有文章转发 CSDN 将按侵权追究法律责任,其它情况随意。