Readahead

总体:

提供两个终极外部接口 : page_cache_sync_readahead 和 page_cache_async_readahead
read_pages 调用 : mapping->a_ops->readpages 或者 mapping->a_ops->readpage 实现真正读取工作

file_ra_state_init

// todo 实现简单，但是可以借此看一下 ra 的状态。

page_cache_sync_readahead 和 page_cache_async_readahead

/**
 * page_cache_sync_readahead - generic file readahead
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @filp: passed on to ->readpage() and ->readpages()
 * @offset: start offset into @mapping, in pagecache page-sized units
 * @req_size: hint: total size of the read which the caller is performing in
 *            pagecache pages
 *
 * page_cache_sync_readahead() should be called when a cache miss happened:
 * it will submit the read.  The readahead logic may decide to piggyback more
 * pages onto the read request if access patterns suggest it will improve
 * performance.
 */
void page_cache_sync_readahead(struct address_space *mapping,
			       struct file_ra_state *ra, struct file *filp,
			       pgoff_t offset, unsigned long req_size)

/**
 * page_cache_async_readahead - file readahead for marked pages
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @filp: passed on to ->readpage() and ->readpages()
 * @page: the page at @offset which has the PG_readahead flag set
 * @offset: start offset into @mapping, in pagecache page-sized units
 * @req_size: hint: total size of the read which the caller is performing in
 *            pagecache pages
 *
 * page_cache_async_readahead() should be called when a page is used which
 * has the PG_readahead flag; this is a marker to suggest that the application
 * has used up enough of the readahead window that we should start pulling in
 * more pages.
 */
void
page_cache_async_readahead(struct address_space *mapping,
			   struct file_ra_state *ra, struct file *filp,
			   struct page *page, pgoff_t offset,
			   unsigned long req_size)

实现的区别
使用上的区别

ondemand_readahead

page_cache_sync_readahead 和 page_cache_async_readahead 都会调用这个函数。

`__do_page_cache_readahead`

真正的读取工作

/*
 * __do_page_cache_readahead() actually reads a chunk of disk.  It allocates
 * the pages first, then submits them for I/O. This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 *
 * Returns the number of pages requested, or the maximum amount of I/O allowed.
 */
unsigned int __do_page_cache_readahead(struct address_space *mapping,
		struct file *filp, pgoff_t offset, unsigned long nr_to_read,
		unsigned long lookahead_size)

// 共有三处引用 (调用者)，依次如下:
/*
 * Submit IO for the read-ahead request in file_ra_state.
 *
 * Thin wrapper: forwards the window recorded in @ra (ra->start, ra->size,
 * ra->async_size) to __do_page_cache_readahead() as its offset, nr_to_read
 * and lookahead_size arguments, and returns that call's result unchanged.
 */
static inline unsigned long ra_submit(struct file_ra_state *ra,
		struct address_space *mapping, struct file *filp)
{
	return __do_page_cache_readahead(mapping, filp,
					ra->start, ra->size, ra->async_size);
}

/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static unsigned long
ondemand_readahead(struct address_space *mapping,
		   struct file_ra_state *ra, struct file *filp,
		   bool hit_readahead_marker, pgoff_t offset,
		   unsigned long req_size)

/*
 * Chunk the readahead into 2 megabyte units, so that we don't pin too much
 * memory at once.
 */
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
			       pgoff_t offset, unsigned long nr_to_read)

ksys_readahead

参考 man 2 readahead 和 man 2 posix_fadvise

本以为 readahead 系统调用会直接调用上面这些函数，但实际上 ksys_readahead 走的是 vfs_fadvise (advice 为 POSIX_FADV_WILLNEED):

/*
 * Dispatch an fadvise request: use the file's own ->fadvise hook when one
 * is set, otherwise fall back to generic_fadvise().
 */
int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
{
	if (file->f_op->fadvise) // note author: nobody had registered this hook at the time of writing, so this was effectively always generic_fadvise -- TODO confirm against the current tree
		return file->f_op->fadvise(file, offset, len, advice);

	return generic_fadvise(file, offset, len, advice); // note author: on the readahead path advice == POSIX_FADV_WILLNEED, so generic_fadvise is what runs
}

int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
			       pgoff_t offset, unsigned long nr_to_read)

read_pages

你真的完全理解了吗 ?

参数 file * 从 generic_file_buffered_read 一路传下来，中途发生过变化吗 ?

1. blk_start_plug
2. list_del(&page->lru); // todo 到底其中 page->lru 的作用是什么 : 从slub的位置就看到了

才知道 readahead 会如此之激进啊

mmap 4G 的文件，但是所在的 cgroup 只有 2G，然后对其进行顺序读，pgmajfault 几乎没有增加。

原来 readahead 机制一般都是自动计算的

fio mmap + randread

                        - 81.43% do_mem_abort
                           - 81.35% do_translation_fault
                              - 81.31% do_page_fault
                                 - 76.22% handle_mm_fault
                                    - 74.55% __handle_mm_fault
                                       - 72.89% do_fault
                                          - 69.29% __do_fault
                                             - 69.10% filemap_fault
                                                - 40.17% filemap_read_folio
                                                   - 33.92% ext4_read_folio
                                                      - 33.66% ext4_mpage_readpages
                                                         - 24.83% submit_bio
                                                            - submit_bio_noacct

有 readahead 的时候:

@[
    page_cache_ra_unbounded+0
    filemap_fault+1664
    __do_fault+68
    do_fault+936
    __handle_mm_fault+648
    handle_mm_fault+228
    do_page_fault+352
    do_translation_fault+180
    do_mem_abort+72
    el0_ia+132
    el0t_64_sync_handler+220
    el0t_64_sync+408
]: 1

没有 readahead 的时候

@[
    ext4_read_folio+0
    filemap_fault+1764
    __do_fault+68
    do_fault+936
    __handle_mm_fault+648
    handle_mm_fault+228
    do_page_fault+352
    do_translation_fault+180
    do_mem_abort+72
    el0_da+92
    el0t_64_sync_handler+196
    el0t_64_sync+408
]: 15783

可以通过下面这个 patch 强制跳过 mmap 缺页路径上的同步 readahead，用来对比上面两种情况:

diff --git a/mm/filemap.c b/mm/filemap.c
index a6459874bb2a..1e21b7d3f896 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3258,7 +3258,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
         * Do we miss much more than hit in this file? If so,
         * stop bothering with read-ahead. It will only hurt.
         */
-       if (mmap_miss > MMAP_LOTSAMISS)
+       if (mmap_miss > MMAP_LOTSAMISS || true)
                return fpin;

        /*

原来文件系统会为单个 folio 的读取和 readahead 分别注册不同的接口 (ext4 中是 ext4_read_folio 和 ext4_readahead):

/*
 * Read a single folio for ext4.
 *
 * Tries the inline-data path first when the inode has inline data. If that
 * path is not taken, or it returns -EAGAIN, falls back to
 * ext4_mpage_readpages() for this one folio (no readahead_control is passed).
 * Any other result from the inline path is returned as-is.
 * @file is not used in this body.
 */
static int ext4_read_folio(struct file *file, struct folio *folio)
{
	/* -EAGAIN also serves as the "inline path did not handle it" marker. */
	int ret = -EAGAIN;
	struct inode *inode = folio->mapping->host;

	trace_ext4_read_folio(inode, folio);

	if (ext4_has_inline_data(inode))
		ret = ext4_readpage_inline(inode, folio);

	/* Still -EAGAIN: either no inline data, or the inline read asked to retry. */
	if (ret == -EAGAIN)
		return ext4_mpage_readpages(inode, NULL, folio);

	return ret;
}

/*
 * Readahead entry point for ext4.
 *
 * Hands the whole readahead_control to ext4_mpage_readpages() (with no
 * single folio argument), unless the inode has inline data, in which case
 * it returns without doing anything.
 */
static void ext4_readahead(struct readahead_control *rac)
{
	struct inode *inode = rac->mapping->host;

	/* If the file has inline data, no need to do readahead. */
	if (ext4_has_inline_data(inode))
		return;

	ext4_mpage_readpages(inode, rac, NULL);
}

readahead 只有 mmap 的时候才会用么?

如果使用 read 或者 write 读一个文件，也会有 readahead 机制么?

本站所有文章转发 CSDN 将按侵权追究法律责任，其它情况随意。