swapfile.c
The cornerstone of swap area management is the swap_info array defined in mm/swapfile.c;
its entries store information on the individual swap areas in the system.
问题
- swap_count_continued : 为什么 refcount 搞得这么复杂 ?
- extent 和 cluster 机制如何融合起来 ?
- 既然存在多个 swap ,当 pageout() 的时候,如何选择使用哪一个 swapfile ?
/* COMPACT_CLUSTER_MAX is an alias for SWAP_CLUSTER_MAX. */
#define SWAP_CLUSTER_MAX 32UL
#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
/*
 * Values and flag bits stored in the one-byte swap_map entries.
 * swap_count() strips SWAP_HAS_CACHE (0x40) from an entry; the result may
 * still carry COUNT_CONTINUED (0x80).
 */
#define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */
#define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */
#define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */
#define SWAP_CONT_MAX 0x7f /* Max count, in each swap_map continuation */
#define COUNT_CONTINUED 0x80 /* See swap_map continuation for full count */
#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs, in first swap_map */
- 18.3 实现似乎处理
有待分析的封装函数
/*
* out-of-line __page_file_ methods to avoid include hell.
*/
struct address_space *__page_file_mapping(struct page *page)
{
return page_swap_info(page)->swap_file->f_mapping;
}
try_to_free_swap
/*
 * try_to_free_swap - release the swap space that backs a locked page.
 *
 * Called when swap is getting full, or when there are no more mappings of
 * this page.  The page must be locked by the caller.
 *
 * Returns 1 if the page was removed from the swap cache, 0 if nothing was
 * done.
 */
int try_to_free_swap(struct page *page)
{
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	/*
	 * Leave the page alone unless it is in the swap cache, is not under
	 * writeback, and no swapped-out reference to it remains.
	 */
	if (!PageSwapCache(page) || PageWriteback(page) || page_swapped(page))
		return 0;

	/*
	 * Once hibernation has begun to create its image of memory, there is
	 * a danger that a call to try_to_free_swap() - most probably from
	 * __try_to_reclaim_swap() while hibernation is allocating its own
	 * swap pages for the image, but conceivably even from memory
	 * reclaim - frees the swap of a page which has already been recorded
	 * in the image as a clean swapcache page, and that swap is then
	 * reused for another page of the image.  On waking from hibernation
	 * the original page might be freed under memory pressure and later
	 * read back in from swap, now with the wrong data.
	 *
	 * Hibernation suspends storage while it is writing the image to
	 * disk, so that is what gets checked here.
	 */
	if (pm_suspended_storage())
		return 0;

	page = compound_head(page);
	/* Drop the page from the swap cache. */
	delete_from_swap_cache(page);
	/*
	 * TODO(review): presumably the page is marked dirty because it no
	 * longer has a copy on swap and must be written out again before it
	 * can be reclaimed - confirm.
	 */
	SetPageDirty(page);
	return 1;
}
/*
 * Report whether any swapped-out reference to @page still exists.
 *
 * In the common case (CONFIG_THP_SWAP disabled, or the page is not part of a
 * transparent huge page) this is simply page_swapcount() != 0.  Otherwise
 * the question is forwarded to swap_page_trans_huge_swapped() for the
 * compound head's swap entry; an entry that cannot be resolved counts as
 * "not swapped".
 */
static bool page_swapped(struct page *page)
{
	struct swap_info_struct *si;
	swp_entry_t entry;

	/* THP_SWAP: swap support for Transparent Huge Pages (rare path). */
	if (IS_ENABLED(CONFIG_THP_SWAP) && unlikely(PageTransCompound(page))) {
		page = compound_head(page);
		entry.val = page_private(page);
		si = _swap_info_get(entry);
		return si ? swap_page_trans_huge_swapped(si, entry) : false;
	}

	return page_swapcount(page) != 0;
}
/*
* How many references to page are currently swapped out?
* This does not give an exact answer when swap count is continued,
* but does include the high COUNT_CONTINUED flag to allow for that.
*/
int page_swapcount(struct page *page) // 通过 swap_map 获取数值而已
{
int count = 0;
struct swap_info_struct *p;
struct swap_cluster_info *ci;
swp_entry_t entry;
unsigned long offset;
entry.val = page_private(page);
p = _swap_info_get(entry);
if (p) {
offset = swp_offset(entry);
ci = lock_cluster_or_swap_info(p, offset);
count = swap_count(p->swap_map[offset]);
unlock_cluster_or_swap_info(p, ci);
}
return count;
}
/*
 * Strip the SWAP_HAS_CACHE flag from a raw swap_map entry.
 * The result may still include the COUNT_CONTINUED flag.
 */
static inline unsigned char swap_count(unsigned char ent)
{
	const unsigned char count_bits = (unsigned char)~SWAP_HAS_CACHE;

	return ent & count_bits;
}
#ifdef CONFIG_SWAP
/*
 * PageSwapCache - test whether @page is in the swap cache.
 *
 * With CONFIG_THP_SWAP the test is applied to the compound head.  The page
 * counts as swap cache only when it is swap-backed AND has PG_swapcache set.
 *
 * TODO(review): the exact relationship between PG_swapcache and
 * PG_swapbacked is still an open question in these notes.
 */
static __always_inline int PageSwapCache(struct page *page)
{
#ifdef CONFIG_THP_SWAP
	page = compound_head(page);
#endif
	return PageSwapBacked(page) && test_bit(PG_swapcache, &page->flags);
}
#endif /* CONFIG_SWAP */
swapoff : 对于 swapfile 终极理解(1)
- 这个东西为什么最终会调用 page_add_new_anon_rmap
- try_to_unuse 了解一下
/*
 * swapoff(2): stop swapping to the area backed by @specialfile.
 *
 * Outline, as visible in the code below:
 *  1. locate the active, writable swap_info_struct whose file mapping is the
 *     same as that of the opened @specialfile;
 *  2. take it off the available/active lists and clear SWP_WRITEOK;
 *  3. try_to_unuse() every page still on the area (re-inserting the area via
 *     reinsert_swap_info() if that fails);
 *  4. clear SWP_VALID, wait for concurrent users, then free the extents,
 *     swap_map, cluster info, frontswap map and per-cpu clusters;
 *  5. close the swap file and finally reset p->flags to 0.
 *
 * Returns 0 on success or a negative errno (-EPERM, -EINVAL, -ENOMEM, an
 * error from getname()/file_open_name(), or the try_to_unuse() error).
 */
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
unsigned char *swap_map;
struct swap_cluster_info *cluster_info;
unsigned long *frontswap_map;
struct file *swap_file, *victim;
struct address_space *mapping;
struct inode *inode;
struct filename *pathname;
int err, found = 0;
unsigned int old_block_size;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
BUG_ON(!current->mm);
pathname = getname(specialfile);
if (IS_ERR(pathname))
return PTR_ERR(pathname);
victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
err = PTR_ERR(victim);
if (IS_ERR(victim))
goto out;
mapping = victim->f_mapping;
/* Search the active list for a writable area that uses the same mapping. */
spin_lock(&swap_lock);
plist_for_each_entry(p, &swap_active_head, list) {
if (p->flags & SWP_WRITEOK) {
if (p->swap_file->f_mapping == mapping) {
found = 1;
break;
}
}
}
if (!found) {
err = -EINVAL;
spin_unlock(&swap_lock);
goto out_dput;
}
/*
 * NOTE(review): presumably this verifies that p->pages worth of data can be
 * absorbed without this swap area before committing - confirm against
 * security_vm_enough_memory_mm().
 */
if (!security_vm_enough_memory_mm(current->mm, p->pages))
vm_unacct_memory(p->pages);
else {
err = -ENOMEM;
spin_unlock(&swap_lock);
goto out_dput;
}
del_from_avail_list(p);
spin_lock(&p->lock);
/*
 * A negative priority: bump the priority fields of every area that follows
 * p on the active list, and least_priority itself.
 * NOTE(review): presumably negative priorities are the auto-assigned ones
 * and this closes the gap left by p - confirm.
 */
if (p->prio < 0) {
struct swap_info_struct *si = p;
int nid;
plist_for_each_entry_continue(si, &swap_active_head, list) {
si->prio++;
si->list.prio--;
for_each_node(nid) {
if (si->avail_lists[nid].prio != 1)
si->avail_lists[nid].prio--;
}
}
least_priority++;
}
/* Unlink the area and remove its pages from the global counters. */
plist_del(&p->list, &swap_active_head);
atomic_long_sub(p->pages, &nr_swap_pages);
total_swap_pages -= p->pages;
p->flags &= ~SWP_WRITEOK;
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
disable_swap_slots_cache_lock();
set_current_oom_origin();
err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
clear_current_oom_origin();
if (err) {
/* re-insert swap space back into swap_list */
reinsert_swap_info(p);
reenable_swap_slots_cache_unlock();
goto out_dput;
}
reenable_swap_slots_cache_unlock();
spin_lock(&swap_lock);
spin_lock(&p->lock);
p->flags &= ~SWP_VALID; /* mark swap device as invalid */
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
/*
* wait for swap operations protected by get/put_swap_device()
* to complete
*/
synchronize_rcu();
flush_work(&p->discard_work);
destroy_swap_extents(p);
if (p->flags & SWP_CONTINUED)
free_swap_count_continuations(p);
/* Mirrors the atomic_inc(&nr_rotate_swap) done by swapon for such areas. */
if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
atomic_dec(&nr_rotate_swap);
mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
spin_lock(&p->lock);
drain_mmlist();
/* wait for anyone still in scan_swap_map */
p->highest_bit = 0; /* cuts scans short */
while (p->flags >= SWP_SCANNING) {
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
schedule_timeout_uninterruptible(1);
spin_lock(&swap_lock);
spin_lock(&p->lock);
}
/*
 * Detach every per-area resource from p while both locks are held; the
 * detached pointers are freed further down, after the locks are dropped.
 */
swap_file = p->swap_file;
old_block_size = p->old_block_size;
p->swap_file = NULL;
p->max = 0;
swap_map = p->swap_map;
p->swap_map = NULL;
cluster_info = p->cluster_info;
p->cluster_info = NULL;
frontswap_map = frontswap_map_get(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
frontswap_invalidate_area(p->type);
frontswap_map_set(p, NULL);
mutex_unlock(&swapon_mutex);
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
vfree(swap_map);
kvfree(cluster_info);
kvfree(frontswap_map);
/* Destroy swap account information */
swap_cgroup_swapoff(p->type);
exit_swap_address_space(p->type);
inode = mapping->host;
/* For a block device, restore the saved block size and release the device. */
if (S_ISBLK(inode->i_mode)) {
struct block_device *bdev = I_BDEV(inode);
set_blocksize(bdev, old_block_size);
blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}
inode_lock(inode);
inode->i_flags &= ~S_SWAPFILE;
inode_unlock(inode);
filp_close(swap_file, NULL);
/*
* Clear the SWP_USED flag after all resources are freed so that swapon
* can reuse this swap_info in alloc_swap_info() safely. It is ok to
* not hold p->lock after we cleared its SWP_WRITEOK.
*/
spin_lock(&swap_lock);
p->flags = 0;
spin_unlock(&swap_lock);
err = 0;
atomic_inc(&proc_poll_event);
wake_up_interruptible(&proc_poll_wait);
out_dput:
filp_close(victim, NULL);
out:
putname(pathname);
return err;
}
swapon : 对于 swapfile 终极理解(2)
- specialfile 是一个字符串,这里是文件还是 dev 有没有区分处理
- priority 体现在什么地方 ?
/*
 * swapon(2): start swapping to the file or block device named by
 * @specialfile, configured by @swap_flags.
 *
 * Outline, as visible in the code below: allocate a swap_info_struct, open
 * and claim the file, read and parse the swap header, allocate the swap_map
 * (and, for non-rotational block devices, the cluster structures), build the
 * extent list, optionally discard the area, set up the swap address space
 * and finally publish the area with enable_swap_info().
 *
 * Returns 0 on success or a negative errno; every failure after
 * alloc_swap_info() unwinds through the bad_swap label.
 */
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
struct swap_info_struct *p;
struct filename *name;
struct file *swap_file = NULL;
struct address_space *mapping;
int prio;
int error;
union swap_header *swap_header;
int nr_extents;
sector_t span;
unsigned long maxpages;
unsigned char *swap_map = NULL;
struct swap_cluster_info *cluster_info = NULL;
unsigned long *frontswap_map = NULL;
struct page *page = NULL;
struct inode *inode = NULL;
bool inced_nr_rotate_swap = false;
if (swap_flags & ~SWAP_FLAGS_VALID)
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!swap_avail_heads)
return -ENOMEM;
// swap_info_struct holds the bookkeeping for one swap area
p = alloc_swap_info();
if (IS_ERR(p))
return PTR_ERR(p);
INIT_WORK(&p->discard_work, swap_discard_work);
// getname() copies the user-space path string into kernel space
name = getname(specialfile);
if (IS_ERR(name)) {
error = PTR_ERR(name);
name = NULL;
goto bad_swap;
}
// the name may refer to a regular file or to a device
swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
if (IS_ERR(swap_file)) {
error = PTR_ERR(swap_file);
swap_file = NULL;
goto bad_swap;
}
p->swap_file = swap_file;
mapping = swap_file->f_mapping;
inode = mapping->host;
// claim_swapfile() sets p->bdev and treats regular files and block devices differently
/* If S_ISREG(inode->i_mode) will do inode_lock(inode); */
error = claim_swapfile(p, inode);
if (unlikely(error))
goto bad_swap;
/*
* Read the swap header.
*/
if (!mapping->a_ops->readpage) {
error = -EINVAL;
goto bad_swap;
}
// read the page holding the swap_header (page index 0 of the mapping)
page = read_mapping_page(mapping, 0, swap_file);
if (IS_ERR(page)) {
error = PTR_ERR(page);
goto bad_swap;
}
swap_header = kmap(page);
// parse the swap_header; a result of 0 pages is treated as invalid
maxpages = read_swap_header(p, swap_header, inode);
if (unlikely(!maxpages)) {
error = -EINVAL;
goto bad_swap;
}
// swap_map: one byte per page slot, holding that slot's usage counter
/* OK, set up the swap map and apply the bad block list */
swap_map = vzalloc(maxpages);
if (!swap_map) {
error = -ENOMEM;
goto bad_swap;
}
// record a backing-device capability: stable pages required during writeback
if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
p->flags |= SWP_STABLE_WRITES;
// record a backing-device capability: synchronous I/O
if (bdi_cap_synchronous_io(inode_to_bdi(inode)))
p->flags |= SWP_SYNCHRONOUS_IO;
// cluster set-up: on a non-rotational block device the area is divided
// into clusters of SWAPFILE_CLUSTER pages
if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
int cpu;
unsigned long ci, nr_cluster;
p->flags |= SWP_SOLIDSTATE;
/*
* select a random position to start with to help wear leveling
* SSD
*/
p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
GFP_KERNEL);
if (!cluster_info) {
error = -ENOMEM;
goto bad_swap;
}
for (ci = 0; ci < nr_cluster; ci++)
spin_lock_init(&((cluster_info + ci)->lock));
p->percpu_cluster = alloc_percpu(struct percpu_cluster);
if (!p->percpu_cluster) {
error = -ENOMEM;
goto bad_swap;
}
for_each_possible_cpu(cpu) {
struct percpu_cluster *cluster;
cluster = per_cpu_ptr(p->percpu_cluster, cpu);
cluster_set_null(&cluster->index);
}
} else {
atomic_inc(&nr_rotate_swap);
inced_nr_rotate_swap = true;
}
// cgroup accounting set-up (not analysed in these notes)
error = swap_cgroup_swapon(p->type, maxpages);
if (error)
goto bad_swap;
// initialise the swap_map contents, the cluster free list and the extents
nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
cluster_info, maxpages, &span);
if (unlikely(nr_extents < 0)) {
error = nr_extents;
goto bad_swap;
}
/* frontswap enabled? set up bit-per-page map for frontswap */
if (IS_ENABLED(CONFIG_FRONTSWAP))
frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages),
sizeof(long),
GFP_KERNEL);
if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { // TODO: what exactly does "discard" mean for the request queue?
/*
* When discard is enabled for swap with no particular
* policy flagged, we set all swap discard flags here in
* order to sustain backward compatibility with older
* swapon(8) releases.
*/
p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
SWP_PAGE_DISCARD);
/*
* By flagging sys_swapon, a sysadmin can tell us to
* either do single-time area discards only, or to just
* perform discards for released swap page-clusters.
* Now it's time to adjust the p->flags accordingly.
*/
if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
p->flags &= ~SWP_PAGE_DISCARD;
else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
p->flags &= ~SWP_AREA_DISCARD;
/* issue a swapon-time discard if it's still required */
if (p->flags & SWP_AREA_DISCARD) {
int err = discard_swap(p);
if (unlikely(err))
pr_err("swapon: discard_swap(%p): %d\n",
p, err);
}
}
// initialise the swap address_space(s) for this area;
// p->type is the index of the area
error = init_swap_address_space(p->type, maxpages);
if (error)
goto bad_swap;
/*
* Flush any pending IO and dirty mappings before we start using this
* swap device.
*/
inode->i_flags |= S_SWAPFILE;
error = inode_drain_writes(inode);
if (error) {
inode->i_flags &= ~S_SWAPFILE;
goto bad_swap;
}
mutex_lock(&swapon_mutex);
// priority: -1 unless SWAP_FLAG_PREFER is set, in which case it is taken
// from the SWAP_FLAG_PRIO_MASK bits of swap_flags
prio = -1;
if (swap_flags & SWAP_FLAG_PREFER)
prio =
(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
// publish the area and add it to the global lists
// TODO: analyse _enable_swap_info() and setup_swap_info() called from here
enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
(p->flags & SWP_DISCARDABLE) ? "D" : "",
(p->flags & SWP_AREA_DISCARD) ? "s" : "",
(p->flags & SWP_PAGE_DISCARD) ? "c" : "",
(frontswap_map) ? "FS" : "");
mutex_unlock(&swapon_mutex);
atomic_inc(&proc_poll_event);
wake_up_interruptible(&proc_poll_wait);
error = 0;
goto out;
/* Error unwind: release everything acquired so far, then fall into out. */
bad_swap:
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
set_blocksize(p->bdev, p->old_block_size);
blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}
destroy_swap_extents(p);
swap_cgroup_swapoff(p->type);
spin_lock(&swap_lock);
p->swap_file = NULL;
p->flags = 0;
spin_unlock(&swap_lock);
vfree(swap_map);
kvfree(cluster_info);
kvfree(frontswap_map);
if (inced_nr_rotate_swap)
atomic_dec(&nr_rotate_swap);
if (swap_file) {
if (inode) {
inode_unlock(inode);
inode = NULL;
}
filp_close(swap_file, NULL);
}
/* Common exit: drop the header page, the name and the inode lock. */
out:
if (page && !IS_ERR(page)) {
kunmap(page);
put_page(page);
}
if (name)
putname(name);
if (inode)
inode_unlock(inode);
if (!error)
enable_swap_slots_cache();
return error;
}
/*
 * Flush file data before changing attributes.
 *
 * Waits for direct I/O on @inode, then writes back and waits on the inode's
 * page-cache mapping.  The caller must hold whatever locks are needed to
 * keep further writes away from the file until it has finished setting
 * flags.
 *
 * Returns the result of filemap_write_and_wait().
 */
static inline int inode_drain_writes(struct inode *inode)
{
	struct address_space *mapping;

	inode_dio_wait(inode);
	mapping = inode->i_mapping;

	return filemap_write_and_wait(mapping);
}
try_to_unuse : 非常的长,也不知道在说什么 @todo
- 两次 while 循环
/*
* If the boolean frontswap is true, only unuse pages_to_unuse pages;
* pages_to_unuse==0 means all pages; ignored if frontswap is false
*
* The function makes three passes and repeats them (label retry) while
* si->inuse_pages is non-zero and no signal is pending:
*   1. shmem_unuse();
*   2. walk the mmlist starting at init_mm and call unuse_mm() on each mm;
*   3. walk the remaining in-use slots and try_to_free_swap() their
*      swap-cache pages.
*
* Returns 0 on success (FRONTSWAP_PAGES_UNUSED is mapped to 0), -EINTR when
* a signal is pending while pages remain in use, or the error reported by
* shmem_unuse()/unuse_mm().
*/
int try_to_unuse(unsigned int type, bool frontswap,
unsigned long pages_to_unuse)
{
struct mm_struct *prev_mm;
struct mm_struct *mm;
struct list_head *p;
int retval = 0;
struct swap_info_struct *si = swap_info[type];
struct page *page;
swp_entry_t entry;
unsigned int i;
if (!si->inuse_pages)
return 0;
if (!frontswap)
pages_to_unuse = 0;
retry:
retval = shmem_unuse(type, frontswap, &pages_to_unuse); // TODO: why is this step so important?
if (retval)
goto out;
/*
 * Pass 2: walk the mmlist.  A reference is held on the mm visited last
 * (prev_mm, initially init_mm) while mmlist_lock is dropped around
 * unuse_mm().
 */
prev_mm = &init_mm;
mmget(prev_mm);
spin_lock(&mmlist_lock);
p = &init_mm.mmlist;
while (si->inuse_pages &&
!signal_pending(current) &&
(p = p->next) != &init_mm.mmlist) {
mm = list_entry(p, struct mm_struct, mmlist);
/* Skip an mm whose mm_users has already dropped to zero. */
if (!mmget_not_zero(mm))
continue;
spin_unlock(&mmlist_lock);
mmput(prev_mm);
prev_mm = mm;
retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
if (retval) {
mmput(prev_mm);
goto out;
}
/*
* Make sure that we aren't completely killing
* interactive performance.
*/
cond_resched();
spin_lock(&mmlist_lock);
}
spin_unlock(&mmlist_lock);
mmput(prev_mm);
/* Pass 3: free swap-cache pages of the slots that are still in use. */
i = 0;
while (si->inuse_pages &&
!signal_pending(current) &&
(i = find_next_to_unuse(si, i, frontswap)) != 0) {
entry = swp_entry(type, i);
page = find_get_page(swap_address_space(entry), i);
if (!page)
continue;
/*
* It is conceivable that a racing task removed this page from
* swap cache just before we acquired the page lock. The page
* might even be back in swap cache on another swap area. But
* that is okay, try_to_free_swap() only removes stale pages.
*/
lock_page(page);
wait_on_page_writeback(page);
try_to_free_swap(page);
unlock_page(page);
put_page(page);
/*
* For frontswap, we just need to unuse pages_to_unuse, if
* it was specified. Need not check frontswap again here as
* we already zeroed out pages_to_unuse if not frontswap.
*/
if (pages_to_unuse && --pages_to_unuse == 0)
goto out;
}
/*
* Lets check again to see if there are still swap entries in the map.
* If yes, we would need to do retry the unuse logic again.
* Under global memory pressure, swap entries can be reinserted back
* into process space after the mmlist loop above passes over them.
*
* Limit the number of retries? No: when mmget_not_zero() above fails,
* that mm is likely to be freeing swap from exit_mmap(), which proceeds
* at its own independent pace; and even shmem_writepage() could have
* been preempted after get_swap_page(), temporarily hiding that swap.
* It's easy and robust (though cpu-intensive) just to keep retrying.
*/
if (si->inuse_pages) {
if (!signal_pending(current))
goto retry;
retval = -EINTR;
}
out:
return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
}
/*
 * Run unuse_vma() over the VMAs of @mm for swap area @type.
 *
 * The VMA list is walked with mmap_sem held for reading.  Only VMAs that
 * have an anon_vma are handed to unuse_vma(); all others are skipped.  The
 * walk stops at the first non-zero result, which is returned; otherwise 0.
 */
static int unuse_mm(struct mm_struct *mm, unsigned int type,
		    bool frontswap, unsigned long *fs_pages_to_unuse)
{
	struct vm_area_struct *vma;
	int ret = 0;

	down_read(&mm->mmap_sem);
	vma = mm->mmap;
	while (vma) {
		if (vma->anon_vma) {
			ret = unuse_vma(vma, type, frontswap,
					fs_pages_to_unuse);
			if (ret)
				break;
		}
		cond_resched();
		vma = vma->vm_next;
	}
	up_read(&mm->mmap_sem);

	return ret;
}
/*
 * Walk the top-level page-table entries covering @vma and hand every usable
 * one to unuse_p4d_range(), i.e. descend to the next page-table level.
 *
 * Returns the first non-zero result of unuse_p4d_range(), or 0 when the
 * whole [vm_start, vm_end) range has been processed.
 */
static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
		     bool frontswap, unsigned long *fs_pages_to_unuse)
{
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	unsigned long next;
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	int ret;

	do {
		next = pgd_addr_end(addr, end);
		/* Entries that are empty (or were bad and got cleared) are skipped. */
		if (!pgd_none_or_clear_bad(pgd)) {
			ret = unuse_p4d_range(vma, pgd, addr, next, type,
					      frontswap, fs_pages_to_unuse);
			if (ret)
				return ret;
		}
	} while (pgd++, addr = next, addr != end);

	return 0;
}
mmget
/**
* mmget() - Pin the address space associated with a &struct mm_struct.
* @mm: The address space to pin.
*
* Make sure that the address space of the given &struct mm_struct doesn't
* go away. This does not protect against parts of the address space being
* modified or freed, however.
*
* Never use this function to pin this address space for an
* unbounded/indefinite amount of time.
*
* Use mmput() to release the reference acquired by mmget().
*
* See also <Documentation/vm/active_mm.rst> for an in-depth explanation
* of &mm_struct.mm_count vs &mm_struct.mm_users.
*/
static inline void mmget(struct mm_struct *mm)
{
atomic_inc(&mm->mm_users);
}
/*
 * Like mmget(), but only takes the reference when mm_users is not already
 * zero.  Returns the result of atomic_inc_not_zero() as a bool.
 */
static inline bool mmget_not_zero(struct mm_struct *mm)
{
	bool pinned = atomic_inc_not_zero(&mm->mm_users);

	return pinned;
}
各种 swap page count
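- page_swapcount / __swap_count / swap_swapcount / __swp_swapcount 四个函数没有实质区别,都是返回 swap_map 数组上的数值(可能带有 COUNT_CONTINUED 标志);只有 swp_swapcount 会沿着 continuation 页累加,返回精确的计数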
/*
* How many references to page are currently swapped out?
* This does not give an exact answer when swap count is continued,
* but does include the high COUNT_CONTINUED flag to allow for that.
*/
int page_swapcount(struct page *page)
{
int count = 0;
struct swap_info_struct *p;
struct swap_cluster_info *ci;
swp_entry_t entry;
unsigned long offset;
entry.val = page_private(page);
p = _swap_info_get(entry);
if (p) {
offset = swp_offset(entry);
ci = lock_cluster_or_swap_info(p, offset);
count = swap_count(p->swap_map[offset]);
unlock_cluster_or_swap_info(p, ci);
}
return count;
}
/*
 * Read the swap_map count of @entry without taking the cluster/swap_info
 * lock; the device is pinned with get_swap_device() for the duration of the
 * read.  Returns 0 when the device cannot be pinned.
 */
int __swap_count(swp_entry_t entry)
{
	pgoff_t offset = swp_offset(entry);
	struct swap_info_struct *si = get_swap_device(entry);
	int refs;

	if (!si)
		return 0;

	refs = swap_count(si->swap_map[offset]);
	put_swap_device(si);

	return refs;
}
/*
 * Read the swap_map count of @entry on the already-resolved area @si, under
 * the cluster (or swap_info) lock.
 */
static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
{
	pgoff_t offset = swp_offset(entry);
	struct swap_cluster_info *ci = lock_cluster_or_swap_info(si, offset);
	int refs = swap_count(si->swap_map[offset]);

	unlock_cluster_or_swap_info(si, ci);
	return refs;
}
/*
 * __swp_swapcount - number of swapped-out references to @entry.
 *
 * Pins the swap device and delegates to swap_swapcount().  The answer is
 * not exact when the count is continued, but it keeps the COUNT_CONTINUED
 * flag so that callers can tell.  Returns 0 when the device cannot be
 * pinned.
 */
int __swp_swapcount(swp_entry_t entry)
{
	struct swap_info_struct *si = get_swap_device(entry);
	int refs;

	if (!si)
		return 0;

	refs = swap_swapcount(si, entry);
	put_swap_device(si);

	return refs;
}
/*
 * swp_swapcount - exact number of swapped-out references to @entry.
 *
 * Unlike the other count helpers this one honours COUNT_CONTINUED: when the
 * flag is set in the first swap_map byte, the continuation pages chained on
 * the lru list of the swap_map page are walked and their digits are added
 * to the total.
 *
 * Returns 0 when @entry cannot be resolved by _swap_info_get().
 */
int swp_swapcount(swp_entry_t entry)
{
	int count, tmp_count, n;
	struct swap_info_struct *p;
	struct swap_cluster_info *ci;
	struct page *page;
	pgoff_t offset;
	unsigned char *map;

	p = _swap_info_get(entry);
	if (!p)
		return 0;

	offset = swp_offset(entry);

	ci = lock_cluster_or_swap_info(p, offset);

	count = swap_count(p->swap_map[offset]);
	if (!(count & COUNT_CONTINUED))
		goto out;

	/* Low-order part of the count; each continuation adds a higher digit. */
	count &= ~COUNT_CONTINUED;
	n = SWAP_MAP_MAX + 1;

	page = vmalloc_to_page(p->swap_map + offset);
	/* From here on, offset is the byte index within a page. */
	offset &= ~PAGE_MASK;
	VM_BUG_ON(page_private(page) != SWP_CONTINUED);

	do {
		page = list_next_entry(page, lru);
		map = kmap_atomic(page);
		tmp_count = map[offset];
		kunmap_atomic(map);

		count += (tmp_count & ~COUNT_CONTINUED) * n;
		n *= (SWAP_CONT_MAX + 1);
	} while (tmp_count & COUNT_CONTINUED);
out:
	unlock_cluster_or_swap_info(p, ci);
	return count;
}
swap_free
/*
 * swap_free - drop one reference on swap entry @entry.
 *
 * The caller has made sure that the swap device corresponding to @entry is
 * still around and has not been recycled.  Entries that _swap_info_get()
 * cannot resolve are ignored.
 */
void swap_free(swp_entry_t entry)
{
	struct swap_info_struct *si = _swap_info_get(entry);

	if (!si)
		return;

	/* Give up a usage of 1 on the entry. */
	__swap_entry_free(si, entry, 1);
}
-
/*
 * Do the actual release work for one swap entry.
 *
 * Gives up @usage on the swap_map entry of @entry under the cluster (or
 * swap_info) lock via __swap_entry_free_locked().  When the resulting usage
 * is 0 the slot itself is handed to free_swap_slot().
 *
 * Returns the usage value left after the release.
 */
static unsigned char __swap_entry_free(struct swap_info_struct *p,
				       swp_entry_t entry, unsigned char usage)
{
	struct swap_cluster_info *ci;
	unsigned long offset = swp_offset(entry);

	ci = lock_cluster_or_swap_info(p, offset);
	usage = __swap_entry_free_locked(p, offset, usage);
	unlock_cluster_or_swap_info(p, ci);

	/* TODO: free_swap_slot() lives in swap_slots.c and is not analysed yet. */
	if (!usage)
		free_swap_slot(entry);

	return usage;
}
/*
 * Give up @usage on swap_map[@offset]; the caller holds the lock that
 * protects the entry.
 *
 * @usage == SWAP_HAS_CACHE clears the cache flag; any other value drops one
 * map reference (a SWAP_MAP_SHMEM entry is simply reset to 0).
 *
 * Returns the new "count | has_cache" value.  When that value is 0 the
 * swap_map byte is written as SWAP_HAS_CACHE instead of 0.
 */
static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
					      unsigned long offset,
					      unsigned char usage)
{
	unsigned char raw = p->swap_map[offset];
	unsigned char has_cache = raw & SWAP_HAS_CACHE;
	unsigned char count = raw & ~SWAP_HAS_CACHE;

	if (usage == SWAP_HAS_CACHE) {
		VM_BUG_ON(!has_cache);
		has_cache = 0;
	} else if (count == SWAP_MAP_SHMEM) {
		/*
		 * Or we could insist on shmem.c using a special
		 * swap_shmem_free() and free_shmem_swap_and_cache()...
		 */
		count = 0;
	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
		if (count != COUNT_CONTINUED)
			count--;
		/* TODO: swap_count_continued() is fairly involved; not analysed yet. */
		else if (swap_count_continued(p, offset, count))
			count = SWAP_MAP_MAX | COUNT_CONTINUED;
		else
			count = SWAP_MAP_MAX;
	}

	usage = count | has_cache;
	p->swap_map[offset] = usage ? usage : SWAP_HAS_CACHE;

	return usage;
}
2. 判断是否真的存在
```c
/*
 * Resolve @entry to its swap_info_struct and additionally require that the
 * slot is in use, i.e. that its swap_map byte is non-zero.
 *
 * Returns the swap_info_struct, or NULL when __swap_info_get() rejects the
 * entry or the slot is unused (the latter is logged).
 */
static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p = __swap_info_get(entry);

	if (!p)
		return NULL;

	if (p->swap_map[swp_offset(entry)])
		return p;

	pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
	return NULL;
}
/*
 * Resolve @entry to its swap_info_struct, validating it on the way.
 *
 * Returns NULL - logging the reason, except for a zero entry - when:
 *   - entry.val is 0;
 *   - no swap_info_struct exists for the entry's type;
 *   - the area does not have SWP_USED set;
 *   - the entry's offset is not below p->max.
 */
static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;
	unsigned long offset;

	if (!entry.val)
		return NULL;

	p = swp_swap_info(entry);
	if (!p) {
		pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
		return NULL;
	}

	if (!(p->flags & SWP_USED)) {
		pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
		return NULL;
	}

	offset = swp_offset(entry);
	if (offset >= p->max) {
		pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
		return NULL;
	}

	return p;
}
/*
 * Look up the swap_info_struct for the type encoded in @entry.
 * Returns whatever swap_type_to_swap_info() returns for that type.
 */
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
	int type = swp_type(entry);

	return swap_type_to_swap_info(type);
}
/*
 * Map a swap type (index) to its swap_info_struct.
 *
 * Returns NULL when @type is not below the current nr_swapfiles; otherwise
 * the swap_info[type] pointer.  nr_swapfiles is read first and the array
 * slot only after the read barrier, so the statement order here must be
 * kept as is.
 */
static struct swap_info_struct *swap_type_to_swap_info(int type)
{
if (type >= READ_ONCE(nr_swapfiles))
return NULL;
smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */
return READ_ONCE(swap_info[type]);
}
/*
 * swp_type - extract the `type' field of a swp_entry_t.
 *
 * The entry is in its arch-independent format; the type is what remains
 * after shifting the value right by SWP_TYPE_SHIFT.
 */
static inline unsigned swp_type(swp_entry_t entry)
{
	return entry.val >> SWP_TYPE_SHIFT;
}
/*
 * Table of swap areas, indexed by swap type; swap_type_to_swap_info() and
 * try_to_unuse() both read swap_info[type].
 */
struct swap_info_struct *swap_info[MAX_SWAPFILES];
/*
 * swp_offset - extract the `offset' field of a swp_entry_t.
 *
 * The entry is in its arch-independent format; the offset is the part of
 * the value selected by SWP_OFFSET_MASK.
 */
static inline pgoff_t swp_offset(swp_entry_t entry)
{
	return SWP_OFFSET_MASK & entry.val;
}
/*
* swapcache pages are stored in the swapper_space radix tree. We want to
* get good packing density in that tree, so the index should be dense in
* the low-order bits.
*
* We arrange the `type' and `offset' fields so that `type' is at the seven
* high-order bits of the swp_entry_t and `offset' is right-aligned in the
* remaining bits. Although `type' itself needs only five bits, we allow for
* shmem/tmpfs to shift it all up a further two bits: see swp_to_radix_entry().
*
* swp_entry_t's are *never* stored anywhere in their arch-dependent format.
*/
#define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)
#define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1)
// In short: `type' sits in the high-order bits and `offset' in the low-order
// bits; swp_type() shifts right by SWP_TYPE_SHIFT, swp_offset() masks with
// SWP_OFFSET_MASK.
reuse_swap_page : page fault 辅助函数,reuse 没有其他 references 的 page
/*
* We can write to an anon page without COW if there are no other references
* to it. And as a side-effect, free up its swap: because the old content
* on disk will never be read, and seeking back there to write new content
* later would only waste time away from clustering.
*
* NOTE: total_map_swapcount should not be relied upon by the caller if
* reuse_swap_page() returns false, but it may be always overwritten
* (see the other implementation for CONFIG_SWAP=n).
*
* @page must be locked.  Returns true when the combined map+swap count
* reported by page_trans_huge_map_swapcount() is at most 1, with two
* exceptions that return false: KSM pages, and a page under writeback whose
* swap area has SWP_STABLE_WRITES set.
*/
bool reuse_swap_page(struct page *page, int *total_map_swapcount)
{
int count, total_mapcount, total_swapcount;
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (unlikely(PageKsm(page)))
return false;
count = page_trans_huge_map_swapcount(page, &total_mapcount,
&total_swapcount);
/* The out parameter is optional. */
if (total_map_swapcount)
*total_map_swapcount = total_mapcount + total_swapcount;
if (count == 1 && PageSwapCache(page) &&
(likely(!PageTransCompound(page)) ||
/* The remaining swap count will be freed soon */
total_swapcount == page_swapcount(page))) {
if (!PageWriteback(page)) {
/* Same release sequence as at the end of try_to_free_swap(). */
page = compound_head(page);
delete_from_swap_cache(page);
SetPageDirty(page);
} else {
swp_entry_t entry;
struct swap_info_struct *p;
entry.val = page_private(page);
/*
 * NOTE(review): p is dereferenced without a NULL check and p->lock is
 * released below, so swap_info_get() presumably returns a non-NULL area
 * with p->lock held for a locked swap-cache page - confirm.
 */
p = swap_info_get(entry);
if (p->flags & SWP_STABLE_WRITES) {
spin_unlock(&p->lock);
return false;
}
spin_unlock(&p->lock);
}
}
return count <= 1;
}
setup_swap_map_and_extents
/*
 * Fill in the swap_map and cluster bookkeeping for a new swap area and build
 * its extent list.
 *
 * Bad pages listed in the header, the header page (page 0) and the padding
 * slots beyond maxpages in the last cluster are accounted as used; every
 * cluster that still has a zero usage count afterwards is put on
 * p->free_clusters.
 *
 * Returns the number of extents reported by setup_swap_extents(), a negative
 * error from it, or -EINVAL for an invalid bad-page entry or an area without
 * any usable page.
 */
static int setup_swap_map_and_extents(struct swap_info_struct *p,
union swap_header *swap_header, // header read from the swap file
unsigned char *swap_map, // allocated by the caller: swap_map = vzalloc(maxpages)
struct swap_cluster_info *cluster_info, // array allocated by the caller (may be NULL); its locks are initialised, data/flags are set up here
unsigned long maxpages, // page count derived from swap_header, i.e. the size of the area
sector_t *span) // output parameter, passed on to setup_swap_extents()
{
unsigned int j, k;
unsigned int nr_good_pages;
int nr_extents;
unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; // swapon randomises p->cluster_next for non-rotational devices
unsigned long i, idx;
nr_good_pages = maxpages - 1; /* omit header page */
cluster_list_init(&p->free_clusters);
cluster_list_init(&p->discard_clusters);
// Mark bad pages as in use.
// Each cluster tracks how many of its pages are in use.
for (i = 0; i < swap_header->info.nr_badpages; i++) {
unsigned int page_nr = swap_header->info.badpages[i];
if (page_nr == 0 || page_nr > swap_header->info.last_page)
return -EINVAL;
if (page_nr < maxpages) {
swap_map[page_nr] = SWAP_MAP_BAD;
nr_good_pages--;
/*
* Haven't marked the cluster free yet, no list
* operation involved
*/
inc_cluster_info_page(p, cluster_info, page_nr);
}
}
// Account the padding slots between maxpages and the end of the last
// cluster as used.
/* Haven't marked the cluster free yet, no list operation involved */
for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
inc_cluster_info_page(p, cluster_info, i);
if (nr_good_pages) {
swap_map[0] = SWAP_MAP_BAD;
/*
* Not mark the cluster free yet, no list
* operation involved
*/
// Page 0 is the header page (see "omit header page" above), so it is
// counted as used as well.
inc_cluster_info_page(p, cluster_info, 0);
p->max = maxpages;
p->pages = nr_good_pages;
// build the extent list
nr_extents = setup_swap_extents(p, span);
if (nr_extents < 0)
return nr_extents;
nr_good_pages = p->pages;
}
if (!nr_good_pages) {
pr_warn("Empty swap-file\n");
return -EINVAL;
}
if (!cluster_info)
return nr_extents;
// Ordering of the free list: clusters are added column by column
// (index = row * SWAP_CLUSTER_COLS + column), so neighbours on the list are
// SWAP_CLUSTER_COLS entries apart in cluster_info.
// NOTE(review): presumably this keeps consecutively allocated clusters off
// the same cache line - confirm against the comment below.
/*
* Reduce false cache line sharing between cluster_info and
* sharing same address space.
*/
for (k = 0; k < SWAP_CLUSTER_COLS; k++) { // one column per iteration
j = (k + col) % SWAP_CLUSTER_COLS;
for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { // every row of that column
idx = i * SWAP_CLUSTER_COLS + j;
if (idx >= nr_clusters) // idx is past the last cluster
continue;
if (cluster_count(&cluster_info[idx])) // cluster already has used pages: not free
continue;
cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); // everything else starts out free
cluster_list_add_tail(&p->free_clusters, cluster_info, // and is appended to the free list
idx);
}
}
return nr_extents;
}
// SWAP_CLUSTER_INFO_COLS: number of struct swap_cluster_info that fit in
// L1_CACHE_BYTES (rounded up).
// SWAP_CLUSTER_SPACE_COLS: number of clusters covered by
// SWAP_ADDRESS_SPACE_PAGES pages (rounded up).
// SWAP_CLUSTER_COLS: the larger of the two.
#define SWAP_CLUSTER_INFO_COLS \
DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
#define SWAP_CLUSTER_SPACE_COLS \
DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
#define SWAP_CLUSTER_COLS \
max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
/*
 * Make @list empty: both its head and its tail are reset with
 * cluster_set_null().
 */
static void cluster_list_init(struct swap_cluster_list *list)
{
	struct swap_cluster_info *head = &list->head;
	struct swap_cluster_info *tail = &list->tail;

	cluster_set_null(head);
	cluster_set_null(tail);
}
/*
 * Reset @info to the "no next cluster" state: data becomes 0 and flags
 * becomes CLUSTER_FLAG_NEXT_NULL.
 */
static inline void cluster_set_null(struct swap_cluster_info *info)
{
	info->data = 0;
	info->flags = CLUSTER_FLAG_NEXT_NULL;
}
/* Flag value stored in swap_cluster_info.flags by cluster_set_null(). */
#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
/*
 * A list of clusters linked by index.  head and tail are swap_cluster_info
 * values used as links: cluster_list_first() reads the index of the first
 * cluster from head.data, and an empty list has both set by
 * cluster_set_null().
 */
struct swap_cluster_list {
struct swap_cluster_info head;
struct swap_cluster_info tail;
};
/*
* We use this to track usage of a cluster. A cluster is a block of swap disk
* space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All
* free clusters are organized into a list. We fetch an entry from the list to
* get a free cluster.
*
* The data field stores next cluster if the cluster is free or cluster usage
* counter otherwise. The flags field determines if a cluster is free. This is
* protected by swap_info_struct.lock.
*/
struct swap_cluster_info {
spinlock_t lock; /*
* Protect swap_cluster_info fields
* and swap_info_struct->swap_map
* elements correspond to the swap
* cluster
*/
unsigned int data:24; // in-use cluster: number of used pages (usage counter); free cluster: index of the next cluster on the list
unsigned int flags:8; // CLUSTER_FLAG_* values
};
/*
 * Account page @page_nr as used in its cluster.
 *
 * If the cluster is still free it is first taken off the free cluster list
 * by alloc_cluster(); then its usage counter is increased by one.  Nothing
 * happens when the area has no cluster information (@cluster_info == NULL).
 */
static void inc_cluster_info_page(struct swap_info_struct *p,
	struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
	struct swap_cluster_info *ci;
	unsigned long idx;

	if (!cluster_info)
		return;

	/* Index of the cluster that contains page_nr. */
	idx = page_nr / SWAPFILE_CLUSTER;
	ci = &cluster_info[idx];

	if (cluster_is_free(ci))
		alloc_cluster(p, idx);

	VM_BUG_ON(cluster_count(ci) >= SWAPFILE_CLUSTER);
	cluster_set_count(ci, cluster_count(ci) + 1);
}
/*
 * Take cluster @idx (a cluster index) off si->free_clusters and reset its
 * count and flags to 0.
 *
 * The cluster must be the first entry of the free list; anything else trips
 * the VM_BUG_ON.
 */
static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
{
	struct swap_cluster_info *clusters = si->cluster_info;

	VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
	cluster_list_del_first(&si->free_clusters, clusters);
	cluster_set_count_flag(&clusters[idx], 0, 0);
}
/*
 * Index of the first cluster on @list, i.e. the "next" link stored in the
 * data field of the list head.
 */
static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
{
	return list->head.data;
}
/* Read the data field of @info, interpreted as the index of the next cluster. */
static inline unsigned int cluster_next(struct swap_cluster_info *info)
{
	unsigned int next = info->data;

	return next;
}
/*
 * Unlink the first cluster of @list and return its index.
 *
 * @ci is the cluster_info array that the list indices refer to.  When head
 * and tail point at the same cluster, the list becomes empty; otherwise the
 * head is advanced to the removed cluster's successor.
 */
static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
					   struct swap_cluster_info *ci)
{
	unsigned int idx = cluster_next(&list->head);

	if (cluster_next(&list->tail) != idx) {
		/* Point the head at the successor of the removed cluster. */
		cluster_set_next_flag(&list->head, cluster_next(&ci[idx]), 0);
		return idx;
	}

	/* The first entry was also the last one. */
	cluster_set_null(&list->head);
	cluster_set_null(&list->tail);
	return idx;
}
setup_swap_extents : 彻底理解 extents 机制
- 所以,extents 是如何被使用的 ?
/*
* A `swap extent' is a simple thing which maps a contiguous range of pages
* onto a contiguous range of disk blocks. An ordered list of swap extents
* is built at swapon time and is then used at swap_writepage/swap_readpage
* time for locating where on disk a page belongs.
*
* If the swapfile is an S_ISBLK block device, a single extent is installed.
* This is done so that the main operating code can treat S_ISBLK and S_ISREG
* swap files identically.
*
* Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
* extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
* swapfiles are handled *identically* after swapon time.
*
* For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
* and will parse them into an ordered extent list, in PAGE_SIZE chunks. If
* some stray blocks are found which do not fall within the PAGE_SIZE alignment
* requirements, they are simply tossed out - we will never use those blocks
* for swapping.
*
* For all swap devices we set S_SWAPFILE across the life of the swapon. This
* prevents users from writing to the swap device, which will corrupt memory.
*
* The amount of disk space which a single swap extent represents varies.
* Typically it is in the 1-4 megabyte range. So we can have hundreds of
* extents in the list. To avoid much list walking, we cache the previous
* search location in `curr_swap_extent', and start new searches from there.
* This is extremely effective. The average number of iterations in
* map_swap_page() has been measured at about 0.3 per page. - akpm.
*/
/*
 * NOTE(review): the last paragraph of the comment above refers to
 * `curr_swap_extent', but nothing by that name is used in this function;
 * that paragraph looks stale - confirm against the rest of the file.
 */
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
	struct file *swap_file = sis->swap_file;
	struct address_space *mapping = swap_file->f_mapping;
	struct inode *inode = mapping->host;
	int ret;

	if (S_ISBLK(inode->i_mode)) {
		/* A block device needs just one extent, installed up front. */
		ret = add_swap_extent(sis, 0, sis->max, 0);
		/* The whole area is reported through *span. */
		*span = sis->pages;
		return ret;
	}

	/* No swap_activate hook for this mapping: use the generic walker. */
	if (!mapping->a_ops->swap_activate)
		return generic_swapfile_activate(sis, swap_file, span);

	ret = mapping->a_ops->swap_activate(sis, swap_file, span);
	if (ret < 0)
		return ret;

	/* Any non-negative result marks the area as activated. */
	sis->flags |= SWP_ACTIVATED;
	if (ret > 0)
		return ret;

	/* A zero result: flag SWP_FS and install one extent here. */
	sis->flags |= SWP_FS;
	ret = add_swap_extent(sis, 0, sis->max, 0);
	*span = sis->pages;
	return ret;
}
swapfile
// 管理其中的结构,关键机制在 cluster 和 extents 即可,基本的 IO 交给下层的 file,所以 swapfile 的功能和 ext2 的功能一致,负责下层的磁盘的布局
- delete_from_swap_cache => put_swap_page => ??
- try_to_free_swap
// 机制
- cluster
- extents
- 全局的 swap_active_head
- avail_lists : 为什么需要给每一个 node 提供孤儿
```c
/*
 * all active swap_info_structs
 * protected with swap_lock, and ordered by priority.
 */
PLIST_HEAD(swap_active_head); /* TODO: is the ordering first by swap_info_struct, then by cluster? */
static void __del_from_avail_list(struct swap_info_struct *p) { int nid;
for_each_node(nid) plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); }
/* Remove @p from all per-node available lists while holding swap_avail_lock. */
static void del_from_avail_list(struct swap_info_struct *p)
{
	spin_lock(&swap_avail_lock);
	__del_from_avail_list(p);
	spin_unlock(&swap_avail_lock);
}
```
结构:
1. 一个 swapfile 对应 swap_info_struct
2. 一个 swapfile 对应多个 cluster,并且使用 cluster_info 描述
回答问题:
0. 当一个文件被设置为 swapfile 的时候,如何阻止被访问。
1. 这几个函数看似都是 free,各自的作用是什么 ?
```c
/*
* Caller has made sure that the swap device corresponding to entry
* is still around or has not been recycled.
*/
void swap_free(swp_entry_t entry)
/*
* If swap is getting full, or if there are no more mappings of this page,
* then try_to_free_swap is called to free its swap space.
*/
int try_to_free_swap(struct page *page)
// TODO 很奇怪,swap 机制为什么和 vma 联系到一起了,这不是曾经的反向映射
/*
* We completely avoid races by reading each swap page in advance,
* and then search for the process using it. All the necessary
* page table adjustments can then be made atomically.
*
* if the boolean frontswap is true, only unuse pages_to_unuse pages;
* pages_to_unuse==0 means all pages; ignored if frontswap is false
*/
int try_to_unuse(unsigned int type, bool frontswap,
unsigned long pages_to_unuse)
static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
```
从 shmem_swapin_page 的内容看,首先调用 delete_from_swap_cache,然后调用 swap_free,前者应该是处理 swap cache 的 radix tree 维护,后者处理 swap slot 的问题。
关键函数分析,这两个函数到时候看书(ULK) 进行补充一下
- swapon : 似乎不难,处理各种机制的建立过程
- swapoff : 如果彻底理解 swapon,那么不难,关键 : try_to_unuse 调用两个函数
- shmem_unuse
- unuse_mm : 逐个清理
si_swapinfo
- si_swapinfo 比想象的复杂,是因为 swapfile 导致的
si_swapinfo 是如何被获取的
似乎是,所有的 page 首先加入到 swap cache 中,再 swap out 的,例如:
shmem_writepage 首先调用 add_to_swap_cache -> folio_alloc_swap 这其实是一个从 bdev 中获取空间
然后去调用 swap_writepage 将 page 换出的。
swp_swap_info(folio->swap)
ublk 是配置容量的吗?
所以 priority 的使用是在 folio_alloc_swap 中处理的
似乎我们对于 SWP_FS_OPS 的理解有错误
本来以为是任何文件系统都是注册过,但是只有 nfs_swap_activate 和 samba 才注册过这个:
sis->flags |= SWP_FS_OPS;
本站所有文章转发 CSDN 将按侵权追究法律责任,其它情况随意。