Skip to the content.

MMU Notifier

mmu_notifier

kvm_mmu_notifier_invalidate_range_start : 这是关键,end 几乎没有内容,invalidate_range 是给 mmu notifier 使用的.

kvm_mmu_notifier_invalidate_range_start ==> kvm_unmap_hva_range ==> kvm_handle_hva_range

static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
					   struct kvm_memory_slot *slot)
{
	unsigned long idx;

	idx = gfn_to_index(gfn, slot->base_gfn, level);
	return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
}
static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			 struct kvm_memory_slot *slot, gfn_t gfn, int level,
			 unsigned long data)
{
	u64 *sptep;
	struct rmap_iterator iter;
	int young = 0;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		young |= mmu_spte_age(sptep);

	trace_kvm_age_page(gfn, level, slot, young);
	return young;
}

暂时的理解是 : host 告诉一个区域的 page 有问题了, 根据 HVA 可以知道 GPA 的物理地址出现问题, 那么需要知道找到对应的 shadow page table 进行处理.

由此可以想象 : 当 host 将 guest 的物理页面 swap 出去,由此可以 从而修改 shadow page table

kvm_arch_mmu_notifier_invalidate_range: 似乎是一个为 APIC 制作的

void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
					    unsigned long start, unsigned long end)
{
	unsigned long apic_address;

	/*
	 * The physical address of apic access page is stored in the VMCS.
	 * Update it when it becomes invalid.
	 */
	apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
	if (start <= apic_address && apic_address < end)
		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
}

对应的通知机制是是给 guest 发出通知消息

struct kvm_page_track_notifier_node {
	struct hlist_node node;

	/*
	 * It is called when guest is writing the write-tracked page
	 * and write emulation is finished at that time.
	 *
	 * @vcpu: the vcpu where the write access happened.
	 * @gpa: the physical address written by guest.
	 * @new: the data was written to the address.
	 * @bytes: the written length.
	 * @node: this node
	 */
	void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
			    int bytes, struct kvm_page_track_notifier_node *node);
	/*
	 * It is called when memory slot is being moved or removed
	 * users can drop write-protection for the pages in that memory slot
	 *
	 * @kvm: the kvm where memory slot being moved or removed
	 * @slot: the memory slot being moved or removed
	 * @node: this node
	 */
	void (*track_flush_slot)(struct kvm *kvm, struct kvm_memory_slot *slot,
			    struct kvm_page_track_notifier_node *node);
};

https://lwn.net/Articles/266320/

Guest systems maintain their own page tables, but they are not the tables used by the memory management unit. Instead, whenever the guest makes a change to its tables, the host system intercepts the operation, checks it for validity, then mirrors the change in the real page tables, which “shadow” those maintained by the guest.

In particular, if the host system decides that it wants to push a given page out to swap, it can’t tell the guest that the page is no longer resident.

通过 notifier 告诉他的物理地址被换出了

https://lwn.net/Articles/732952/

check it /home/martins3/core/linux/Documentation/mm/mmu_notifier.rst

配合 mmu notifier 理解:

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_range	= kvm_mmu_notifier_invalidate_range,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.clear_young		= kvm_mmu_notifier_clear_young,
	.test_young		= kvm_mmu_notifier_test_young,
	.change_pte		= kvm_mmu_notifier_change_pte,
	.release		= kvm_mmu_notifier_release,
};

mmu notifier

1 is worth reading !

Guest ram is mostly allocated by user process with memalign() and kvm get physical memory with get_user_pages.

The ‘MMU Notifier’ functionality can be also used by other subsystems like GRU and XPMEM to export the user virtual address space of computational tasks to other nodes

This will also allow KVM guest physical ram itself to be exported to other nodes through GRU and XPMEM or any other RDMA engine

———— function calling chain ————————– begin —

mmu_notifier_invalidate_range_start
  --> __mmu_notifier_invalidate_range_start
    --> mn_itree_invalidate
    --> mn_hlist_invalidate_range_start : call list one by one
mmu_notifier_invalidate_range_end
  --> __mmu_notifier_invalidate_range_end
    --> mn_itree_inv_end
    --> mn_hlist_invalidate_end
__mmu_notifier_register :
1. if mm->notifier_subscriptions is NULL, alloc and init one for it
2. is parameter subscription is not NULL, add it to mm->notifier_subscriptions list, mm->notifier_subscriptions->has_itree = true; otherwise
  mm_drop_all_locks(mm);

———— function calling chain ————————– begin —

———— critical struct ————————– begin —

/*
 * The notifier chains are protected by mmap_lock and/or the reverse map
 * semaphores. Notifier chains are only changed when all reverse maps and
 * the mmap_lock locks are taken.
 *
 * Therefore notifier chains can only be traversed when either
 *
 * 1. mmap_lock is held.
 * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem).
 * 3. No other concurrent thread can access the list (release)
 */
struct mmu_notifier {
  struct hlist_node hlist;
  const struct mmu_notifier_ops *ops;
  struct mm_struct *mm;
  struct rcu_head rcu;
  unsigned int users;
};

/**
 * struct mmu_interval_notifier_ops
 * @invalidate: Upon return the caller must stop using any SPTEs within this
 *              range. This function can sleep. Return false only if sleeping
 *              was required but mmu_notifier_range_blockable(range) is false.
 */
struct mmu_interval_notifier_ops {
  bool (*invalidate)(struct mmu_interval_notifier *interval_sub,
         const struct mmu_notifier_range *range,
         unsigned long cur_seq);
};

struct mmu_interval_notifier {
  struct interval_tree_node interval_tree;
  const struct mmu_interval_notifier_ops *ops;
  struct mm_struct *mm;
  struct hlist_node deferred_item;
  unsigned long invalidate_seq;
};


struct mmu_notifier_range {
  struct vm_area_struct *vma;
  struct mm_struct *mm;
  unsigned long start;
  unsigned long end;
  unsigned flags;
  enum mmu_notifier_event event;
  void *migrate_pgmap_owner;
};

/*
 * The mmu_notifier_subscriptions structure is allocated and installed in
 * mm->notifier_subscriptions inside the mm_take_all_locks() protected
 * critical section and it's released only when mm_count reaches zero
 * in mmdrop().
 */
struct mmu_notifier_subscriptions {
  /* all mmu notifiers registered in this mm are queued in this list */
  struct hlist_head list;
  bool has_itree;
  /* to serialize the list modifications and hlist_unhashed */
  spinlock_t lock;
  unsigned long invalidate_seq;
  unsigned long active_invalidate_ranges;
  struct rb_root_cached itree;
  wait_queue_head_t wq;
  struct hlist_head deferred_list;
};

本站所有文章转发 CSDN 将按侵权追究法律责任,其它情况随意。

  1. lwn : Memory management notifiers