Skip to the content.

vmx

L0 的模拟 nested 和 L1 以及以上的版本模拟 nested 有区别吗?

并不是一样的,在 L1 中可以观测到非常频繁的 handle_vmread

sudo bpftrace -e "kprobe:handle_vmread { @[kstack] = count(); }"

但是在 L1 中完全观测不到:

此外,L3 虚拟机的出现,让 handle_vmread 的数量差别非常大:

🧀  sudo bpftrace -e "kprobe:handle_vmread { @[kstack] = count(); }"

Attaching 1 probe...
^C

@[
    handle_vmread+5
    vmx_handle_exit+301
    kvm_arch_vcpu_ioctl_run+1718
    kvm_vcpu_ioctl+587
    __x64_sys_ioctl+148
    do_syscall_64+197
    entry_SYSCALL_64_after_hwframe+111
]: 37375
linux on  master [+?] via 🐍 v3.11.8 via ❄️  impure (yyds-env) 13900k took 5s
🧀  sudo bpftrace -e "kprobe:handle_vmread { @[kstack] = count(); }"

Attaching 1 probe...
^C

@[
    handle_vmread+5
    vmx_handle_exit+301
    kvm_arch_vcpu_ioctl_run+1718
    kvm_vcpu_ioctl+587
    __x64_sys_ioctl+148
    do_syscall_64+197
    entry_SYSCALL_64_after_hwframe+111
]: 1333

如何处理更加高层次的 ept

sudo bpftrace -e "kprobe:kvm_tdp_page_fault { @[kstack] = count(); }"
sudo perf ftrace -G kvm_tdp_page_fault -g 'smp_*' -g irq_enter_rcu -g __sysvec_irq_work -g irq_exit_rcu | tee a

L1 的流程图:

# tracer: function_graph
#
# CPU  DURATION                  FUNCTION CALLS
# |     |   |                     |   |   |   |
 3)               |  kvm_tdp_page_fault [kvm]() {
 3)   0.145 us    |    kvm_arch_has_noncoherent_dma [kvm]();
 3)   0.112 us    |    fast_page_fault [kvm]();
 3)               |    mmu_topup_memory_caches [kvm]() {
 3)               |      kvm_mmu_topup_memory_cache [kvm]() {
 3)   0.116 us    |        __kvm_mmu_topup_memory_cache [kvm]();
 3)   0.305 us    |      }
 3)               |      kvm_mmu_topup_memory_cache [kvm]() {
 3)   0.085 us    |        __kvm_mmu_topup_memory_cache [kvm]();
 3)   0.256 us    |      }
 3)               |      kvm_mmu_topup_memory_cache [kvm]() {
 3)   0.088 us    |        __kvm_mmu_topup_memory_cache [kvm]();
 3)   0.252 us    |      }
 3)   1.208 us    |    }
 3)               |    kvm_faultin_pfn [kvm]() {
 3)               |      __gfn_to_pfn_memslot [kvm]() {
 3)               |        hva_to_pfn [kvm]() {
 3)               |          get_user_pages_fast_only() {
 3)   0.101 us    |            is_valid_gup_args();
 3)               |            internal_get_user_pages_fast() {
 3)   0.116 us    |              pud_huge();
 3)   0.091 us    |              pmd_huge();
 3)               |              __pte_offset_map() {
 3)   0.090 us    |                __rcu_read_lock();
 3)   0.265 us    |              }
 3)   0.251 us    |              try_grab_folio();
 3)   0.091 us    |              folio_fast_pin_allowed();
 3)   0.096 us    |              __rcu_read_unlock();
 3)   2.649 us    |            }
 3)   3.061 us    |          }
 3)   3.313 us    |        }
 3)   3.528 us    |      }
 3)   3.795 us    |    }
 3)   0.115 us    |    _raw_read_lock();
 3)   0.335 us    |    is_page_fault_stale [kvm]();
 3)               |    kvm_tdp_mmu_map [kvm]() {
 3)               |      kvm_mmu_hugepage_adjust [kvm]() {
 3)   0.596 us    |        kvm_mmu_max_mapping_level [kvm]();
 3)   1.284 us    |      }
 3)   0.090 us    |      __rcu_read_lock();
 3)               |      tdp_iter_start [kvm]() {
 3)   0.306 us    |        tdp_iter_restart [kvm]();
 3)   0.533 us    |      }
 3)   0.143 us    |      tdp_iter_next [kvm]();
 3)   0.344 us    |      tdp_iter_next [kvm]();
 3)   0.181 us    |      tdp_iter_next [kvm]();
 3)               |      make_spte [kvm]() {
 3)   0.314 us    |        kvm_is_mmio_pfn [kvm]();
 3)               |        vmx_get_mt_mask [kvm_intel]() {
 3)   0.094 us    |          kvm_arch_has_noncoherent_dma [kvm]();
 3)   0.289 us    |        }
 3)               |        mmu_try_to_unsync_pages [kvm]() {
 3)   0.245 us    |          kvm_gfn_is_write_tracked [kvm]();
 3)   0.555 us    |        }
 3)   1.700 us    |      }
 3)   0.243 us    |      handle_changed_spte [kvm]();
 3)   0.109 us    |      __rcu_read_unlock();
 3) + 11.138 us   |    }
 3)   0.106 us    |    _raw_read_unlock();
 3)               |    kvm_release_pfn_clean [kvm]() {
 3)   0.109 us    |      kvm_pfn_to_refcounted_page [kvm]();
 3)               |      kvm_release_page_clean [kvm]() {
 3)               |        mark_page_accessed() {
 3)   0.098 us    |          folio_mark_accessed();
 3)   0.280 us    |        }
 3)   0.522 us    |      }
 3)   0.937 us    |    }
 3) + 24.222 us   |  }
 ------------------------------------------
 3)  qemu-sy-6792  =>  qemu-sy-6789
 ------------------------------------------

 3)               |  kvm_tdp_page_fault [kvm]() {
 3)   0.173 us    |    kvm_arch_has_noncoherent_dma [kvm]();
 3)   0.118 us    |    fast_page_fault [kvm]();
 3)               |    mmu_topup_memory_caches [kvm]() {
 3)               |      kvm_mmu_topup_memory_cache [kvm]() {
 3)   0.116 us    |        __kvm_mmu_topup_memory_cache [kvm]();
 3)   0.355 us    |      }
 3)               |      kvm_mmu_topup_memory_cache [kvm]() {
 3)   0.085 us    |        __kvm_mmu_topup_memory_cache [kvm]();
 3)   0.249 us    |      }
 3)               |      kvm_mmu_topup_memory_cache [kvm]() {
 3)   0.090 us    |        __kvm_mmu_topup_memory_cache [kvm]();
 3)   0.252 us    |      }
 3)   1.254 us    |    }
 3)               |    kvm_faultin_pfn [kvm]() {
 3)               |      __gfn_to_pfn_memslot [kvm]() {
 3)               |        hva_to_pfn [kvm]() {
 3)               |          get_user_pages_fast_only() {
 3)   0.098 us    |            is_valid_gup_args();
 3)               |            internal_get_user_pages_fast() {
 3)   0.145 us    |              pud_huge();
 3)   0.096 us    |              pmd_huge();
 3)               |              __pte_offset_map() {
 3)   0.089 us    |                __rcu_read_lock();
 3)   0.260 us    |              }
 3)   0.368 us    |              try_grab_folio();
 3)   0.091 us    |              folio_fast_pin_allowed();
 3)   0.103 us    |              __rcu_read_unlock();
 3)   3.795 us    |            }
 3)   4.229 us    |          }
 3)   4.485 us    |        }
 3)   4.660 us    |      }
 3)   5.065 us    |    }
 3)   0.148 us    |    _raw_read_lock();
 3)   0.433 us    |    is_page_fault_stale [kvm]();
 3)               |    kvm_tdp_mmu_map [kvm]() {
 3)               |      kvm_mmu_hugepage_adjust [kvm]() {
 3)   0.466 us    |        kvm_mmu_max_mapping_level [kvm]();
 3)   0.716 us    |      }
 3)   0.091 us    |      __rcu_read_lock();
 3)               |      tdp_iter_start [kvm]() {
 3)   0.366 us    |        tdp_iter_restart [kvm]();
 3)   0.588 us    |      }
 3)   0.154 us    |      tdp_iter_next [kvm]();
 3)   0.226 us    |      tdp_iter_next [kvm]();
 3)   0.201 us    |      tdp_iter_next [kvm]();
 3)               |      make_spte [kvm]() {
 3)   0.315 us    |        kvm_is_mmio_pfn [kvm]();
 3)               |        vmx_get_mt_mask [kvm_intel]() {
 3)   0.090 us    |          kvm_arch_has_noncoherent_dma [kvm]();
 3)   0.270 us    |        }
 3)               |        mmu_try_to_unsync_pages [kvm]() {
 3)   0.296 us    |          kvm_gfn_is_write_tracked [kvm]();
 3)   0.693 us    |        }
 3)   1.858 us    |      }
 3)   0.302 us    |      handle_changed_spte [kvm]();
 3)   0.121 us    |      __rcu_read_unlock();
 3)   9.057 us    |    }
 3)   0.110 us    |    _raw_read_unlock();
 3)               |    kvm_release_pfn_clean [kvm]() {
 3)   0.105 us    |      kvm_pfn_to_refcounted_page [kvm]();
 3)               |      kvm_release_page_clean [kvm]() {
 3)               |        mark_page_accessed() {
 3)   0.094 us    |          folio_mark_accessed();
 3)   0.326 us    |        }
 3)   0.564 us    |      }
 3)   0.963 us    |    }
 3) + 25.122 us   |  }
 0)               |    kvm_tdp_mmu_map [kvm]() {
 0)               |      kvm_mmu_hugepage_adjust [kvm]() {
 0)   0.630 us    |        kvm_mmu_max_mapping_level [kvm]();
 0)   0.795 us    |      }
 0)   0.088 us    |      __rcu_read_lock();
 0)               |      tdp_iter_start [kvm]() {
 0)   0.099 us    |        tdp_iter_restart [kvm]();
 0)   0.258 us    |      }
 0)   0.103 us    |      tdp_iter_next [kvm]();
 0)   0.096 us    |      kvm_mmu_memory_cache_alloc [kvm]();
 0)   0.082 us    |      kvm_mmu_memory_cache_alloc [kvm]();
 0)               |      tdp_mmu_init_child_sp [kvm]() {
 0)   0.083 us    |        tdp_mmu_init_sp [kvm]();
 0)   0.233 us    |      }
 0)               |      tdp_mmu_link_sp [kvm]() {
 0)   0.090 us    |        make_nonleaf_spte [kvm]();
 0)   0.113 us    |        handle_changed_spte [kvm]();
 0)               |        __mod_lruvec_page_state() {
 0)   0.089 us    |          __rcu_read_lock();
 0)               |          __mod_lruvec_state() {
 0)   0.086 us    |            __mod_node_page_state();
 0)               |            __mod_memcg_lruvec_state() {
 0)   0.103 us    |              cgroup_rstat_updated();
 0)   0.286 us    |            }
 0)   0.612 us    |          }
 0)   0.097 us    |          __rcu_read_unlock();
 0)   1.182 us    |        }
 0)   3.059 us    |      }
 0)   0.167 us    |      tdp_iter_next [kvm]();
 0)   0.084 us    |      kvm_mmu_memory_cache_alloc [kvm]();
 0)   0.085 us    |      kvm_mmu_memory_cache_alloc [kvm]();
 0)               |      tdp_mmu_init_child_sp [kvm]() {
 0)   0.084 us    |        tdp_mmu_init_sp [kvm]();
 0)   0.233 us    |      }
 0)               |      tdp_mmu_link_sp [kvm]() {
 0)   0.086 us    |        make_nonleaf_spte [kvm]();
 0)   0.109 us    |        handle_changed_spte [kvm]();
 0)               |        __mod_lruvec_page_state() {
 0)   0.083 us    |          __rcu_read_lock();
 0)               |          __mod_lruvec_state() {
 0)   0.084 us    |            __mod_node_page_state();
 0)               |            __mod_memcg_lruvec_state() {
 0)   0.083 us    |              cgroup_rstat_updated();
 0)   0.241 us    |            }
 0)   0.554 us    |          }
 0)   0.086 us    |          __rcu_read_unlock();
 0)   1.025 us    |        }
 0)   2.329 us    |      }
 0)   0.112 us    |      tdp_iter_next [kvm]();
 0)               |      make_spte [kvm]() {
 0)   0.104 us    |        kvm_is_mmio_pfn [kvm]();
 0)               |        vmx_get_mt_mask [kvm_intel]() {
 0)   0.094 us    |          kvm_arch_has_noncoherent_dma [kvm]();
 0)   0.250 us    |        }
 0)   0.609 us    |      }
 0)   0.146 us    |      handle_changed_spte [kvm]();
 0)   0.105 us    |      __rcu_read_unlock();
 0) + 11.053 us   |    }
 0)   0.090 us    |    _raw_read_unlock();
 0)               |    kvm_release_pfn_clean [kvm]() {
 0)   0.088 us    |      kvm_pfn_to_refcounted_page [kvm]();
 0)               |      kvm_release_page_clean [kvm]() {
 0)               |        mark_page_accessed() {
 0)   0.087 us    |          folio_mark_accessed();
 0)   0.249 us    |        }
 0)   0.408 us    |      }
 0)   0.727 us    |    }
 0) + 19.535 us   |  }

L0 的流程图

# tracer: function_graph
#
# CPU  DURATION                  FUNCTION CALLS
# |     |   |                     |   |   |   |
  2)               |  kvm_tdp_page_fault [kvm]() {
  2)   0.122 us    |    kvm_arch_has_noncoherent_dma [kvm]();
  2)   0.225 us    |    kvm_gfn_is_write_tracked [kvm]();
  2)               |    fast_page_fault [kvm]() {
  2)               |      walk_shadow_page_lockless_begin [kvm]() {
  2)   0.062 us    |        __rcu_read_lock();
  2)   0.207 us    |      }
  2)               |      kvm_tdp_mmu_fast_pf_get_last_sptep [kvm]() {
  2)               |        tdp_iter_start [kvm]() {
  2)   0.101 us    |          tdp_iter_restart [kvm]();
  2)   0.353 us    |        }
  2)   0.214 us    |        tdp_iter_next [kvm]();
  2)   0.146 us    |        tdp_iter_next [kvm]();
  2)   0.141 us    |        tdp_iter_next [kvm]();
  2)   0.063 us    |        tdp_iter_next [kvm]();
  2)   1.338 us    |      }
  2)               |      walk_shadow_page_lockless_end [kvm]() {
  2)   0.068 us    |        __rcu_read_unlock();
  2)   0.189 us    |      }
  2)   2.871 us    |    }
  2)               |    mmu_topup_memory_caches [kvm]() {
  2)               |      kvm_mmu_topup_memory_cache [kvm]() {
  2)   0.071 us    |        __kvm_mmu_topup_memory_cache [kvm]();
  2)   0.686 us    |      }
  2)               |      kvm_mmu_topup_memory_cache [kvm]() {
  2)   0.058 us    |        __kvm_mmu_topup_memory_cache [kvm]();
  2)   0.166 us    |      }
  2)               |      kvm_mmu_topup_memory_cache [kvm]() {
  2)   0.057 us    |        __kvm_mmu_topup_memory_cache [kvm]();
  2)   0.165 us    |      }
  2)   1.353 us    |    }
  2)               |    kvm_faultin_pfn [kvm]() {
  2)               |      __gfn_to_pfn_memslot [kvm]() {
  2)               |        hva_to_pfn [kvm]() {
  2)               |          get_user_pages_fast_only() {
  2)   0.072 us    |            is_valid_gup_args();
  2)               |            internal_get_user_pages_fast() {
  2)   0.069 us    |              pud_huge();
  2)   0.233 us    |              try_grab_folio();
  2)   0.634 us    |            }
  2)   0.878 us    |          }
  2)   1.011 us    |        }
  2)   1.145 us    |      }
  2)   1.298 us    |    }
  2)   0.095 us    |    _raw_read_lock();
  2)   0.080 us    |    is_page_fault_stale [kvm]();
  2)               |    kvm_tdp_mmu_map [kvm]() {
  2)               |      kvm_mmu_hugepage_adjust [kvm]() {
  2)   0.204 us    |        __kvm_mmu_max_mapping_level [kvm]();
  2)   0.349 us    |      }
  2)   0.057 us    |      __rcu_read_lock();
  2)               |      tdp_iter_start [kvm]() {
  2)   0.059 us    |        tdp_iter_restart [kvm]();
  2)   0.165 us    |      }
  2)   0.096 us    |      tdp_iter_next [kvm]();
  2)   0.986 us    |      tdp_iter_next [kvm]();
  2)   0.081 us    |      tdp_iter_next [kvm]();
  2)               |      make_spte [kvm]() {
  2)   0.130 us    |        kvm_is_mmio_pfn [kvm]();
  2)               |        vmx_get_mt_mask [kvm_intel]() {
  2)   0.067 us    |          kvm_arch_has_noncoherent_dma [kvm]();
  2)   0.197 us    |        }
  2)               |        mmu_try_to_unsync_pages [kvm]() {
  2)   0.066 us    |          kvm_gfn_is_write_tracked [kvm]();
  2)   0.285 us    |        }
  2)   0.999 us    |      }
  2)   0.756 us    |      handle_changed_spte [kvm]();
  2)   0.067 us    |      __rcu_read_unlock();
  2)   4.573 us    |    }
  2)   0.068 us    |    _raw_read_unlock();
  2)               |    kvm_release_pfn_clean [kvm]() {
  2)   0.110 us    |      kvm_pfn_to_refcounted_page [kvm]();
  2)               |      kvm_release_page_clean [kvm]() {
  2)               |        mark_page_accessed() {
  2)   0.080 us    |          folio_mark_accessed();
  2)   0.190 us    |        }
  2)   0.349 us    |      }
  2)   0.643 us    |    }
  2) + 16.619 us   |  }

intel 在虚拟机也是支持 dirty log 的,有趣哇 nested_vmx_write_pml_buffer

观察下会存在这些区别吗?

void kvm_init_mmu(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
	union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);

	if (mmu_is_nested(vcpu))
		init_kvm_nested_mmu(vcpu, cpu_role);
	else if (tdp_enabled)
		init_kvm_tdp_mmu(vcpu, cpu_role);
	else
		init_kvm_softmmu(vcpu, cpu_role);
}

在 L1 中启动 L2 后,并没有观察到 kvm_init_mmu 中走到 init_kvm_nested_mmu ,真刺激

有趣的 backtrace

@[
    vmx_switch_vmcs+1
    nested_vmx_vmexit+317
    nested_ept_inject_page_fault+76
    ept_page_fault+482
    kvm_mmu_page_fault+313
    vmx_handle_exit+301
    kvm_arch_vcpu_ioctl_run+1718
    kvm_vcpu_ioctl+587
    __x64_sys_ioctl+148
    do_syscall_64+197
    entry_SYSCALL_64_after_hwframe+111
]: 266
@[
    vmx_switch_vmcs+1
    nested_vmx_vmexit+317
    nested_vmx_reflect_vmexit+741
    vmx_handle_exit+116
    kvm_arch_vcpu_ioctl_run+1718
    kvm_vcpu_ioctl+587
    __x64_sys_ioctl+148
    do_syscall_64+197
    entry_SYSCALL_64_after_hwframe+111
]: 6815
@[
    vmx_switch_vmcs+1
    nested_vmx_enter_non_root_mode+375
    nested_vmx_run+291
    vmx_handle_exit+301
    kvm_arch_vcpu_ioctl_run+1718
    kvm_vcpu_ioctl+587
    __x64_sys_ioctl+148
    do_syscall_64+197
    entry_SYSCALL_64_after_hwframe+111
]: 7235

万万没有想到,难道是硬件支持无限

sync_vmcs02_to_vmcs12

看看这两个注释

	/*
	 * Newly recognized interrupts are injected via either virtual interrupt
	 * delivery (RVI) or KVM_REQ_EVENT.  Virtual interrupt delivery is
	 * disabled in two cases:
	 *
	 * 1) If L2 is running and the vCPU has a new pending interrupt.  If L1
	 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
	 * VM-Exit to L1.  If L1 doesn't want to exit, the interrupt is injected
	 * into L2, but KVM doesn't use virtual interrupt delivery to inject
	 * interrupts into L2, and so KVM_REQ_EVENT is again needed.
	 *
	 * 2) If APICv is disabled for this vCPU, assigned devices may still
	 * attempt to post interrupts.  The posted interrupt vector will cause
	 * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
	 */
	if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
		vmx_set_rvi(max_irr);
	else if (got_posted_interrupt)
		kvm_make_request(KVM_REQ_EVENT, vcpu);
void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
{
	/*
	 * When running L2, updating RVI is only relevant when
	 * vmcs12 virtual-interrupt-delivery enabled.
	 * However, it can be enabled only when L1 also
	 * intercepts external-interrupts and in that case
	 * we should not update vmcs02 RVI but instead intercept
	 * interrupt. Therefore, do nothing when running L2.
	 */
	if (!is_guest_mode(vcpu))
		vmx_set_rvi(max_irr);
}

看看这个函数

vmx_deliver_nested_posted_interrupt 还有 vmx_has_nested_events

嵌套还有 posted interupt,真的太可怕了

理解一下这个注释

vmx_update_exception_bitmap

	/* When we are running a nested L2 guest and L1 specified for it a
	 * certain exception bitmap, we must trap the same exceptions and pass
	 * them to L1. When running L2, we will only handle the exceptions
	 * specified above if L1 did not want them.
	 */
	if (is_guest_mode(vcpu))
		eb |= get_vmcs12(vcpu)->exception_bitmap;

本站所有文章转发 CSDN 将按侵权追究法律责任,其它情况随意。