Files
linux/arch/x86/kvm/vmx/main.c
Yan Zhao fbb4adadea KVM: x86: Make cpu_dirty_log_size a per-VM value
Make cpu_dirty_log_size (CPU's dirty log buffer size) a per-VM value and
set the per-VM cpu_dirty_log_size only for normal VMs when PML is enabled.
Do not set it for TDs.

Until now, cpu_dirty_log_size was a system-wide value, used for all
VMs and set to the PML buffer size whenever PML was enabled in VMX.
However, PML is not currently supported for TDs, though it remains
available for normal VMs as long as the feature is supported by
hardware and enabled in VMX.

Making cpu_dirty_log_size a per-VM value allows it to be the PML
buffer size for normal VMs and 0 for TDs. This lets functions like
kvm_arch_sync_dirty_log() and kvm_mmu_update_cpu_dirty_logging()
determine whether PML is supported before kicking vCPUs or requesting
that they update their CPU dirty logging status (turn PML on/off in
the VMCS).
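
A minimal sketch of the resulting shape (hedged: the exact field
placement in kvm->arch and the PML buffer-size constant name are
assumptions for illustration, not the literal diff):

        /* VMX side, e.g. in vmx_vm_init(): only plain VMs get PML. */
        if (enable_pml)
                kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES;

        /* Common x86 side: a per-VM size of 0 now means "no PML". */
        if (!kvm->arch.cpu_dirty_log_size)
                return;

TDs never set the field, so the dirty-logging helpers leave them alone.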

This fixes an issue first reported in [1], where QEMU attaches an
emulated VGA device to a TD; note that KVM_MEM_LOG_DIRTY_PAGES still
works if the corresponding memslot has no KVM_MEM_GUEST_MEMFD flag.
KVM then invokes kvm_mmu_update_cpu_dirty_logging() and from there
vmx_update_cpu_dirty_logging(), which incorrectly accesses a kvm_vmx
struct for a TDX VM.
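
Roughly (a hedged sketch, not the literal callback body), the bad
access is a blind container downcast that is only valid for plain
VMX VMs:

        /*
         * Fine when the VM really is backed by struct kvm_vmx; for a
         * TD the underlying object is a struct kvm_tdx, so the fields
         * read through this pointer are junk.
         */
        struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);

With cpu_dirty_log_size being 0 for TDs, that path is simply never
entered for them.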

Reported-by: ANAND NARSHINHA PATIL <Anand.N.Patil@ibm.com>
Reported-by: Pedro Principeza <pedro.principeza@canonical.com>
Reported-by: Farrah Chen <farrah.chen@intel.com>
Closes: https://github.com/canonical/tdx/issues/202
Link: https://github.com/canonical/tdx/issues/202 [1]
Suggested-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2025-03-14 14:20:53 -04:00


// SPDX-License-Identifier: GPL-2.0
#include <linux/moduleparam.h>
#include "x86_ops.h"
#include "vmx.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "posted_intr.h"
#include "tdx.h"
#include "tdx_arch.h"

static void vt_disable_virtualization_cpu(void)
{
        /* Note, TDX *and* VMX need to be disabled if TDX is enabled. */
        if (enable_tdx)
                tdx_disable_virtualization_cpu();
        vmx_disable_virtualization_cpu();
}

static __init int vt_hardware_setup(void)
{
        int ret;

        ret = vmx_hardware_setup();
        if (ret)
                return ret;

        /*
         * Update vt_x86_ops::vm_size here so it is ready before
         * kvm_ops_update() is called in kvm_x86_vendor_init().
         *
         * Note, the actual bringing up of TDX must be done after
         * kvm_ops_update() because enabling TDX requires enabling
         * hardware virtualization first, i.e., all online CPUs must
         * be in post-VMXON state.  This means the @vm_size here
         * may be updated to TDX's size, but TDX may still fail to
         * enable at a later time.
         *
         * The VMX/VT code could update kvm_x86_ops::vm_size again
         * after bringing up TDX, but this would require exporting
         * either kvm_x86_ops or kvm_ops_update() from the base KVM
         * module, which looks overkill.  Anyway, the worst case here
         * is that KVM may allocate a couple more bytes than needed
         * for each VM.
         */
        if (enable_tdx) {
                vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size,
                                           sizeof(struct kvm_tdx));

                /*
                 * Note, TDX may fail to initialize at a later time in
                 * vt_init(), in which case it is not necessary to set
                 * up these callbacks.  But making them valid here even
                 * when TDX fails to init later is fine, because those
                 * callbacks won't be called if the VM isn't a TDX
                 * guest.
                 */
                vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
                vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
                vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
                vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
        }

        return 0;
}

static int vt_vm_init(struct kvm *kvm)
{
        if (is_td(kvm))
                return tdx_vm_init(kvm);

        return vmx_vm_init(kvm);
}

static void vt_vm_pre_destroy(struct kvm *kvm)
{
        if (is_td(kvm))
                return tdx_mmu_release_hkid(kvm);
}

static void vt_vm_destroy(struct kvm *kvm)
{
        if (is_td(kvm))
                return tdx_vm_destroy(kvm);

        vmx_vm_destroy(kvm);
}

static int vt_vcpu_precreate(struct kvm *kvm)
{
        if (is_td(kvm))
                return 0;

        return vmx_vcpu_precreate(kvm);
}

static int vt_vcpu_create(struct kvm_vcpu *vcpu)
{
        if (is_td_vcpu(vcpu))
                return tdx_vcpu_create(vcpu);

        return vmx_vcpu_create(vcpu);
}

static void vt_vcpu_free(struct kvm_vcpu *vcpu)
{
        if (is_td_vcpu(vcpu)) {
                tdx_vcpu_free(vcpu);
                return;
        }

        vmx_vcpu_free(vcpu);
}

static void vt_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
        if (is_td_vcpu(vcpu))
                return;

        vmx_vcpu_reset(vcpu, init_event);
}

static void vt_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
        if (is_td_vcpu(vcpu)) {
                tdx_vcpu_load(vcpu, cpu);
                return;
        }

        vmx_vcpu_load(vcpu, cpu);
}

static void vt_flush_tlb_all(struct kvm_vcpu *vcpu)
{
        if (is_td_vcpu(vcpu)) {
                tdx_flush_tlb_all(vcpu);
                return;
        }

        vmx_flush_tlb_all(vcpu);
}

static void vt_flush_tlb_current(struct kvm_vcpu *vcpu)
{
        if (is_td_vcpu(vcpu)) {
                tdx_flush_tlb_current(vcpu);
                return;
        }

        vmx_flush_tlb_current(vcpu);
}
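
/*
 * The GVA-scoped and guest-wide flushes below are no-ops for TDs; only
 * the full-flush hooks above have TDX counterparts.
 */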
static void vt_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
        if (is_td_vcpu(vcpu))
                return;

        vmx_flush_tlb_gva(vcpu, addr);
}

static void vt_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
        if (is_td_vcpu(vcpu))
                return;

        vmx_flush_tlb_guest(vcpu);
}

static void vt_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
                            int pgd_level)
{
        if (is_td_vcpu(vcpu)) {
                tdx_load_mmu_pgd(vcpu, root_hpa, pgd_level);
                return;
        }

        vmx_load_mmu_pgd(vcpu, root_hpa, pgd_level);
}

static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
{
        if (!is_td(kvm))
                return -ENOTTY;

        return tdx_vm_ioctl(kvm, argp);
}

static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
{
        if (!is_td_vcpu(vcpu))
                return -EINVAL;

        return tdx_vcpu_ioctl(vcpu, argp);
}

static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
{
        if (is_td(kvm))
                return tdx_gmem_private_max_mapping_level(kvm, pfn);

        return 0;
}

#define VMX_REQUIRED_APICV_INHIBITS                             \
        (BIT(APICV_INHIBIT_REASON_DISABLED) |                   \
         BIT(APICV_INHIBIT_REASON_ABSENT) |                     \
         BIT(APICV_INHIBIT_REASON_HYPERV) |                     \
         BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |                   \
         BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) |        \
         BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |           \
         BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED))
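
/*
 * vt_* hooks dispatch between TDX and VMX based on the VM/vCPU type;
 * hooks left pointing directly at vmx_* have no TDX wrapper here.
 */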
struct kvm_x86_ops vt_x86_ops __initdata = {
        .name = KBUILD_MODNAME,

        .check_processor_compatibility = vmx_check_processor_compat,

        .hardware_unsetup = vmx_hardware_unsetup,

        .enable_virtualization_cpu = vmx_enable_virtualization_cpu,
        .disable_virtualization_cpu = vt_disable_virtualization_cpu,
        .emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu,

        .has_emulated_msr = vmx_has_emulated_msr,

        .vm_size = sizeof(struct kvm_vmx),
        .vm_init = vt_vm_init,
        .vm_pre_destroy = vt_vm_pre_destroy,
        .vm_destroy = vt_vm_destroy,

        .vcpu_precreate = vt_vcpu_precreate,
        .vcpu_create = vt_vcpu_create,
        .vcpu_free = vt_vcpu_free,
        .vcpu_reset = vt_vcpu_reset,

        .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
        .vcpu_load = vt_vcpu_load,
        .vcpu_put = vmx_vcpu_put,

        .update_exception_bitmap = vmx_update_exception_bitmap,
        .get_feature_msr = vmx_get_feature_msr,
        .get_msr = vmx_get_msr,
        .set_msr = vmx_set_msr,

        .get_segment_base = vmx_get_segment_base,
        .get_segment = vmx_get_segment,
        .set_segment = vmx_set_segment,
        .get_cpl = vmx_get_cpl,
        .get_cpl_no_cache = vmx_get_cpl_no_cache,
        .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
        .is_valid_cr0 = vmx_is_valid_cr0,
        .set_cr0 = vmx_set_cr0,
        .is_valid_cr4 = vmx_is_valid_cr4,
        .set_cr4 = vmx_set_cr4,
        .set_efer = vmx_set_efer,
        .get_idt = vmx_get_idt,
        .set_idt = vmx_set_idt,
        .get_gdt = vmx_get_gdt,
        .set_gdt = vmx_set_gdt,
        .set_dr6 = vmx_set_dr6,
        .set_dr7 = vmx_set_dr7,
        .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
        .cache_reg = vmx_cache_reg,
        .get_rflags = vmx_get_rflags,
        .set_rflags = vmx_set_rflags,
        .get_if_flag = vmx_get_if_flag,

        .flush_tlb_all = vt_flush_tlb_all,
        .flush_tlb_current = vt_flush_tlb_current,
        .flush_tlb_gva = vt_flush_tlb_gva,
        .flush_tlb_guest = vt_flush_tlb_guest,

        .vcpu_pre_run = vmx_vcpu_pre_run,
        .vcpu_run = vmx_vcpu_run,
        .handle_exit = vmx_handle_exit,
        .skip_emulated_instruction = vmx_skip_emulated_instruction,
        .update_emulated_instruction = vmx_update_emulated_instruction,
        .set_interrupt_shadow = vmx_set_interrupt_shadow,
        .get_interrupt_shadow = vmx_get_interrupt_shadow,
        .patch_hypercall = vmx_patch_hypercall,
        .inject_irq = vmx_inject_irq,
        .inject_nmi = vmx_inject_nmi,
        .inject_exception = vmx_inject_exception,
        .cancel_injection = vmx_cancel_injection,
        .interrupt_allowed = vmx_interrupt_allowed,
        .nmi_allowed = vmx_nmi_allowed,
        .get_nmi_mask = vmx_get_nmi_mask,
        .set_nmi_mask = vmx_set_nmi_mask,
        .enable_nmi_window = vmx_enable_nmi_window,
        .enable_irq_window = vmx_enable_irq_window,
        .update_cr8_intercept = vmx_update_cr8_intercept,

        .x2apic_icr_is_split = false,
        .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
        .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
        .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
        .load_eoi_exitmap = vmx_load_eoi_exitmap,
        .apicv_pre_state_restore = vmx_apicv_pre_state_restore,
        .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
        .hwapic_isr_update = vmx_hwapic_isr_update,
        .sync_pir_to_irr = vmx_sync_pir_to_irr,
        .deliver_interrupt = vmx_deliver_interrupt,
        .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,

        .set_tss_addr = vmx_set_tss_addr,
        .set_identity_map_addr = vmx_set_identity_map_addr,
        .get_mt_mask = vmx_get_mt_mask,

        .get_exit_info = vmx_get_exit_info,
        .get_entry_info = vmx_get_entry_info,

        .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,

        .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,

        .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
        .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
        .write_tsc_offset = vmx_write_tsc_offset,
        .write_tsc_multiplier = vmx_write_tsc_multiplier,

        .load_mmu_pgd = vt_load_mmu_pgd,

        .check_intercept = vmx_check_intercept,
        .handle_exit_irqoff = vmx_handle_exit_irqoff,

        .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,

        .nested_ops = &vmx_nested_ops,

        .pi_update_irte = vmx_pi_update_irte,
        .pi_start_assignment = vmx_pi_start_assignment,

#ifdef CONFIG_X86_64
        .set_hv_timer = vmx_set_hv_timer,
        .cancel_hv_timer = vmx_cancel_hv_timer,
#endif

        .setup_mce = vmx_setup_mce,

#ifdef CONFIG_KVM_SMM
        .smi_allowed = vmx_smi_allowed,
        .enter_smm = vmx_enter_smm,
        .leave_smm = vmx_leave_smm,
        .enable_smi_window = vmx_enable_smi_window,
#endif

        .check_emulate_instruction = vmx_check_emulate_instruction,
        .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
        .migrate_timers = vmx_migrate_timers,

        .msr_filter_changed = vmx_msr_filter_changed,
        .complete_emulated_msr = kvm_complete_insn_gp,

        .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,

        .get_untagged_addr = vmx_get_untagged_addr,

        .mem_enc_ioctl = vt_mem_enc_ioctl,
        .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,

        .private_max_mapping_level = vt_gmem_private_max_mapping_level,
};

struct kvm_x86_init_ops vt_init_ops __initdata = {
        .hardware_setup = vt_hardware_setup,
        .handle_intel_pt_intr = NULL,

        .runtime_ops = &vt_x86_ops,
        .pmu_ops = &intel_pmu_ops,
};

static void __exit vt_exit(void)
{
        kvm_exit();
        tdx_cleanup();
        vmx_exit();
}
module_exit(vt_exit);

static int __init vt_init(void)
{
        unsigned vcpu_size, vcpu_align;
        int r;

        r = vmx_init();
        if (r)
                return r;

        /* tdx_init() has already been taken care of by this point. */
        r = tdx_bringup();
        if (r)
                goto err_tdx_bringup;

        /*
         * TDX and VMX have different vCPU structures.  Calculate the
         * maximum size/align so that kvm_init() can use the larger
         * values to create the kmem_vcpu_cache.
         */
        vcpu_size = sizeof(struct vcpu_vmx);
        vcpu_align = __alignof__(struct vcpu_vmx);
        if (enable_tdx) {
                vcpu_size = max_t(unsigned, vcpu_size,
                                  sizeof(struct vcpu_tdx));
                vcpu_align = max_t(unsigned, vcpu_align,
                                   __alignof__(struct vcpu_tdx));
        }

        /*
         * Common KVM initialization _must_ come last, after this, /dev/kvm is
         * exposed to userspace!
         */
        r = kvm_init(vcpu_size, vcpu_align, THIS_MODULE);
        if (r)
                goto err_kvm_init;

        return 0;

err_kvm_init:
        tdx_cleanup();
err_tdx_bringup:
        vmx_exit();
        return r;
}
module_init(vt_init);