diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 03a550630644..44854a67bc63 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3247,8 +3247,8 @@ Kernel parameters for the host. To force nVHE on VHE hardware, add "arm64_sw.hvhe=0 id_aa64mmfr1.vh=0" to the command-line. - "nested" is experimental and should be used with - extreme caution. + "nested" and "protected" are experimental and should be + used with extreme caution. kvm-arm.vgic_v3_group0_trap= [KVM,ARM,EARLY] Trap guest accesses to GICv3 group-0 diff --git a/Documentation/trace/index.rst b/Documentation/trace/index.rst index 338bc4d7cfab..036db96864d2 100644 --- a/Documentation/trace/index.rst +++ b/Documentation/trace/index.rst @@ -91,6 +91,17 @@ interactions. user_events uprobetracer +Remote Tracing +-------------- + +This section covers the framework for reading compatible ring-buffers written by +entities outside of the kernel (most likely firmware or a hypervisor). + +.. toctree:: + :maxdepth: 1 + + remotes + Additional Resources -------------------- diff --git a/Documentation/trace/remotes.rst b/Documentation/trace/remotes.rst new file mode 100644 index 000000000000..1f9d764f69aa --- /dev/null +++ b/Documentation/trace/remotes.rst @@ -0,0 +1,66 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Tracing Remotes +=============== + +:Author: Vincent Donnefort + +Overview +======== +Firmware and hypervisors are black boxes to the kernel. Having a way to see what +they are doing can be useful for debugging both. This is where remote tracing +buffers come in. A remote tracing buffer is a ring buffer written by the firmware +or hypervisor into memory that is memory-mapped by the host kernel. This is +similar to how user space memory maps the kernel ring buffer, but in this case +the kernel is acting like user space and the firmware or hypervisor is the +"kernel" side. With a trace remote ring buffer, the firmware or hypervisor can +record events that the host kernel can read and expose to user space. + +Register a remote +================= +A remote must provide a set of callbacks `struct trace_remote_callbacks`, whose +description can be found below. Those callbacks allow Tracefs to enable and +disable tracing and events, to load and unload a tracing buffer (a set of +ring-buffers) and to swap a reader page with the head page, which enables +consuming reads. + +.. kernel-doc:: include/linux/trace_remote.h + +Once registered, an instance will appear for this remote in the Tracefs +directory **remotes/**. Buffers can then be read using the usual Tracefs files +**trace_pipe** and **trace**. + +Declare a remote event +====================== +Macros are provided to ease the declaration of remote events, in a similar +fashion to in-kernel events. A declaration must provide an ID, a description of +the event arguments and how to print the event: + +.. code-block:: c + + REMOTE_EVENT(foo, EVENT_FOO_ID, + RE_STRUCT( + re_field(u64, bar) + ), + RE_PRINTK("bar=%lld", __entry->bar) + ); + +Then those events must be declared in a C file with the following: + +.. code-block:: c + + #define REMOTE_EVENT_INCLUDE_FILE foo_events.h + #include + +This will provide a `struct remote_event remote_event_foo` that can be given to +`trace_remote_register`. + +Registered events appear in the remote directory under **events/**.
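+ +As an illustration, once a remote and its events are registered, a user space +consumer can stream the buffer through **trace_pipe** like any other Tracefs +instance. The sketch below is only an example: the remote name (``hyp``) is a +placeholder and depends on the name the remote was registered with. + +.. code-block:: c + + #include <fcntl.h> + #include <stdio.h> + #include <unistd.h> + + int main(void) + { + /* "hyp" is a placeholder for the registered remote name */ + int fd = open("/sys/kernel/tracing/remotes/hyp/trace_pipe", O_RDONLY); + char buf[4096]; + ssize_t len; + + if (fd < 0) { + perror("open"); + return 1; + } + + /* trace_pipe blocks until data is available; reading it + * consumes the events, as described above. */ + while ((len = read(fd, buf, sizeof(buf))) > 0) + fwrite(buf, 1, len, stdout); + + close(fd); + return 0; + }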
+ +Simple ring-buffer +================== +A simple implementation of a ring-buffer writer can be found in +kernel/trace/simple_ring_buffer.c. + +.. kernel-doc:: include/linux/simple_ring_buffer.h diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 032516783e96..03d87d9b97d9 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -907,10 +907,12 @@ The irq_type field has the following values: - KVM_ARM_IRQ_TYPE_CPU: out-of-kernel GIC: irq_id 0 is IRQ, irq_id 1 is FIQ - KVM_ARM_IRQ_TYPE_SPI: - in-kernel GIC: SPI, irq_id between 32 and 1019 (incl.) + in-kernel GICv2/GICv3: SPI, irq_id between 32 and 1019 (incl.) (the vcpu_index field is ignored) + in-kernel GICv5: SPI, irq_id between 0 and 65535 (incl.) - KVM_ARM_IRQ_TYPE_PPI: - in-kernel GIC: PPI, irq_id between 16 and 31 (incl.) + in-kernel GICv2/GICv3: PPI, irq_id between 16 and 31 (incl.) + in-kernel GICv5: PPI, irq_id between 0 and 127 (incl.) (The irq_id field thus corresponds nicely to the IRQ ID in the ARM GIC specs) diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst index ec09881de4cf..0856b4942e05 100644 --- a/Documentation/virt/kvm/arm/index.rst +++ b/Documentation/virt/kvm/arm/index.rst @@ -10,6 +10,7 @@ ARM fw-pseudo-registers hyp-abi hypercalls + pkvm pvtime ptp_kvm vcpu-features diff --git a/Documentation/virt/kvm/arm/pkvm.rst b/Documentation/virt/kvm/arm/pkvm.rst new file mode 100644 index 000000000000..514992a79a83 --- /dev/null +++ b/Documentation/virt/kvm/arm/pkvm.rst @@ -0,0 +1,106 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================== +Protected KVM (pKVM) +==================== + +**NOTE**: pKVM is currently an experimental development feature and +subject to breaking changes as new isolation features are implemented. +Please reach out to the developers at kvmarm@lists.linux.dev if you have +any questions. + +Overview +======== + +Booting a host kernel with '``kvm-arm.mode=protected``' enables +"Protected KVM" (pKVM). During boot, pKVM installs a stage-2 identity +map page-table for the host and uses it to isolate the hypervisor +running at EL2 from the rest of the host running at EL1/0. + +pKVM permits creation of protected virtual machines (pVMs) by passing +the ``KVM_VM_TYPE_ARM_PROTECTED`` machine type identifier to the +``KVM_CREATE_VM`` ioctl(). The hypervisor isolates pVMs from the host by +unmapping pages from the stage-2 identity map as they are accessed by a +pVM. Hypercalls are provided for a pVM to share specific regions of its +IPA space back with the host, allowing for communication with the VMM. +A Linux guest must be configured with ``CONFIG_ARM_PKVM_GUEST=y`` in +order to issue these hypercalls. + +See hypercalls.rst for more details. + +Isolation mechanisms +==================== + +pKVM relies on a number of mechanisms to isolate pVMs from the host: + +CPU memory isolation +-------------------- + +Status: Isolation of anonymous memory and metadata pages. + +Metadata pages (e.g. page-table pages and '``struct kvm_vcpu``' pages) +are donated from the host to the hypervisor during pVM creation and +are consequently unmapped from the stage-2 identity map until the pVM is +destroyed. + +Similarly to regular KVM, pages are lazily mapped into the guest in +response to stage-2 page faults handled by the host. However, when +running a pVM, these pages are first pinned and then unmapped from the +stage-2 identity map as part of the donation procedure.
This gives rise +to some user-visible differences when compared to non-protected VMs, +largely due to the lack of MMU notifiers: + +* Memslots cannot be moved or deleted once the pVM has started running. +* Read-only memslots and dirty logging are not supported. +* With the exception of swap, file-backed pages cannot be mapped into a + pVM. +* Donated pages are accounted against ``RLIMIT_MLOCK`` and so the VMM + must have a sufficient resource limit or be granted ``CAP_IPC_LOCK``. + The lack of a runtime reclaim mechanism means that memory locked for + a pVM will remain locked until the pVM is destroyed. +* Changes to the VMM address space (e.g. a ``MAP_FIXED`` mmap() over a + mapping associated with a memslot) are not reflected in the guest and + may lead to loss of coherency. +* Accessing pVM memory that has not been shared back will result in the + delivery of a SIGSEGV. +* If a system call accesses pVM memory that has not been shared back + then it will either return ``-EFAULT`` or forcefully reclaim the + memory pages. Reclaimed memory is zeroed by the hypervisor and a + subsequent attempt to access it in the pVM will return ``-EFAULT`` + from the ``KVM_RUN`` ioctl(). + +CPU state isolation +------------------- + +Status: **Unimplemented.** + +DMA isolation using an IOMMU +---------------------------- + +Status: **Unimplemented.** + +Proxying of Trustzone services +------------------------------ + +Status: FF-A and PSCI calls from the host are proxied by the pKVM +hypervisor. + +The FF-A proxy ensures that the host cannot share pVM or hypervisor +memory with Trustzone as part of a "confused deputy" attack. + +The PSCI proxy ensures that CPUs always have the stage-2 identity map +installed when they are executing in the host. + +Protected VM firmware (pvmfw) +----------------------------- + +Status: **Unimplemented.** + +Resources +========= + +Quentin Perret's KVM Forum 2022 talk entitled "Protected KVM on arm64: A +technical deep dive" remains a good resource for learning more about +pKVM, despite some of the details having changed in the meantime: + +https://www.youtube.com/watch?v=9npebeVFbFw diff --git a/Documentation/virt/kvm/devices/arm-vgic-v5.rst b/Documentation/virt/kvm/devices/arm-vgic-v5.rst new file mode 100644 index 000000000000..29335ea823fc --- /dev/null +++ b/Documentation/virt/kvm/devices/arm-vgic-v5.rst @@ -0,0 +1,50 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================================================== +ARM Virtual Generic Interrupt Controller v5 (VGICv5) +==================================================== + + +Device types supported: + - KVM_DEV_TYPE_ARM_VGIC_V5 ARM Generic Interrupt Controller v5.0 + +Only one VGIC instance may be instantiated through this API. The created VGIC +will act as the VM interrupt controller, requiring emulated user-space devices +to inject interrupts to the VGIC instead of directly to CPUs. + +Creating a guest GICv5 device requires a GICv5 host. The current VGICv5 +device only supports PPI interrupts. These can either be injected from emulated +in-kernel devices (such as the Arch Timer or PMU), or via the KVM_IRQ_LINE +ioctl. + +Groups: + KVM_DEV_ARM_VGIC_GRP_CTRL + Attributes: + + KVM_DEV_ARM_VGIC_CTRL_INIT + request the initialization of the VGIC, no additional parameter in + kvm_device_attr.addr. Must be called after all VCPUs have been created. + + KVM_DEV_ARM_VGIC_USERSPACE_PPIS + request the mask of userspace-drivable PPIs.
Only a subset of the PPIs can + be directly driven from userspace with GICv5, and the returned mask + informs userspace which PPIs it is allowed to drive via KVM_IRQ_LINE. + + Userspace must allocate a __u64[2] array and point to it in + kvm_device_attr.addr. When this call returns, the provided memory will be + populated with the userspace PPI mask. The lower __u64 contains the mask + for the lower 64 PPIs, with the remaining 64 being in the second __u64. + + This is a read-only attribute and cannot be set; attempts to set it are + rejected. + + Errors: + + ======= ======================================================== + -ENXIO VGIC not properly configured as required prior to calling + this attribute + -ENODEV no online VCPU + -ENOMEM memory shortage when allocating vgic internal data + -EFAULT Invalid guest RAM access + -EBUSY One or more VCPUs are running + ======= ======================================================== diff --git a/Documentation/virt/kvm/devices/index.rst b/Documentation/virt/kvm/devices/index.rst index 192cda7405c8..70845aba38f4 100644 --- a/Documentation/virt/kvm/devices/index.rst +++ b/Documentation/virt/kvm/devices/index.rst @@ -10,6 +10,7 @@ Devices arm-vgic-its arm-vgic arm-vgic-v3 + arm-vgic-v5 mpic s390_flic vcpu diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst index 60bf205cb373..5e3805820010 100644 --- a/Documentation/virt/kvm/devices/vcpu.rst +++ b/Documentation/virt/kvm/devices/vcpu.rst @@ -37,7 +37,8 @@ Returns: A value describing the PMUv3 (Performance Monitor Unit v3) overflow interrupt number for this vcpu. This interrupt could be a PPI or SPI, but the interrupt type must be same for each vcpu. As a PPI, the interrupt number is the same for -all vcpus, while as an SPI it must be a separate number per vcpu. +all vcpus, while as an SPI it must be a separate number per vcpu. For +GICv5-based guests, the architected PPI (23) must be used. 1.2 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_INIT --------------------------------------- @@ -50,7 +51,7 @@ Returns: -EEXIST Interrupt number already used -ENODEV PMUv3 not supported or GIC not initialized -ENXIO PMUv3 not supported, missing VCPU feature or interrupt - number not set + number not set (non-GICv5 guests only) -EBUSY PMUv3 already initialized ======= ====================================================== diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 85f4c1615472..144a757e937f 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -50,7 +50,6 @@ * effectively VHE-only or not. */ msr_hcr_el2 x0 // Setup HCR_EL2 as nVHE - isb mov x1, #1 // Write something to FAR_EL1 msr far_el1, x1 isb @@ -64,7 +63,6 @@ .LnE2H0_\@: orr x0, x0, #HCR_E2H msr_hcr_el2 x0 - isb .LnVHE_\@: .endm @@ -248,6 +246,8 @@ ICH_HFGWTR_EL2_ICC_CR0_EL1 | \ ICH_HFGWTR_EL2_ICC_APR_EL1) msr_s SYS_ICH_HFGWTR_EL2, x0 // Disable reg write traps + mov x0, #(ICH_VCTLR_EL2_En) + msr_s SYS_ICH_VCTLR_EL2, x0 // Enable vHPPI selection .Lskip_gicv5_\@: .endm diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index a1ad12c72ebf..37414440cee7 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -51,7 +51,7 @@ #include enum __kvm_host_smccc_func { - /* Hypercalls available only prior to pKVM finalisation */ + /* Hypercalls that are unavailable once pKVM has finalised.
*/ /* __KVM_HOST_SMCCC_FUNC___kvm_hyp_init */ __KVM_HOST_SMCCC_FUNC___pkvm_init = __KVM_HOST_SMCCC_FUNC___kvm_hyp_init + 1, __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping, @@ -60,16 +60,9 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs, __KVM_HOST_SMCCC_FUNC___vgic_v3_get_gic_config, __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize, + __KVM_HOST_SMCCC_FUNC_MIN_PKVM = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize, - /* Hypercalls available after pKVM finalisation */ - __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp, - __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp, - __KVM_HOST_SMCCC_FUNC___pkvm_host_share_guest, - __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_guest, - __KVM_HOST_SMCCC_FUNC___pkvm_host_relax_perms_guest, - __KVM_HOST_SMCCC_FUNC___pkvm_host_wrprotect_guest, - __KVM_HOST_SMCCC_FUNC___pkvm_host_test_clear_young_guest, - __KVM_HOST_SMCCC_FUNC___pkvm_host_mkyoung_guest, + /* Hypercalls that are always available and common to [nh]VHE/pKVM. */ __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc, __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run, __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context, @@ -81,14 +74,40 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff, __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs, __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs, + __KVM_HOST_SMCCC_FUNC___vgic_v5_save_apr, + __KVM_HOST_SMCCC_FUNC___vgic_v5_restore_vmcr_apr, + __KVM_HOST_SMCCC_FUNC_MAX_NO_PKVM = __KVM_HOST_SMCCC_FUNC___vgic_v5_restore_vmcr_apr, + + /* Hypercalls that are available only when pKVM has finalised. */ + __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp, + __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp, + __KVM_HOST_SMCCC_FUNC___pkvm_host_donate_guest, + __KVM_HOST_SMCCC_FUNC___pkvm_host_share_guest, + __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_guest, + __KVM_HOST_SMCCC_FUNC___pkvm_host_relax_perms_guest, + __KVM_HOST_SMCCC_FUNC___pkvm_host_wrprotect_guest, + __KVM_HOST_SMCCC_FUNC___pkvm_host_test_clear_young_guest, + __KVM_HOST_SMCCC_FUNC___pkvm_host_mkyoung_guest, __KVM_HOST_SMCCC_FUNC___pkvm_reserve_vm, __KVM_HOST_SMCCC_FUNC___pkvm_unreserve_vm, __KVM_HOST_SMCCC_FUNC___pkvm_init_vm, __KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu, - __KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm, + __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_in_poison_fault, + __KVM_HOST_SMCCC_FUNC___pkvm_force_reclaim_guest_page, + __KVM_HOST_SMCCC_FUNC___pkvm_reclaim_dying_guest_page, + __KVM_HOST_SMCCC_FUNC___pkvm_start_teardown_vm, + __KVM_HOST_SMCCC_FUNC___pkvm_finalize_teardown_vm, __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_load, __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_put, __KVM_HOST_SMCCC_FUNC___pkvm_tlb_flush_vmid, + __KVM_HOST_SMCCC_FUNC___tracing_load, + __KVM_HOST_SMCCC_FUNC___tracing_unload, + __KVM_HOST_SMCCC_FUNC___tracing_enable, + __KVM_HOST_SMCCC_FUNC___tracing_swap_reader, + __KVM_HOST_SMCCC_FUNC___tracing_update_clock, + __KVM_HOST_SMCCC_FUNC___tracing_reset, + __KVM_HOST_SMCCC_FUNC___tracing_enable_event, + __KVM_HOST_SMCCC_FUNC___tracing_write_event, }; #define DECLARE_KVM_VHE_SYM(sym) extern char sym[] @@ -291,7 +310,8 @@ asmlinkage void __noreturn hyp_panic_bad_stack(void); asmlinkage void kvm_unexpected_el2_exception(void); struct kvm_cpu_context; void handle_trap(struct kvm_cpu_context *host_ctxt); -asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on); +asmlinkage void __noreturn __kvm_host_psci_cpu_on_entry(void); +asmlinkage void __noreturn __kvm_host_psci_cpu_resume_entry(void); void __noreturn __pkvm_init_finalise(void); void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); void 
kvm_patch_vector_branch(struct alt_instr *alt, diff --git a/arch/arm64/include/asm/kvm_define_hypevents.h b/arch/arm64/include/asm/kvm_define_hypevents.h new file mode 100644 index 000000000000..77d6790252a6 --- /dev/null +++ b/arch/arm64/include/asm/kvm_define_hypevents.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define REMOTE_EVENT_INCLUDE_FILE arch/arm64/include/asm/kvm_hypevents.h + +#define REMOTE_EVENT_SECTION "_hyp_events" + +#define HE_STRUCT(__args) __args +#define HE_PRINTK(__args...) __args +#define he_field re_field + +#define HYP_EVENT(__name, __proto, __struct, __assign, __printk) \ + REMOTE_EVENT(__name, 0, RE_STRUCT(__struct), RE_PRINTK(__printk)) + +#define HYP_EVENT_MULTI_READ +#include +#undef HYP_EVENT_MULTI_READ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 70cb9cfd760a..851f6171751c 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -217,6 +217,10 @@ struct kvm_s2_mmu { */ bool nested_stage2_enabled; +#ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS + struct dentry *shadow_pt_debugfs_dentry; +#endif + /* * true when this MMU needs to be unmapped before being used for a new * purpose. @@ -247,7 +251,7 @@ struct kvm_smccc_features { unsigned long vendor_hyp_bmap_2; /* Function numbers 64-127 */ }; -typedef unsigned int pkvm_handle_t; +typedef u16 pkvm_handle_t; struct kvm_protected_vm { pkvm_handle_t handle; @@ -255,6 +259,13 @@ struct kvm_protected_vm { struct kvm_hyp_memcache stage2_teardown_mc; bool is_protected; bool is_created; + + /* + * True when the guest is being torn down. When in this state, the + * guest's vCPUs can't be loaded anymore, but its pages can be + * reclaimed by the host. + */ + bool is_dying; }; struct kvm_mpidr_data { @@ -287,6 +298,9 @@ enum fgt_group_id { HDFGRTR2_GROUP, HDFGWTR2_GROUP = HDFGRTR2_GROUP, HFGITR2_GROUP, + ICH_HFGRTR_GROUP, + ICH_HFGWTR_GROUP = ICH_HFGRTR_GROUP, + ICH_HFGITR_GROUP, /* Must be last */ __NR_FGT_GROUP_IDS__ @@ -405,6 +419,11 @@ struct kvm_arch { * the associated pKVM instance in the hypervisor. */ struct kvm_protected_vm pkvm; + +#ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS + /* Nested virtualization info */ + struct dentry *debugfs_nv_dentry; +#endif }; struct kvm_vcpu_fault_info { @@ -620,6 +639,10 @@ enum vcpu_sysreg { VNCR(ICH_HCR_EL2), VNCR(ICH_VMCR_EL2), + VNCR(ICH_HFGRTR_EL2), + VNCR(ICH_HFGWTR_EL2), + VNCR(ICH_HFGITR_EL2), + NR_SYS_REGS /* Nothing after this line! 
*/ }; @@ -675,6 +698,9 @@ extern struct fgt_masks hfgwtr2_masks; extern struct fgt_masks hfgitr2_masks; extern struct fgt_masks hdfgrtr2_masks; extern struct fgt_masks hdfgwtr2_masks; +extern struct fgt_masks ich_hfgrtr_masks; +extern struct fgt_masks ich_hfgwtr_masks; +extern struct fgt_masks ich_hfgitr_masks; extern struct fgt_masks kvm_nvhe_sym(hfgrtr_masks); extern struct fgt_masks kvm_nvhe_sym(hfgwtr_masks); @@ -687,6 +713,9 @@ extern struct fgt_masks kvm_nvhe_sym(hfgwtr2_masks); extern struct fgt_masks kvm_nvhe_sym(hfgitr2_masks); extern struct fgt_masks kvm_nvhe_sym(hdfgrtr2_masks); extern struct fgt_masks kvm_nvhe_sym(hdfgwtr2_masks); +extern struct fgt_masks kvm_nvhe_sym(ich_hfgrtr_masks); +extern struct fgt_masks kvm_nvhe_sym(ich_hfgwtr_masks); +extern struct fgt_masks kvm_nvhe_sym(ich_hfgitr_masks); struct kvm_cpu_context { struct user_pt_regs regs; /* sp = sp_el0 */ @@ -768,8 +797,10 @@ struct kvm_host_data { struct kvm_guest_debug_arch regs; /* Statistical profiling extension */ u64 pmscr_el1; + u64 pmblimitr_el1; /* Self-hosted trace */ u64 trfcr_el1; + u64 trblimitr_el1; /* Values of trap registers for the host before guest entry. */ u64 mdcr_el2; u64 brbcr_el1; @@ -787,6 +818,14 @@ struct kvm_host_data { /* Last vgic_irq part of the AP list recorded in an LR */ struct vgic_irq *last_lr_irq; + + /* PPI state tracking for GICv5-based guests */ + struct { + DECLARE_BITMAP(pendr, VGIC_V5_NR_PRIVATE_IRQS); + + /* The saved state of the regs when leaving the guest */ + DECLARE_BITMAP(activer_exit, VGIC_V5_NR_PRIVATE_IRQS); + } vgic_v5_ppi_state; }; struct kvm_host_psci_config { @@ -923,6 +962,9 @@ struct kvm_vcpu_arch { /* Per-vcpu TLB for VNCR_EL2 -- NULL when !NV */ struct vncr_tlb *vncr_tlb; + + /* Hyp-readable copy of kvm_vcpu::pid */ + pid_t pid; }; /* @@ -1659,6 +1701,11 @@ static __always_inline enum fgt_group_id __fgt_reg_to_group_id(enum vcpu_sysreg case HDFGRTR2_EL2: case HDFGWTR2_EL2: return HDFGRTR2_GROUP; + case ICH_HFGRTR_EL2: + case ICH_HFGWTR_EL2: + return ICH_HFGRTR_GROUP; + case ICH_HFGITR_EL2: + return ICH_HFGITR_GROUP; default: BUILD_BUG_ON(1); } @@ -1673,6 +1720,7 @@ static __always_inline enum fgt_group_id __fgt_reg_to_group_id(enum vcpu_sysreg case HDFGWTR_EL2: \ case HFGWTR2_EL2: \ case HDFGWTR2_EL2: \ + case ICH_HFGWTR_EL2: \ p = &(vcpu)->arch.fgt[id].w; \ break; \ default: \ diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 76ce2b94bd97..8d06b62e7188 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -87,6 +87,15 @@ void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if); int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu); +/* GICv5 */ +void __vgic_v5_save_apr(struct vgic_v5_cpu_if *cpu_if); +void __vgic_v5_restore_vmcr_apr(struct vgic_v5_cpu_if *cpu_if); +/* No hypercalls for the following */ +void __vgic_v5_save_ppi_state(struct vgic_v5_cpu_if *cpu_if); +void __vgic_v5_restore_ppi_state(struct vgic_v5_cpu_if *cpu_if); +void __vgic_v5_save_state(struct vgic_v5_cpu_if *cpu_if); +void __vgic_v5_restore_state(struct vgic_v5_cpu_if *cpu_if); + #ifdef __KVM_NVHE_HYPERVISOR__ void __timer_enable_traps(struct kvm_vcpu *vcpu); void __timer_disable_traps(struct kvm_vcpu *vcpu); @@ -129,13 +138,13 @@ void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr, #ifdef __KVM_NVHE_HYPERVISOR__ void __pkvm_init_switch_pgd(phys_addr_t pgd, unsigned long sp, void (*fn)(void)); -int __pkvm_init(phys_addr_t phys, 
unsigned long size, unsigned long nr_cpus, - unsigned long *per_cpu_base, u32 hyp_va_bits); +int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long *per_cpu_base, u32 hyp_va_bits); void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt); #endif extern u64 kvm_nvhe_sym(id_aa64pfr0_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64pfr1_el1_sys_val); +extern u64 kvm_nvhe_sym(id_aa64pfr2_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64isar0_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64isar1_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64isar2_el1_sys_val); @@ -147,5 +156,6 @@ extern u64 kvm_nvhe_sym(id_aa64smfr0_el1_sys_val); extern unsigned long kvm_nvhe_sym(__icache_flags); extern unsigned int kvm_nvhe_sym(kvm_arm_vmid_bits); extern unsigned int kvm_nvhe_sym(kvm_host_sve_max_vl); +extern unsigned long kvm_nvhe_sym(hyp_nr_cpus); #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/include/asm/kvm_hypevents.h b/arch/arm64/include/asm/kvm_hypevents.h new file mode 100644 index 000000000000..743c49bd878f --- /dev/null +++ b/arch/arm64/include/asm/kvm_hypevents.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#if !defined(__ARM64_KVM_HYPEVENTS_H_) || defined(HYP_EVENT_MULTI_READ) +#define __ARM64_KVM_HYPEVENTS_H_ + +#ifdef __KVM_NVHE_HYPERVISOR__ +#include +#endif + +#ifndef __HYP_ENTER_EXIT_REASON +#define __HYP_ENTER_EXIT_REASON +enum hyp_enter_exit_reason { + HYP_REASON_SMC, + HYP_REASON_HVC, + HYP_REASON_PSCI, + HYP_REASON_HOST_ABORT, + HYP_REASON_GUEST_EXIT, + HYP_REASON_ERET_HOST, + HYP_REASON_ERET_GUEST, + HYP_REASON_UNKNOWN /* Must be last */ +}; +#endif + +HYP_EVENT(hyp_enter, + HE_PROTO(struct kvm_cpu_context *host_ctxt, u8 reason), + HE_STRUCT( + he_field(u8, reason) + he_field(pid_t, vcpu) + ), + HE_ASSIGN( + __entry->reason = reason; + __entry->vcpu = __tracing_get_vcpu_pid(host_ctxt); + ), + HE_PRINTK("reason=%s vcpu=%d", __hyp_enter_exit_reason_str(__entry->reason), __entry->vcpu) +); + +HYP_EVENT(hyp_exit, + HE_PROTO(struct kvm_cpu_context *host_ctxt, u8 reason), + HE_STRUCT( + he_field(u8, reason) + he_field(pid_t, vcpu) + ), + HE_ASSIGN( + __entry->reason = reason; + __entry->vcpu = __tracing_get_vcpu_pid(host_ctxt); + ), + HE_PRINTK("reason=%s vcpu=%d", __hyp_enter_exit_reason_str(__entry->reason), __entry->vcpu) +); + +HYP_EVENT(selftest, + HE_PROTO(u64 id), + HE_STRUCT( + he_field(u64, id) + ), + HE_ASSIGN( + __entry->id = id; + ), + HE_PRINTK("id=%llu", __entry->id) +); +#endif diff --git a/arch/arm64/include/asm/kvm_hyptrace.h b/arch/arm64/include/asm/kvm_hyptrace.h new file mode 100644 index 000000000000..de133b735f72 --- /dev/null +++ b/arch/arm64/include/asm/kvm_hyptrace.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ARM64_KVM_HYPTRACE_H_ +#define __ARM64_KVM_HYPTRACE_H_ + +#include + +struct hyp_trace_desc { + unsigned long bpages_backing_start; + size_t bpages_backing_size; + struct trace_buffer_desc trace_buffer_desc; + +}; + +struct hyp_event_id { + unsigned short id; + atomic_t enabled; +}; + +extern struct remote_event __hyp_events_start[]; +extern struct remote_event __hyp_events_end[]; + +/* hyp_event section used by the hypervisor */ +extern struct hyp_event_id __hyp_event_ids_start[]; +extern struct hyp_event_id __hyp_event_ids_end[]; + +#endif diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index d968aca0461a..01e9c72d6aa7 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -393,8 +393,12 @@ static inline bool 
kvm_supports_cacheable_pfnmap(void) #ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS void kvm_s2_ptdump_create_debugfs(struct kvm *kvm); +void kvm_nested_s2_ptdump_create_debugfs(struct kvm_s2_mmu *mmu); +void kvm_nested_s2_ptdump_remove_debugfs(struct kvm_s2_mmu *mmu); #else static inline void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) {} +static inline void kvm_nested_s2_ptdump_create_debugfs(struct kvm_s2_mmu *mmu) {} +static inline void kvm_nested_s2_ptdump_remove_debugfs(struct kvm_s2_mmu *mmu) {} #endif /* CONFIG_PTDUMP_STAGE2_DEBUGFS */ #endif /* __ASSEMBLER__ */ diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index c201168f2857..41a8687938eb 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -99,14 +99,30 @@ typedef u64 kvm_pte_t; KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ KVM_PTE_LEAF_ATTR_HI_S2_XN) -#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) -#define KVM_MAX_OWNER_ID 1 +/* pKVM invalid pte encodings */ +#define KVM_INVALID_PTE_TYPE_MASK GENMASK(63, 60) +#define KVM_INVALID_PTE_ANNOT_MASK ~(KVM_PTE_VALID | \ + KVM_INVALID_PTE_TYPE_MASK) -/* - * Used to indicate a pte for which a 'break-before-make' sequence is in - * progress. - */ -#define KVM_INVALID_PTE_LOCKED BIT(10) +enum kvm_invalid_pte_type { + /* + * Used to indicate a pte for which a 'break-before-make' + * sequence is in progress. + */ + KVM_INVALID_PTE_TYPE_LOCKED = 1, + + /* + * pKVM has unmapped the page from the host due to a change of + * ownership. + */ + KVM_HOST_INVALID_PTE_TYPE_DONATION, + + /* + * The page has been forcefully reclaimed from the guest by the + * host. + */ + KVM_GUEST_INVALID_PTE_TYPE_POISONED, +}; static inline bool kvm_pte_valid(kvm_pte_t pte) { @@ -658,14 +674,18 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, void *mc, enum kvm_pgtable_walk_flags flags); /** - * kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to - * track ownership. + * kvm_pgtable_stage2_annotate() - Unmap and annotate pages in the IPA space + * to track ownership (and more). * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Base intermediate physical address to annotate. * @size: Size of the annotated range. * @mc: Cache of pre-allocated and zeroed memory from which to allocate * page-table pages. - * @owner_id: Unique identifier for the owner of the page. + * @type: The type of the annotation, determining its meaning and format. + * @annotation: A 59-bit value that will be stored in the page tables. + * @annotation[0] and @annotation[63:60] must be 0. + * @annotation[59:1] is stored in the page tables, along + * with @type. * * By default, all page-tables are owned by identifier 0. This function can be * used to mark portions of the IPA space as owned by other entities. When a @@ -674,8 +694,9 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, * * Return: 0 on success, negative error code on failure. */ -int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, - void *mc, u8 owner_id); +int kvm_pgtable_stage2_annotate(struct kvm_pgtable *pgt, u64 addr, u64 size, + void *mc, enum kvm_invalid_pte_type type, + kvm_pte_t annotation); /** * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table. 
diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 757076ad4ec9..2954b311128c 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -17,7 +17,7 @@ #define HYP_MEMBLOCK_REGIONS 128 -int pkvm_init_host_vm(struct kvm *kvm); +int pkvm_init_host_vm(struct kvm *kvm, unsigned long type); int pkvm_create_hyp_vm(struct kvm *kvm); bool pkvm_hyp_vm_is_created(struct kvm *kvm); void pkvm_destroy_hyp_vm(struct kvm *kvm); @@ -40,8 +40,6 @@ static inline bool kvm_pkvm_ext_allowed(struct kvm *kvm, long ext) case KVM_CAP_MAX_VCPU_ID: case KVM_CAP_MSI_DEVID: case KVM_CAP_ARM_VM_IPA_SIZE: - case KVM_CAP_ARM_PMU_V3: - case KVM_CAP_ARM_SVE: case KVM_CAP_ARM_PTRAUTH_ADDRESS: case KVM_CAP_ARM_PTRAUTH_GENERIC: return true; diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index f4436ecc630c..736561480f36 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1052,6 +1052,7 @@ #define GICV5_OP_GIC_CDPRI sys_insn(1, 0, 12, 1, 2) #define GICV5_OP_GIC_CDRCFG sys_insn(1, 0, 12, 1, 5) #define GICV5_OP_GICR_CDIA sys_insn(1, 0, 12, 3, 0) +#define GICV5_OP_GICR_CDNMIA sys_insn(1, 0, 12, 3, 1) /* Definitions for GIC CDAFF */ #define GICV5_GIC_CDAFF_IAFFID_MASK GENMASK_ULL(47, 32) @@ -1098,6 +1099,12 @@ #define GICV5_GIC_CDIA_TYPE_MASK GENMASK_ULL(31, 29) #define GICV5_GIC_CDIA_ID_MASK GENMASK_ULL(23, 0) +/* Definitions for GICR CDNMIA */ +#define GICV5_GICR_CDNMIA_VALID_MASK BIT_ULL(32) +#define GICV5_GICR_CDNMIA_VALID(r) FIELD_GET(GICV5_GICR_CDNMIA_VALID_MASK, r) +#define GICV5_GICR_CDNMIA_TYPE_MASK GENMASK_ULL(31, 29) +#define GICV5_GICR_CDNMIA_ID_MASK GENMASK_ULL(23, 0) + #define gicr_insn(insn) read_sysreg_s(GICV5_OP_GICR_##insn) #define gic_insn(v, insn) write_sysreg_s(v, GICV5_OP_GIC_##insn) @@ -1114,11 +1121,9 @@ .macro msr_hcr_el2, reg #if IS_ENABLED(CONFIG_AMPERE_ERRATUM_AC04_CPU_23) dsb nsh - msr hcr_el2, \reg - isb -#else - msr hcr_el2, \reg #endif + msr hcr_el2, \reg + isb // Required by AMPERE_ERRATUM_AC04_CPU_23 .endm #else diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index b51ab6840f9c..b546703c3ab9 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -94,6 +94,15 @@ static inline bool is_pkvm_initialized(void) static_branch_likely(&kvm_protected_mode_initialized); } +#ifdef CONFIG_KVM +bool pkvm_force_reclaim_guest_page(phys_addr_t phys); +#else +static inline bool pkvm_force_reclaim_guest_page(phys_addr_t phys) +{ + return false; +} +#endif + /* Reports the availability of HYP mode */ static inline bool is_hyp_mode_available(void) { diff --git a/arch/arm64/include/asm/vncr_mapping.h b/arch/arm64/include/asm/vncr_mapping.h index c2485a862e69..14366d35ce82 100644 --- a/arch/arm64/include/asm/vncr_mapping.h +++ b/arch/arm64/include/asm/vncr_mapping.h @@ -108,5 +108,8 @@ #define VNCR_MPAMVPM5_EL2 0x968 #define VNCR_MPAMVPM6_EL2 0x970 #define VNCR_MPAMVPM7_EL2 0x978 +#define VNCR_ICH_HFGITR_EL2 0xB10 +#define VNCR_ICH_HFGRTR_EL2 0xB18 +#define VNCR_ICH_HFGWTR_EL2 0xB20 #endif /* __ARM64_VNCR_MAPPING_H__ */ diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index a792a599b9d6..1c13bfa2d38a 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -428,6 +428,7 @@ enum { #define KVM_DEV_ARM_ITS_RESTORE_TABLES 2 #define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES 3 #define KVM_DEV_ARM_ITS_CTRL_RESET 4 +#define KVM_DEV_ARM_VGIC_USERSPACE_PPIS 5 /* Device Control API on 
vcpu fd */ #define KVM_ARM_VCPU_PMU_V3_CTRL 0 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 32c2dbcc0c64..1bfaa96881da 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -325,6 +325,7 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { static const struct arm64_ftr_bits ftr_id_aa64pfr2[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_FPMR_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_GCIE_SHIFT, 4, ID_AA64PFR2_EL1_GCIE_NI), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_MTEFAR_SHIFT, 4, ID_AA64PFR2_EL1_MTEFAR_NI), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_MTESTOREONLY_SHIFT, 4, ID_AA64PFR2_EL1_MTESTOREONLY_NI), ARM64_FTR_END, diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 085bc9972f6b..634ddc904244 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -103,7 +103,6 @@ SYM_CODE_START_LOCAL(__finalise_el2) // Engage the VHE magic! mov_q x0, HCR_HOST_VHE_FLAGS msr_hcr_el2 x0 - isb // Use the EL1 allocated stack, per-cpu offset mrs x0, sp_el1 diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index d7b0d12b1015..d4c7d45ae6bc 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -138,6 +138,10 @@ KVM_NVHE_ALIAS(__hyp_data_start); KVM_NVHE_ALIAS(__hyp_data_end); KVM_NVHE_ALIAS(__hyp_rodata_start); KVM_NVHE_ALIAS(__hyp_rodata_end); +#ifdef CONFIG_NVHE_EL2_TRACING +KVM_NVHE_ALIAS(__hyp_event_ids_start); +KVM_NVHE_ALIAS(__hyp_event_ids_end); +#endif /* pKVM static key */ KVM_NVHE_ALIAS(kvm_protected_mode_initialized); diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 2d1e75263f03..e1ac876200a3 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -13,12 +13,23 @@ *(__kvm_ex_table) \ __stop___kvm_ex_table = .; +#ifdef CONFIG_NVHE_EL2_TRACING +#define HYPERVISOR_EVENT_IDS \ + . = ALIGN(PAGE_SIZE); \ + __hyp_event_ids_start = .; \ + *(HYP_SECTION_NAME(.event_ids)) \ + __hyp_event_ids_end = .; +#else +#define HYPERVISOR_EVENT_IDS +#endif + #define HYPERVISOR_RODATA_SECTIONS \ HYP_SECTION_NAME(.rodata) : { \ . = ALIGN(PAGE_SIZE); \ __hyp_rodata_start = .; \ *(HYP_SECTION_NAME(.data..ro_after_init)) \ *(HYP_SECTION_NAME(.rodata)) \ + HYPERVISOR_EVENT_IDS \ . = ALIGN(PAGE_SIZE); \ __hyp_rodata_end = .; \ } @@ -308,6 +319,13 @@ SECTIONS HYPERVISOR_DATA_SECTION +#ifdef CONFIG_NVHE_EL2_TRACING + .data.hyp_events : { + __hyp_events_start = .; + *(SORT(_hyp_events.*)) + __hyp_events_end = .; + } +#endif /* * Data written with the MMU off but read with the MMU on requires * cache lines to be invalidated, discarding up to a Cache Writeback diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 7d1f22fd490b..449154f9a485 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -42,32 +42,10 @@ menuconfig KVM If unsure, say N. -config NVHE_EL2_DEBUG - bool "Debug mode for non-VHE EL2 object" - depends on KVM - help - Say Y here to enable the debug mode for the non-VHE KVM EL2 object. - Failure reports will BUG() in the hypervisor. This is intended for - local EL2 hypervisor development. - - If unsure, say N. 
- -config PROTECTED_NVHE_STACKTRACE - bool "Protected KVM hypervisor stacktraces" - depends on NVHE_EL2_DEBUG - default n - help - Say Y here to enable pKVM hypervisor stacktraces on hyp_panic() - - If using protected nVHE mode, but cannot afford the associated - memory cost (less than 0.75 page per CPU) of pKVM stacktraces, - say N. - - If unsure, or not using protected nVHE (pKVM), say N. +if KVM config PTDUMP_STAGE2_DEBUGFS bool "Present the stage-2 pagetables to debugfs" - depends on KVM depends on DEBUG_KERNEL depends on DEBUG_FS depends on ARCH_HAS_PTDUMP @@ -82,4 +60,48 @@ If in doubt, say N. +config NVHE_EL2_DEBUG + bool "Debug mode for non-VHE EL2 object" + default n + help + Say Y here to enable the debug mode for the non-VHE KVM EL2 object. + Failure reports will BUG() in the hypervisor. This is intended for + local EL2 hypervisor development. + + If unsure, say N. + +if NVHE_EL2_DEBUG + +config NVHE_EL2_TRACING + bool + depends on TRACING && FTRACE + select TRACE_REMOTE + default y + +config PKVM_DISABLE_STAGE2_ON_PANIC + bool "Disable the host stage-2 on panic" + default n + help + Relax the host stage-2 on hypervisor panic to allow the kernel to + unwind and symbolize the hypervisor stacktrace. This, however, + compromises system security. This is intended for local EL2 + hypervisor development. + + If unsure, say N. + +config PKVM_STACKTRACE + bool "Protected KVM hypervisor stacktraces" + depends on PKVM_DISABLE_STAGE2_ON_PANIC + default y + help + Say Y here to enable pKVM hypervisor stacktraces on hyp_panic(). + + If you are using protected nVHE mode but cannot afford the associated + memory cost (less than 0.75 page per CPU) of pKVM stacktraces, + say N. + + If unsure, or not using protected nVHE (pKVM), say N. + +endif # NVHE_EL2_DEBUG +endif # KVM endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 3ebc0570345c..59612d2f277c 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -30,6 +30,8 @@ kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o +kvm-$(CONFIG_NVHE_EL2_TRACING) += hyp_trace.o + always-y := hyp_constants.h hyp-constants.s define rule_gen_hyp_constants diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 600f250753b4..cbea4d9ee955 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -56,6 +56,12 @@ static struct irq_ops arch_timer_irq_ops = { .get_input_level = kvm_arch_timer_get_input_level, }; +static struct irq_ops arch_timer_irq_ops_vgic_v5 = { + .get_input_level = kvm_arch_timer_get_input_level, + .queue_irq_unlock = vgic_v5_ppi_queue_irq_unlock, + .set_direct_injection = vgic_v5_set_ppi_dvi, +}; + static int nr_timers(struct kvm_vcpu *vcpu) { if (!vcpu_has_nv(vcpu)) @@ -447,6 +453,17 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, if (userspace_irqchip(vcpu->kvm)) return; + /* Skip injecting on GICv5 for directly injected (DVI'd) timers */ + if (vgic_is_v5(vcpu->kvm)) { + struct timer_map map; + + get_timer_map(vcpu, &map); + + if (map.direct_ptimer == timer_ctx || + map.direct_vtimer == timer_ctx) + return; + } + kvm_vgic_inject_irq(vcpu->kvm, vcpu, timer_irq(timer_ctx), timer_ctx->irq.level, @@ -674,6 +691,7 @@ static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) phys_active = kvm_vgic_map_is_active(vcpu, timer_irq(ctx)); phys_active |= ctx->irq.level; + phys_active |= vgic_is_v5(vcpu->kvm); 
set_timer_irq_phys_active(ctx, phys_active); } @@ -740,13 +758,11 @@ static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu, ret = kvm_vgic_map_phys_irq(vcpu, map->direct_vtimer->host_timer_irq, - timer_irq(map->direct_vtimer), - &arch_timer_irq_ops); + timer_irq(map->direct_vtimer)); WARN_ON_ONCE(ret); ret = kvm_vgic_map_phys_irq(vcpu, map->direct_ptimer->host_timer_irq, - timer_irq(map->direct_ptimer), - &arch_timer_irq_ops); + timer_irq(map->direct_ptimer)); WARN_ON_ONCE(ret); } } @@ -864,7 +880,8 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) get_timer_map(vcpu, &map); if (static_branch_likely(&has_gic_active_state)) { - if (vcpu_has_nv(vcpu)) + /* We don't do NV on GICv5, yet */ + if (vcpu_has_nv(vcpu) && !vgic_is_v5(vcpu->kvm)) kvm_timer_vcpu_load_nested_switch(vcpu, &map); kvm_timer_vcpu_load_gic(map.direct_vtimer); @@ -934,6 +951,12 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) if (kvm_vcpu_is_blocking(vcpu)) kvm_timer_blocking(vcpu); + + if (vgic_is_v5(vcpu->kvm)) { + set_timer_irq_phys_active(map.direct_vtimer, false); + if (map.direct_ptimer) + set_timer_irq_phys_active(map.direct_ptimer, false); + } } void kvm_timer_sync_nested(struct kvm_vcpu *vcpu) @@ -1097,10 +1120,19 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) HRTIMER_MODE_ABS_HARD); } +/* + * This is always called during kvm_arch_init_vm, but will also be + * called from kvm_vgic_create if we have a vGICv5. + */ void kvm_timer_init_vm(struct kvm *kvm) { + /* + * Set up the default PPIs - note that we adjust them based on + * the model of the GIC as GICv5 uses a different way of + * describing interrupts. + */ for (int i = 0; i < NR_KVM_TIMERS; i++) - kvm->arch.timer_data.ppi[i] = default_ppi[i]; + kvm->arch.timer_data.ppi[i] = get_vgic_ppi(kvm, default_ppi[i]); } void kvm_timer_cpu_up(void) @@ -1269,7 +1301,15 @@ static int timer_irq_set_irqchip_state(struct irq_data *d, static void timer_irq_eoi(struct irq_data *d) { - if (!irqd_is_forwarded_to_vcpu(d)) + /* + * On a GICv5 host, we still need to call EOI on the parent for + * PPIs. The host driver already handles irqs which are forwarded to + * vcpus, and skips the GIC CDDI while still doing the GIC CDEOI. This + * is required to emulate the EOIMode=1 on GICv5 hardware. Failure to + * call EOI unsurprisingly results in *BAD* lock-ups. + */ + if (!irqd_is_forwarded_to_vcpu(d) || + kvm_vgic_global_state.type == VGIC_V5) irq_chip_eoi_parent(d); } @@ -1333,7 +1373,8 @@ static int kvm_irq_init(struct arch_timer_kvm_info *info) host_vtimer_irq = info->virtual_irq; kvm_irq_fixup_flags(host_vtimer_irq, &host_vtimer_irq_flags); - if (kvm_vgic_global_state.no_hw_deactivation) { + if (kvm_vgic_global_state.no_hw_deactivation || + kvm_vgic_global_state.type == VGIC_V5) { struct fwnode_handle *fwnode; struct irq_data *data; @@ -1351,7 +1392,8 @@ static int kvm_irq_init(struct arch_timer_kvm_info *info) return -ENOMEM; } - arch_timer_irq_ops.flags |= VGIC_IRQ_SW_RESAMPLE; + if (kvm_vgic_global_state.no_hw_deactivation) + arch_timer_irq_ops.flags |= VGIC_IRQ_SW_RESAMPLE; WARN_ON(irq_domain_push_irq(domain, host_vtimer_irq, (void *)TIMER_VTIMER)); } @@ -1501,11 +1543,18 @@ static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) if (kvm_vgic_set_owner(vcpu, irq, ctx)) break; + /* With GICv5, the default PPI is what you get -- nothing else */ + if (vgic_is_v5(vcpu->kvm) && irq != get_vgic_ppi(vcpu->kvm, default_ppi[i])) + break; + /* - * We know by construction that we only have PPIs, so - * all values are less than 32. 
+ * We know by construction that we only have PPIs, so all values + * are less than 32 for non-GICv5 VGICs. On GICv5, they are + * architecturally defined to be under 32 too. However, we mask + * off most of the bits as we might be presented with a GICv5 + * style PPI where the type is encoded in the top bits. */ - ppis |= BIT(irq); + ppis |= BIT(irq & 0x1f); } valid = hweight32(ppis) == nr_timers(vcpu); @@ -1543,6 +1592,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct timer_map map; + struct irq_ops *ops; int ret; if (timer->enabled) @@ -1563,20 +1613,22 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) get_timer_map(vcpu, &map); + ops = vgic_is_v5(vcpu->kvm) ? &arch_timer_irq_ops_vgic_v5 : + &arch_timer_irq_ops; + + for (int i = 0; i < nr_timers(vcpu); i++) + kvm_vgic_set_irq_ops(vcpu, timer_irq(vcpu_get_timer(vcpu, i)), ops); + ret = kvm_vgic_map_phys_irq(vcpu, map.direct_vtimer->host_timer_irq, - timer_irq(map.direct_vtimer), - &arch_timer_irq_ops); + timer_irq(map.direct_vtimer)); if (ret) return ret; - if (map.direct_ptimer) { + if (map.direct_ptimer) ret = kvm_vgic_map_phys_irq(vcpu, map.direct_ptimer->host_timer_irq, - timer_irq(map.direct_ptimer), - &arch_timer_irq_ops); - } - + timer_irq(map.direct_ptimer)); if (ret) return ret; @@ -1603,15 +1655,14 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) if (get_user(irq, uaddr)) return -EFAULT; - if (!(irq_is_ppi(irq))) + if (!(irq_is_ppi(vcpu->kvm, irq))) return -EINVAL; - mutex_lock(&vcpu->kvm->arch.config_lock); + guard(mutex)(&vcpu->kvm->arch.config_lock); if (test_bit(KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE, &vcpu->kvm->arch.flags)) { - ret = -EBUSY; - goto out; + return -EBUSY; } switch (attr->attr) { @@ -1628,8 +1679,7 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) idx = TIMER_HPTIMER; break; default: - ret = -ENXIO; - goto out; + return -ENXIO; } /* @@ -1639,8 +1689,6 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) */ vcpu->kvm->arch.timer_data.ppi[idx] = irq; -out: - mutex_unlock(&vcpu->kvm->arch.config_lock); return ret; } diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 410ffd41fd73..176cbe8baad3 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -24,6 +24,7 @@ #define CREATE_TRACE_POINTS #include "trace_arm.h" +#include "hyp_trace.h" #include #include @@ -35,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +47,9 @@ #include #include #include +#include + +#include #include "sys_regs.h" @@ -203,6 +208,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int ret; + if (type & ~KVM_VM_TYPE_ARM_MASK) + return -EINVAL; + mutex_init(&kvm->arch.config_lock); #ifdef CONFIG_LOCKDEP @@ -234,9 +242,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) * If any failures occur after this is successful, make sure to * call __pkvm_unreserve_vm to unreserve the VM in hyp. 
*/ - ret = pkvm_init_host_vm(kvm); + ret = pkvm_init_host_vm(kvm, type); if (ret) - goto err_free_cpumask; + goto err_uninit_mmu; + } else if (type & KVM_VM_TYPE_ARM_PROTECTED) { + ret = -EINVAL; + goto err_uninit_mmu; } kvm_vgic_early_init(kvm); @@ -252,6 +263,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return 0; +err_uninit_mmu: + kvm_uninit_stage2_mmu(kvm); err_free_cpumask: free_cpumask_var(kvm->arch.supported_cpus); err_unshare_kvm: @@ -301,6 +314,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) if (is_protected_kvm_enabled()) pkvm_destroy_hyp_vm(kvm); + kvm_uninit_stage2_mmu(kvm); kvm_destroy_mpidr_data(kvm); kfree(kvm->arch.sysreg_masks); @@ -613,6 +627,9 @@ static bool kvm_vcpu_should_clear_twi(struct kvm_vcpu *vcpu) if (unlikely(kvm_wfi_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK)) return kvm_wfi_trap_policy == KVM_WFX_NOTRAP; + if (vgic_is_v5(vcpu->kvm)) + return single_task_running(); + return single_task_running() && vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 && (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) || @@ -705,6 +722,8 @@ nommu: if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus)) vcpu_set_on_unsupported_cpu(vcpu); + + vcpu->arch.pid = pid_nr(vcpu->pid); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -934,6 +953,10 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) return ret; } + ret = vgic_v5_finalize_ppi_state(kvm); + if (ret) + return ret; + if (is_protected_kvm_enabled()) { ret = pkvm_create_hyp_vm(kvm); if (ret) @@ -1439,10 +1462,11 @@ static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level) int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, bool line_status) { - u32 irq = irq_level->irq; unsigned int irq_type, vcpu_id, irq_num; struct kvm_vcpu *vcpu = NULL; bool level = irq_level->level; + u32 irq = irq_level->irq; + unsigned long *mask; irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK; vcpu_id = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK; @@ -1472,16 +1496,37 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, if (!vcpu) return -EINVAL; - if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS) + if (vgic_is_v5(kvm)) { + if (irq_num >= VGIC_V5_NR_PRIVATE_IRQS) + return -EINVAL; + + /* + * Only allow PPIs that are explicitly exposed to + * userspace to be driven via KVM_IRQ_LINE + */ + mask = kvm->arch.vgic.gicv5_vm.userspace_ppis; + if (!test_bit(irq_num, mask)) + return -EINVAL; + + /* Build a GICv5-style IntID here */ + irq_num = vgic_v5_make_ppi(irq_num); + } else if (irq_num < VGIC_NR_SGIS || + irq_num >= VGIC_NR_PRIVATE_IRQS) { return -EINVAL; + } return kvm_vgic_inject_irq(kvm, vcpu, irq_num, level, NULL); case KVM_ARM_IRQ_TYPE_SPI: if (!irqchip_in_kernel(kvm)) return -ENXIO; - if (irq_num < VGIC_NR_PRIVATE_IRQS) - return -EINVAL; + if (vgic_is_v5(kvm)) { + /* Build a GICv5-style IntID here */ + irq_num = vgic_v5_make_spi(irq_num); + } else { + if (irq_num < VGIC_NR_PRIVATE_IRQS) + return -EINVAL; + } return kvm_vgic_inject_irq(kvm, NULL, irq_num, level, NULL); } @@ -2414,6 +2459,10 @@ static int __init init_subsystems(void) kvm_register_perf_callbacks(); + err = kvm_hyp_trace_init(); + if (err) + kvm_err("Failed to initialize Hyp tracing\n"); + out: if (err) hyp_cpu_pm_exit(); @@ -2465,7 +2514,7 @@ static int __init do_pkvm_init(u32 hyp_va_bits) preempt_disable(); cpu_hyp_init_context(); ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size, - num_possible_cpus(), 
kern_hyp_va(per_cpu_base), + kern_hyp_va(per_cpu_base), hyp_va_bits); cpu_hyp_init_features(); @@ -2507,6 +2556,7 @@ static void kvm_hyp_init_symbols(void) { kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = get_hyp_id_aa64pfr0_el1(); kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); + kvm_nvhe_sym(id_aa64pfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR2_EL1); kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1); kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1); kvm_nvhe_sym(id_aa64isar2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1); @@ -2529,6 +2579,9 @@ static void kvm_hyp_init_symbols(void) kvm_nvhe_sym(hfgitr2_masks) = hfgitr2_masks; kvm_nvhe_sym(hdfgrtr2_masks)= hdfgrtr2_masks; kvm_nvhe_sym(hdfgwtr2_masks)= hdfgwtr2_masks; + kvm_nvhe_sym(ich_hfgrtr_masks) = ich_hfgrtr_masks; + kvm_nvhe_sym(ich_hfgwtr_masks) = ich_hfgwtr_masks; + kvm_nvhe_sym(ich_hfgitr_masks) = ich_hfgitr_masks; /* * Flush entire BSS since part of its data containing init symbols is read @@ -2674,6 +2727,8 @@ static int __init init_hyp_mode(void) kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr; } + kvm_nvhe_sym(hyp_nr_cpus) = num_possible_cpus(); + /* * Map the Hyp-code called directly from the host */ diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c index d9f553cbf9df..f35b8dddd7c1 100644 --- a/arch/arm64/kvm/config.c +++ b/arch/arm64/kvm/config.c @@ -225,6 +225,7 @@ struct reg_feat_map_desc { #define FEAT_MTPMU ID_AA64DFR0_EL1, MTPMU, IMP #define FEAT_HCX ID_AA64MMFR1_EL1, HCX, IMP #define FEAT_S2PIE ID_AA64MMFR3_EL1, S2PIE, IMP +#define FEAT_GCIE ID_AA64PFR2_EL1, GCIE, IMP static bool not_feat_aa64el3(struct kvm *kvm) { @@ -1277,6 +1278,58 @@ static const struct reg_bits_to_feat_map vtcr_el2_feat_map[] = { static const DECLARE_FEAT_MAP(vtcr_el2_desc, VTCR_EL2, vtcr_el2_feat_map, FEAT_AA64EL2); +static const struct reg_bits_to_feat_map ich_hfgrtr_feat_map[] = { + NEEDS_FEAT(ICH_HFGRTR_EL2_ICC_APR_EL1 | + ICH_HFGRTR_EL2_ICC_IDRn_EL1 | + ICH_HFGRTR_EL2_ICC_CR0_EL1 | + ICH_HFGRTR_EL2_ICC_HPPIR_EL1 | + ICH_HFGRTR_EL2_ICC_PCR_EL1 | + ICH_HFGRTR_EL2_ICC_ICSR_EL1 | + ICH_HFGRTR_EL2_ICC_IAFFIDR_EL1 | + ICH_HFGRTR_EL2_ICC_PPI_HMRn_EL1 | + ICH_HFGRTR_EL2_ICC_PPI_ENABLERn_EL1 | + ICH_HFGRTR_EL2_ICC_PPI_PENDRn_EL1 | + ICH_HFGRTR_EL2_ICC_PPI_PRIORITYRn_EL1 | + ICH_HFGRTR_EL2_ICC_PPI_ACTIVERn_EL1, + FEAT_GCIE), +}; + +static const DECLARE_FEAT_MAP_FGT(ich_hfgrtr_desc, ich_hfgrtr_masks, + ich_hfgrtr_feat_map, FEAT_GCIE); + +static const struct reg_bits_to_feat_map ich_hfgwtr_feat_map[] = { + NEEDS_FEAT(ICH_HFGWTR_EL2_ICC_APR_EL1 | + ICH_HFGWTR_EL2_ICC_CR0_EL1 | + ICH_HFGWTR_EL2_ICC_PCR_EL1 | + ICH_HFGWTR_EL2_ICC_ICSR_EL1 | + ICH_HFGWTR_EL2_ICC_PPI_ENABLERn_EL1 | + ICH_HFGWTR_EL2_ICC_PPI_PENDRn_EL1 | + ICH_HFGWTR_EL2_ICC_PPI_PRIORITYRn_EL1 | + ICH_HFGWTR_EL2_ICC_PPI_ACTIVERn_EL1, + FEAT_GCIE), +}; + +static const DECLARE_FEAT_MAP_FGT(ich_hfgwtr_desc, ich_hfgwtr_masks, + ich_hfgwtr_feat_map, FEAT_GCIE); + +static const struct reg_bits_to_feat_map ich_hfgitr_feat_map[] = { + NEEDS_FEAT(ICH_HFGITR_EL2_GICCDEN | + ICH_HFGITR_EL2_GICCDDIS | + ICH_HFGITR_EL2_GICCDPRI | + ICH_HFGITR_EL2_GICCDAFF | + ICH_HFGITR_EL2_GICCDPEND | + ICH_HFGITR_EL2_GICCDRCFG | + ICH_HFGITR_EL2_GICCDHM | + ICH_HFGITR_EL2_GICCDEOI | + ICH_HFGITR_EL2_GICCDDI | + ICH_HFGITR_EL2_GICRCDIA | + ICH_HFGITR_EL2_GICRCDNMIA, + FEAT_GCIE), +}; + +static const DECLARE_FEAT_MAP_FGT(ich_hfgitr_desc, ich_hfgitr_masks, + 
ich_hfgitr_feat_map, FEAT_GCIE); + static void __init check_feat_map(const struct reg_bits_to_feat_map *map, int map_size, u64 resx, const char *str) { @@ -1328,6 +1381,9 @@ void __init check_feature_map(void) check_reg_desc(&sctlr_el2_desc); check_reg_desc(&mdcr_el2_desc); check_reg_desc(&vtcr_el2_desc); + check_reg_desc(&ich_hfgrtr_desc); + check_reg_desc(&ich_hfgwtr_desc); + check_reg_desc(&ich_hfgitr_desc); } static bool idreg_feat_match(struct kvm *kvm, const struct reg_bits_to_feat_map *map) @@ -1460,6 +1516,13 @@ void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt) val |= compute_fgu_bits(kvm, &hdfgrtr2_desc); val |= compute_fgu_bits(kvm, &hdfgwtr2_desc); break; + case ICH_HFGRTR_GROUP: + val |= compute_fgu_bits(kvm, &ich_hfgrtr_desc); + val |= compute_fgu_bits(kvm, &ich_hfgwtr_desc); + break; + case ICH_HFGITR_GROUP: + val |= compute_fgu_bits(kvm, &ich_hfgitr_desc); + break; default: BUG(); } @@ -1531,6 +1594,15 @@ struct resx get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg) case VTCR_EL2: resx = compute_reg_resx_bits(kvm, &vtcr_el2_desc, 0, 0); break; + case ICH_HFGRTR_EL2: + resx = compute_reg_resx_bits(kvm, &ich_hfgrtr_desc, 0, 0); + break; + case ICH_HFGWTR_EL2: + resx = compute_reg_resx_bits(kvm, &ich_hfgwtr_desc, 0, 0); + break; + case ICH_HFGITR_EL2: + resx = compute_reg_resx_bits(kvm, &ich_hfgitr_desc, 0, 0); + break; default: WARN_ON_ONCE(1); resx = (typeof(resx)){}; @@ -1565,6 +1637,12 @@ static __always_inline struct fgt_masks *__fgt_reg_to_masks(enum vcpu_sysreg reg return &hdfgrtr2_masks; case HDFGWTR2_EL2: return &hdfgwtr2_masks; + case ICH_HFGRTR_EL2: + return &ich_hfgrtr_masks; + case ICH_HFGWTR_EL2: + return &ich_hfgwtr_masks; + case ICH_HFGITR_EL2: + return &ich_hfgitr_masks; default: BUILD_BUG_ON(1); } @@ -1585,8 +1663,8 @@ static __always_inline void __compute_fgt(struct kvm_vcpu *vcpu, enum vcpu_sysre clear |= ~nested & m->nmask; } - val |= set; - val &= ~clear; + val |= set | m->res1; + val &= ~(clear | m->res0); *vcpu_fgt(vcpu, reg) = val; } @@ -1606,6 +1684,32 @@ static void __compute_hdfgwtr(struct kvm_vcpu *vcpu) *vcpu_fgt(vcpu, HDFGWTR_EL2) |= HDFGWTR_EL2_MDSCR_EL1; } +static void __compute_ich_hfgrtr(struct kvm_vcpu *vcpu) +{ + __compute_fgt(vcpu, ICH_HFGRTR_EL2); + + /* + * ICC_IAFFIDR_EL1 *always* needs to be trapped when running a guest. + * + * We also trap accesses to ICC_IDR0_EL1 to allow us to completely hide + * FEAT_GCIE_LEGACY from the guest, and to (potentially) present fewer + * ID bits than the host supports. + */ + *vcpu_fgt(vcpu, ICH_HFGRTR_EL2) &= ~(ICH_HFGRTR_EL2_ICC_IAFFIDR_EL1 | + ICH_HFGRTR_EL2_ICC_IDRn_EL1); +} + +static void __compute_ich_hfgwtr(struct kvm_vcpu *vcpu) +{ + __compute_fgt(vcpu, ICH_HFGWTR_EL2); + + /* + * We present a different subset of PPIs to the guest from what + * exists in real hardware. We only trap writes, not reads. 
+ */ + *vcpu_fgt(vcpu, ICH_HFGWTR_EL2) &= ~(ICH_HFGWTR_EL2_ICC_PPI_ENABLERn_EL1); +} + void kvm_vcpu_load_fgt(struct kvm_vcpu *vcpu) { if (!cpus_have_final_cap(ARM64_HAS_FGT)) @@ -1618,12 +1722,17 @@ void kvm_vcpu_load_fgt(struct kvm_vcpu *vcpu) __compute_hdfgwtr(vcpu); __compute_fgt(vcpu, HAFGRTR_EL2); - if (!cpus_have_final_cap(ARM64_HAS_FGT2)) - return; + if (cpus_have_final_cap(ARM64_HAS_FGT2)) { + __compute_fgt(vcpu, HFGRTR2_EL2); + __compute_fgt(vcpu, HFGWTR2_EL2); + __compute_fgt(vcpu, HFGITR2_EL2); + __compute_fgt(vcpu, HDFGRTR2_EL2); + __compute_fgt(vcpu, HDFGWTR2_EL2); + } - __compute_fgt(vcpu, HFGRTR2_EL2); - __compute_fgt(vcpu, HFGWTR2_EL2); - __compute_fgt(vcpu, HFGITR2_EL2); - __compute_fgt(vcpu, HDFGRTR2_EL2); - __compute_fgt(vcpu, HDFGWTR2_EL2); + if (cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) { + __compute_ich_hfgrtr(vcpu); + __compute_ich_hfgwtr(vcpu); + __compute_fgt(vcpu, ICH_HFGITR_EL2); + } } diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 22d497554c94..dba7ced74ca5 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -2053,6 +2053,60 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = { SR_FGT(SYS_AMEVCNTR0_EL0(2), HAFGRTR, AMEVCNTR02_EL0, 1), SR_FGT(SYS_AMEVCNTR0_EL0(1), HAFGRTR, AMEVCNTR01_EL0, 1), SR_FGT(SYS_AMEVCNTR0_EL0(0), HAFGRTR, AMEVCNTR00_EL0, 1), + + /* + * ICH_HFGRTR_EL2 & ICH_HFGWTR_EL2 + */ + SR_FGT(SYS_ICC_APR_EL1, ICH_HFGRTR, ICC_APR_EL1, 0), + SR_FGT(SYS_ICC_IDR0_EL1, ICH_HFGRTR, ICC_IDRn_EL1, 0), + SR_FGT(SYS_ICC_CR0_EL1, ICH_HFGRTR, ICC_CR0_EL1, 0), + SR_FGT(SYS_ICC_HPPIR_EL1, ICH_HFGRTR, ICC_HPPIR_EL1, 0), + SR_FGT(SYS_ICC_PCR_EL1, ICH_HFGRTR, ICC_PCR_EL1, 0), + SR_FGT(SYS_ICC_ICSR_EL1, ICH_HFGRTR, ICC_ICSR_EL1, 0), + SR_FGT(SYS_ICC_IAFFIDR_EL1, ICH_HFGRTR, ICC_IAFFIDR_EL1, 0), + SR_FGT(SYS_ICC_PPI_HMR0_EL1, ICH_HFGRTR, ICC_PPI_HMRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_HMR1_EL1, ICH_HFGRTR, ICC_PPI_HMRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_ENABLER0_EL1, ICH_HFGRTR, ICC_PPI_ENABLERn_EL1, 0), + SR_FGT(SYS_ICC_PPI_ENABLER1_EL1, ICH_HFGRTR, ICC_PPI_ENABLERn_EL1, 0), + SR_FGT(SYS_ICC_PPI_CPENDR0_EL1, ICH_HFGRTR, ICC_PPI_PENDRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_CPENDR1_EL1, ICH_HFGRTR, ICC_PPI_PENDRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_SPENDR0_EL1, ICH_HFGRTR, ICC_PPI_PENDRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_SPENDR1_EL1, ICH_HFGRTR, ICC_PPI_PENDRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_PRIORITYR0_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_PRIORITYR1_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_PRIORITYR2_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_PRIORITYR3_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_PRIORITYR4_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_PRIORITYR5_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_PRIORITYR6_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_PRIORITYR7_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_PRIORITYR8_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_PRIORITYR9_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_PRIORITYR10_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_PRIORITYR11_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_PRIORITYR12_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_PRIORITYR13_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_PRIORITYR14_EL1, ICH_HFGRTR, 
ICC_PPI_PRIORITYRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_PRIORITYR15_EL1, ICH_HFGRTR, ICC_PPI_PRIORITYRn_EL1, 0), + SR_FGT(SYS_ICC_PPI_CACTIVER0_EL1, ICH_HFGRTR, ICC_PPI_ACTIVERn_EL1, 0), + SR_FGT(SYS_ICC_PPI_CACTIVER1_EL1, ICH_HFGRTR, ICC_PPI_ACTIVERn_EL1, 0), + SR_FGT(SYS_ICC_PPI_SACTIVER0_EL1, ICH_HFGRTR, ICC_PPI_ACTIVERn_EL1, 0), + SR_FGT(SYS_ICC_PPI_SACTIVER1_EL1, ICH_HFGRTR, ICC_PPI_ACTIVERn_EL1, 0), + + /* + * ICH_HFGITR_EL2 + */ + SR_FGT(GICV5_OP_GIC_CDEN, ICH_HFGITR, GICCDEN, 0), + SR_FGT(GICV5_OP_GIC_CDDIS, ICH_HFGITR, GICCDDIS, 0), + SR_FGT(GICV5_OP_GIC_CDPRI, ICH_HFGITR, GICCDPRI, 0), + SR_FGT(GICV5_OP_GIC_CDAFF, ICH_HFGITR, GICCDAFF, 0), + SR_FGT(GICV5_OP_GIC_CDPEND, ICH_HFGITR, GICCDPEND, 0), + SR_FGT(GICV5_OP_GIC_CDRCFG, ICH_HFGITR, GICCDRCFG, 0), + SR_FGT(GICV5_OP_GIC_CDHM, ICH_HFGITR, GICCDHM, 0), + SR_FGT(GICV5_OP_GIC_CDEOI, ICH_HFGITR, GICCDEOI, 0), + SR_FGT(GICV5_OP_GIC_CDDI, ICH_HFGITR, GICCDDI, 0), + SR_FGT(GICV5_OP_GICR_CDIA, ICH_HFGITR, GICRCDIA, 0), + SR_FGT(GICV5_OP_GICR_CDNMIA, ICH_HFGITR, GICRCDNMIA, 0), }; /* @@ -2127,6 +2181,9 @@ FGT_MASKS(hfgwtr2_masks, HFGWTR2_EL2); FGT_MASKS(hfgitr2_masks, HFGITR2_EL2); FGT_MASKS(hdfgrtr2_masks, HDFGRTR2_EL2); FGT_MASKS(hdfgwtr2_masks, HDFGWTR2_EL2); +FGT_MASKS(ich_hfgrtr_masks, ICH_HFGRTR_EL2); +FGT_MASKS(ich_hfgwtr_masks, ICH_HFGWTR_EL2); +FGT_MASKS(ich_hfgitr_masks, ICH_HFGITR_EL2); static __init bool aggregate_fgt(union trap_config tc) { @@ -2162,6 +2219,14 @@ static __init bool aggregate_fgt(union trap_config tc) rmasks = &hfgitr2_masks; wmasks = NULL; break; + case ICH_HFGRTR_GROUP: + rmasks = &ich_hfgrtr_masks; + wmasks = &ich_hfgwtr_masks; + break; + case ICH_HFGITR_GROUP: + rmasks = &ich_hfgitr_masks; + wmasks = NULL; + break; } rresx = rmasks->res0 | rmasks->res1; @@ -2232,6 +2297,9 @@ static __init int check_all_fgt_masks(int ret) &hfgitr2_masks, &hdfgrtr2_masks, &hdfgwtr2_masks, + &ich_hfgrtr_masks, + &ich_hfgwtr_masks, + &ich_hfgitr_masks, }; int err = 0; diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index cc7d5d1709cb..54aedf93c78b 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -539,7 +539,7 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, /* All hyp bugs, including warnings, are treated as fatal. 
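/*
 * Illustration only, not part of the patch: each SR_FGT() entry above
 * ties a system-register encoding to an FGT register, a bit within it,
 * and a trap polarity (the trailing 0/1). A minimal stand-alone model
 * of that association, with hypothetical field names and a linear
 * search in place of the kernel's trap-configuration lookup:
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct fgt_map {
	uint32_t encoding;	/* packed op0/op1/CRn/CRm/op2 */
	unsigned int group;	/* which FGT register the bit lives in */
	uint64_t bit;		/* the trap-control bit in that register */
	bool pol;		/* polarity: does a set bit trap? */
};

static const struct fgt_map *fgt_lookup(const struct fgt_map *tbl,
					size_t n, uint32_t enc)
{
	for (size_t i = 0; i < n; i++) {
		if (tbl[i].encoding == enc)
			return &tbl[i];
	}
	return NULL;	/* register has no fine-grained trap bit */
}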
*/ if (!is_protected_kvm_enabled() || - IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { + IS_ENABLED(CONFIG_PKVM_DISABLE_STAGE2_ON_PANIC)) { struct bug_entry *bug = find_bug(elr_in_kimg); if (bug) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 2597e8bda867..ae04fd680d1e 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -233,6 +233,18 @@ static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu) __activate_fgt(hctxt, vcpu, HDFGWTR2_EL2); } +static inline void __activate_traps_ich_hfgxtr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); + + if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) + return; + + __activate_fgt(hctxt, vcpu, ICH_HFGRTR_EL2); + __activate_fgt(hctxt, vcpu, ICH_HFGWTR_EL2); + __activate_fgt(hctxt, vcpu, ICH_HFGITR_EL2); +} + #define __deactivate_fgt(htcxt, vcpu, reg) \ do { \ write_sysreg_s(ctxt_sys_reg(hctxt, reg), \ @@ -265,6 +277,19 @@ static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) __deactivate_fgt(hctxt, vcpu, HDFGWTR2_EL2); } +static inline void __deactivate_traps_ich_hfgxtr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); + + if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) + return; + + __deactivate_fgt(hctxt, vcpu, ICH_HFGRTR_EL2); + __deactivate_fgt(hctxt, vcpu, ICH_HFGWTR_EL2); + __deactivate_fgt(hctxt, vcpu, ICH_HFGITR_EL2); + +} + static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) { u64 r = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1; @@ -328,6 +353,7 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) } __activate_traps_hfgxtr(vcpu); + __activate_traps_ich_hfgxtr(vcpu); __activate_traps_mpam(vcpu); } @@ -345,6 +371,7 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) write_sysreg_s(ctxt_sys_reg(hctxt, HCRX_EL2), SYS_HCRX_EL2); __deactivate_traps_hfgxtr(vcpu); + __deactivate_traps_ich_hfgxtr(vcpu); __deactivate_traps_mpam(); } diff --git a/arch/arm64/kvm/hyp/include/nvhe/arm-smccc.h b/arch/arm64/kvm/hyp/include/nvhe/arm-smccc.h new file mode 100644 index 000000000000..1258bc84477f --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/arm-smccc.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ARM64_KVM_HYP_NVHE_ARM_SMCCC_H__ +#define __ARM64_KVM_HYP_NVHE_ARM_SMCCC_H__ + +#include + +#include + +#define hyp_smccc_1_1_smc(...) \ + do { \ + trace_hyp_exit(NULL, HYP_REASON_SMC); \ + arm_smccc_1_1_smc(__VA_ARGS__); \ + trace_hyp_enter(NULL, HYP_REASON_SMC); \ + } while (0) + +#define hyp_smccc_1_2_smc(...) 
\ + do { \ + trace_hyp_exit(NULL, HYP_REASON_SMC); \ + arm_smccc_1_2_smc(__VA_ARGS__); \ + trace_hyp_enter(NULL, HYP_REASON_SMC); \ + } while (0) + +#endif /* __ARM64_KVM_HYP_NVHE_ARM_SMCCC_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/clock.h b/arch/arm64/kvm/hyp/include/nvhe/clock.h new file mode 100644 index 000000000000..9f429f5c0664 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/clock.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARM64_KVM_HYP_NVHE_CLOCK_H +#define __ARM64_KVM_HYP_NVHE_CLOCK_H +#include + +#include + +#ifdef CONFIG_NVHE_EL2_TRACING +void trace_clock_update(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc); +u64 trace_clock(void); +#else +static inline void +trace_clock_update(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc) { } +static inline u64 trace_clock(void) { return 0; } +#endif +#endif diff --git a/arch/arm64/kvm/hyp/include/nvhe/define_events.h b/arch/arm64/kvm/hyp/include/nvhe/define_events.h new file mode 100644 index 000000000000..776d4c6cb702 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/define_events.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#undef HYP_EVENT +#define HYP_EVENT(__name, __proto, __struct, __assign, __printk) \ + struct hyp_event_id hyp_event_id_##__name \ + __section(".hyp.event_ids."#__name) = { \ + .enabled = ATOMIC_INIT(0), \ + } + +#define HYP_EVENT_MULTI_READ +#include +#undef HYP_EVENT_MULTI_READ + +#undef HYP_EVENT diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 5f9d56754e39..3cbfae0e3dda 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -27,18 +27,22 @@ extern struct host_mmu host_mmu; enum pkvm_component_id { PKVM_ID_HOST, PKVM_ID_HYP, - PKVM_ID_FFA, + PKVM_ID_GUEST, }; -extern unsigned long hyp_nr_cpus; - int __pkvm_prot_finalize(void); int __pkvm_host_share_hyp(u64 pfn); +int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn); +int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn); int __pkvm_host_unshare_hyp(u64 pfn); int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages); int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages); +int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu); +int __pkvm_vcpu_in_poison_fault(struct pkvm_hyp_vcpu *hyp_vcpu); +int __pkvm_host_force_reclaim_page_guest(phys_addr_t phys); +int __pkvm_host_reclaim_page_guest(u64 gfn, struct pkvm_hyp_vm *vm); int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot); int __pkvm_host_unshare_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *hyp_vm); @@ -70,6 +74,8 @@ static __always_inline void __load_host_stage2(void) #ifdef CONFIG_NVHE_EL2_DEBUG void pkvm_ownership_selftest(void *base); +struct pkvm_hyp_vcpu *init_selftest_vm(void *virt); +void teardown_selftest_vm(void); #else static inline void pkvm_ownership_selftest(void *base) { } #endif diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h index dee1a406b0c2..b50712d47f6d 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -30,8 +30,14 @@ enum pkvm_page_state { * struct hyp_page. 
*/ PKVM_NOPAGE = BIT(0) | BIT(1), + + /* + * 'Meta-states' which aren't encoded directly in the PTE's SW bits (or + * the hyp_vmemmap entry for the host) + */ + PKVM_POISON = BIT(2), }; -#define PKVM_PAGE_STATE_MASK (BIT(0) | BIT(1)) +#define PKVM_PAGE_STATE_VMEMMAP_MASK (BIT(0) | BIT(1)) #define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot, @@ -108,12 +114,12 @@ static inline void set_host_state(struct hyp_page *p, enum pkvm_page_state state static inline enum pkvm_page_state get_hyp_state(struct hyp_page *p) { - return p->__hyp_state_comp ^ PKVM_PAGE_STATE_MASK; + return p->__hyp_state_comp ^ PKVM_PAGE_STATE_VMEMMAP_MASK; } static inline void set_hyp_state(struct hyp_page *p, enum pkvm_page_state state) { - p->__hyp_state_comp = state ^ PKVM_PAGE_STATE_MASK; + p->__hyp_state_comp = state ^ PKVM_PAGE_STATE_VMEMMAP_MASK; } /* diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 184ad7a39950..c904647d2f76 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -73,8 +73,12 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, unsigned long pgd_hva); int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, unsigned long vcpu_hva); -int __pkvm_teardown_vm(pkvm_handle_t handle); +int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 gfn); +int __pkvm_start_teardown_vm(pkvm_handle_t handle); +int __pkvm_finalize_teardown_vm(pkvm_handle_t handle); + +struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle); struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, unsigned int vcpu_idx); void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); @@ -84,6 +88,7 @@ struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle); struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle); void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm); +bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code); bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/hyp/include/nvhe/trace.h b/arch/arm64/kvm/hyp/include/nvhe/trace.h new file mode 100644 index 000000000000..8813ff250f8e --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/trace.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ARM64_KVM_HYP_NVHE_TRACE_H +#define __ARM64_KVM_HYP_NVHE_TRACE_H + +#include + +#include + +static inline pid_t __tracing_get_vcpu_pid(struct kvm_cpu_context *host_ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!host_ctxt) + host_ctxt = host_data_ptr(host_ctxt); + + vcpu = host_ctxt->__hyp_running_vcpu; + + return vcpu ? vcpu->arch.pid : 0; +} + +#define HE_PROTO(__args...) __args +#define HE_ASSIGN(__args...) 
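/*
 * Illustration only, not part of the patch: the hyp state is stored
 * complemented (XORed with the two-bit vmemmap mask), so that a
 * zero-filled vmemmap entry decodes to PKVM_NOPAGE (0b11) rather than
 * to a valid state. Stand-alone demonstration of the round-trip:
 */
#include <assert.h>
#include <stdint.h>

#define STATE_MASK	0x3u	/* BIT(0) | BIT(1) */
#define NOPAGE		0x3u	/* the PKVM_NOPAGE encoding */

static uint8_t state_encode(uint8_t s) { return s ^ STATE_MASK; }
static uint8_t state_decode(uint8_t c) { return c ^ STATE_MASK; }

int main(void)
{
	assert(state_decode(0) == NOPAGE);	/* zeroed entry => NOPAGE */
	assert(state_decode(state_encode(0x1)) == 0x1);
	return 0;
}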
__args +#define HE_STRUCT RE_STRUCT +#define he_field re_field + +#ifdef CONFIG_NVHE_EL2_TRACING + +#define HYP_EVENT(__name, __proto, __struct, __assign, __printk) \ + REMOTE_EVENT_FORMAT(__name, __struct); \ + extern struct hyp_event_id hyp_event_id_##__name; \ + static __always_inline void trace_##__name(__proto) \ + { \ + struct remote_event_format_##__name *__entry; \ + size_t length = sizeof(*__entry); \ + \ + if (!atomic_read(&hyp_event_id_##__name.enabled)) \ + return; \ + __entry = tracing_reserve_entry(length); \ + if (!__entry) \ + return; \ + __entry->hdr.id = hyp_event_id_##__name.id; \ + __assign \ + tracing_commit_entry(); \ + } + +void *tracing_reserve_entry(unsigned long length); +void tracing_commit_entry(void); + +int __tracing_load(unsigned long desc_va, size_t desc_size); +void __tracing_unload(void); +int __tracing_enable(bool enable); +int __tracing_swap_reader(unsigned int cpu); +void __tracing_update_clock(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc); +int __tracing_reset(unsigned int cpu); +int __tracing_enable_event(unsigned short id, bool enable); +#else +static inline void *tracing_reserve_entry(unsigned long length) { return NULL; } +static inline void tracing_commit_entry(void) { } +#define HYP_EVENT(__name, __proto, __struct, __assign, __printk) \ + static inline void trace_##__name(__proto) {} + +static inline int __tracing_load(unsigned long desc_va, size_t desc_size) { return -ENODEV; } +static inline void __tracing_unload(void) { } +static inline int __tracing_enable(bool enable) { return -ENODEV; } +static inline int __tracing_swap_reader(unsigned int cpu) { return -ENODEV; } +static inline void __tracing_update_clock(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc) { } +static inline int __tracing_reset(unsigned int cpu) { return -ENODEV; } +static inline int __tracing_enable_event(unsigned short id, bool enable) { return -ENODEV; } +#endif +#endif diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h index ba5382c12787..32d7b7746e8e 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h +++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h @@ -16,4 +16,6 @@ __always_unused int ___check_reg_ ## reg; \ type name = (type)cpu_reg(ctxt, (reg)) +void inject_host_exception(u64 esr); + #endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index a244ec25f8c5..62cdfbff7562 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -17,7 +17,7 @@ ccflags-y += -fno-stack-protector \ hostprogs := gen-hyprel HOST_EXTRACFLAGS += -I$(objtree)/include -lib-objs := clear_page.o copy_page.o memcpy.o memset.o +lib-objs := clear_page.o copy_page.o memcpy.o memset.o tishift.o lib-objs := $(addprefix ../../../lib/, $(lib-objs)) CFLAGS_switch.nvhe.o += -Wno-override-init @@ -26,11 +26,15 @@ hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \ cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ - ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o + ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o ../vgic-v5-sr.o hyp-obj-y += ../../../kernel/smccc-call.o hyp-obj-$(CONFIG_LIST_HARDENED) += list_debug.o +hyp-obj-$(CONFIG_NVHE_EL2_TRACING) += clock.o trace.o events.o hyp-obj-y += $(lib-objs) +# Path to 
simple_ring_buffer.c +CFLAGS_trace.nvhe.o += -I$(srctree)/kernel/trace/ + ## ## Build rules for compiling nVHE hyp code ## Output of this folder is `kvm_nvhe.o`, a partially linked object diff --git a/arch/arm64/kvm/hyp/nvhe/clock.c b/arch/arm64/kvm/hyp/nvhe/clock.c new file mode 100644 index 000000000000..32fc4313fe43 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/clock.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 Google LLC + * Author: Vincent Donnefort + */ + +#include + +#include +#include + +static struct clock_data { + struct { + u32 mult; + u32 shift; + u64 epoch_ns; + u64 epoch_cyc; + u64 cyc_overflow64; + } data[2]; + u64 cur; +} trace_clock_data; + +static u64 __clock_mult_uint128(u64 cyc, u32 mult, u32 shift) +{ + __uint128_t ns = (__uint128_t)cyc * mult; + + ns >>= shift; + + return (u64)ns; +} + +/* Does not guarantee no reader on the modified bank. */ +void trace_clock_update(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc) +{ + struct clock_data *clock = &trace_clock_data; + u64 bank = clock->cur ^ 1; + + clock->data[bank].mult = mult; + clock->data[bank].shift = shift; + clock->data[bank].epoch_ns = epoch_ns; + clock->data[bank].epoch_cyc = epoch_cyc; + clock->data[bank].cyc_overflow64 = ULONG_MAX / mult; + + smp_store_release(&clock->cur, bank); +} + +/* Use untrusted host data */ +u64 trace_clock(void) +{ + struct clock_data *clock = &trace_clock_data; + u64 bank = smp_load_acquire(&clock->cur); + u64 cyc, ns; + + cyc = __arch_counter_get_cntvct() - clock->data[bank].epoch_cyc; + + if (likely(cyc < clock->data[bank].cyc_overflow64)) { + ns = cyc * clock->data[bank].mult; + ns >>= clock->data[bank].shift; + } else { + ns = __clock_mult_uint128(cyc, clock->data[bank].mult, + clock->data[bank].shift); + } + + return (u64)ns + clock->data[bank].epoch_ns; +} diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index 2a1c0f49792b..f8904391c125 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -14,20 +14,20 @@ #include #include -static void __debug_save_spe(u64 *pmscr_el1) +static void __debug_save_spe(void) { - u64 reg; + u64 *pmscr_el1, *pmblimitr_el1; - /* Clear pmscr in case of early return */ - *pmscr_el1 = 0; + pmscr_el1 = host_data_ptr(host_debug_state.pmscr_el1); + pmblimitr_el1 = host_data_ptr(host_debug_state.pmblimitr_el1); /* * At this point, we know that this CPU implements * SPE and is available to the host. * Check if the host is actually using it ? */ - reg = read_sysreg_s(SYS_PMBLIMITR_EL1); - if (!(reg & BIT(PMBLIMITR_EL1_E_SHIFT))) + *pmblimitr_el1 = read_sysreg_s(SYS_PMBLIMITR_EL1); + if (!(*pmblimitr_el1 & BIT(PMBLIMITR_EL1_E_SHIFT))) return; /* Yes; save the control register and disable data generation */ @@ -37,18 +37,29 @@ static void __debug_save_spe(u64 *pmscr_el1) /* Now drain all buffered data to memory */ psb_csync(); + dsb(nsh); + + /* And disable the profiling buffer */ + write_sysreg_s(0, SYS_PMBLIMITR_EL1); + isb(); } -static void __debug_restore_spe(u64 pmscr_el1) +static void __debug_restore_spe(void) { - if (!pmscr_el1) + u64 pmblimitr_el1 = *host_data_ptr(host_debug_state.pmblimitr_el1); + + if (!(pmblimitr_el1 & BIT(PMBLIMITR_EL1_E_SHIFT))) return; /* The host page table is installed, but not yet synchronised */ isb(); + /* Re-enable the profiling buffer. 
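/*
 * Illustration only, not part of the patch: trace_clock_update() and
 * trace_clock() above form a two-bank, wait-free clock. The writer
 * fills the inactive bank and publishes it with a release store; the
 * reader picks a bank with an acquire load, so it always observes a
 * complete mult/shift/epoch tuple. C11 sketch of the same scheme (the
 * 128-bit overflow fallback is omitted for brevity):
 */
#include <stdatomic.h>
#include <stdint.h>

struct clk_bank { uint32_t mult, shift; uint64_t epoch_ns, epoch_cyc; };

static struct clk_bank banks[2];
static _Atomic uint64_t cur_bank;

static void clk_update(struct clk_bank b)
{
	uint64_t next = atomic_load_explicit(&cur_bank,
					     memory_order_relaxed) ^ 1;

	banks[next] = b;	/* fill the bank no new reader will pick */
	atomic_store_explicit(&cur_bank, next, memory_order_release);
}

static uint64_t clk_read(uint64_t cyc)
{
	uint64_t i = atomic_load_explicit(&cur_bank, memory_order_acquire);
	const struct clk_bank *b = &banks[i];

	return (((cyc - b->epoch_cyc) * b->mult) >> b->shift) + b->epoch_ns;
}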
*/ + write_sysreg_s(pmblimitr_el1, SYS_PMBLIMITR_EL1); + isb(); + /* Re-enable data generation */ - write_sysreg_el1(pmscr_el1, SYS_PMSCR); + write_sysreg_el1(*host_data_ptr(host_debug_state.pmscr_el1), SYS_PMSCR); } static void __trace_do_switch(u64 *saved_trfcr, u64 new_trfcr) @@ -57,12 +68,54 @@ static void __trace_do_switch(u64 *saved_trfcr, u64 new_trfcr) write_sysreg_el1(new_trfcr, SYS_TRFCR); } -static bool __trace_needs_drain(void) +static void __trace_drain_and_disable(void) { - if (is_protected_kvm_enabled() && host_data_test_flag(HAS_TRBE)) - return read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_EL1_E; + u64 *trblimitr_el1 = host_data_ptr(host_debug_state.trblimitr_el1); + bool needs_drain = is_protected_kvm_enabled() ? + host_data_test_flag(HAS_TRBE) : + host_data_test_flag(TRBE_ENABLED); - return host_data_test_flag(TRBE_ENABLED); + if (!needs_drain) { + *trblimitr_el1 = 0; + return; + } + + *trblimitr_el1 = read_sysreg_s(SYS_TRBLIMITR_EL1); + if (*trblimitr_el1 & TRBLIMITR_EL1_E) { + /* + * The host has enabled the Trace Buffer Unit so we have + * to beat the CPU with a stick until it stops accessing + * memory. + */ + + /* First, ensure that our prior write to TRFCR has stuck. */ + isb(); + + /* Now synchronise with the trace and drain the buffer. */ + tsb_csync(); + dsb(nsh); + + /* + * With no more trace being generated, we can disable the + * Trace Buffer Unit. + */ + write_sysreg_s(0, SYS_TRBLIMITR_EL1); + if (cpus_have_final_cap(ARM64_WORKAROUND_2064142)) { + /* + * Some CPUs are so good, we have to drain 'em + * twice. + */ + tsb_csync(); + dsb(nsh); + } + + /* + * Ensure that the Trace Buffer Unit is disabled before + * we start mucking with the stage-2 and trap + * configuration. + */ + isb(); + } } static bool __trace_needs_switch(void) @@ -79,21 +132,34 @@ static void __trace_switch_to_guest(void) __trace_do_switch(host_data_ptr(host_debug_state.trfcr_el1), *host_data_ptr(trfcr_while_in_guest)); - - if (__trace_needs_drain()) { - isb(); - tsb_csync(); - } + __trace_drain_and_disable(); } static void __trace_switch_to_host(void) { + u64 trblimitr_el1 = *host_data_ptr(host_debug_state.trblimitr_el1); + + if (trblimitr_el1 & TRBLIMITR_EL1_E) { + /* Re-enable the Trace Buffer Unit for the host. */ + write_sysreg_s(trblimitr_el1, SYS_TRBLIMITR_EL1); + isb(); + if (cpus_have_final_cap(ARM64_WORKAROUND_2038923)) { + /* + * Make sure the unit is re-enabled before we + * poke TRFCR. 
+ */ + isb(); + } + } + __trace_do_switch(host_data_ptr(trfcr_while_in_guest), *host_data_ptr(host_debug_state.trfcr_el1)); } -static void __debug_save_brbe(u64 *brbcr_el1) +static void __debug_save_brbe(void) { + u64 *brbcr_el1 = host_data_ptr(host_debug_state.brbcr_el1); + *brbcr_el1 = 0; /* Check if the BRBE is enabled */ @@ -109,8 +175,10 @@ static void __debug_save_brbe(u64 *brbcr_el1) write_sysreg_el1(0, SYS_BRBCR); } -static void __debug_restore_brbe(u64 brbcr_el1) +static void __debug_restore_brbe(void) { + u64 brbcr_el1 = *host_data_ptr(host_debug_state.brbcr_el1); + if (!brbcr_el1) return; @@ -122,11 +190,11 @@ void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) { /* Disable and flush SPE data generation */ if (host_data_test_flag(HAS_SPE)) - __debug_save_spe(host_data_ptr(host_debug_state.pmscr_el1)); + __debug_save_spe(); /* Disable BRBE branch records */ if (host_data_test_flag(HAS_BRBE)) - __debug_save_brbe(host_data_ptr(host_debug_state.brbcr_el1)); + __debug_save_brbe(); if (__trace_needs_switch()) __trace_switch_to_guest(); @@ -140,9 +208,9 @@ void __debug_switch_to_guest(struct kvm_vcpu *vcpu) void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) { if (host_data_test_flag(HAS_SPE)) - __debug_restore_spe(*host_data_ptr(host_debug_state.pmscr_el1)); + __debug_restore_spe(); if (host_data_test_flag(HAS_BRBE)) - __debug_restore_brbe(*host_data_ptr(host_debug_state.brbcr_el1)); + __debug_restore_brbe(); if (__trace_needs_switch()) __trace_switch_to_host(); } diff --git a/arch/arm64/kvm/hyp/nvhe/events.c b/arch/arm64/kvm/hyp/nvhe/events.c new file mode 100644 index 000000000000..add9383aadb5 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/events.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Google LLC + * Author: Vincent Donnefort + */ + +#include +#include + +#include + +int __tracing_enable_event(unsigned short id, bool enable) +{ + struct hyp_event_id *event_id = &__hyp_event_ids_start[id]; + atomic_t *enabled; + + if (event_id >= __hyp_event_ids_end) + return -EINVAL; + + enabled = hyp_fixmap_map(__hyp_pa(&event_id->enabled)); + atomic_set(enabled, enable); + hyp_fixmap_unmap(); + + return 0; +} diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 94161ea1cd60..1af722771178 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -26,10 +26,10 @@ * the duration and are therefore serialised. 
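/*
 * Illustration only, not part of the patch: the hyp event fast path
 * (see the HYP_EVENT() macro in nvhe/trace.h earlier in the series) is
 * "relaxed-load an enable flag, reserve, fill, commit", so a disabled
 * event costs a single atomic load. Stand-alone model; the
 * ring_reserve()/ring_commit() stand-ins are hypothetical placeholders
 * for the real ring-buffer operations.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <string.h>

static unsigned char slot[256];

static void *ring_reserve(size_t len)
{
	return len <= sizeof(slot) ? slot : NULL;	/* trivial stand-in */
}

static void ring_commit(void) { }			/* trivial stand-in */

struct evt { _Atomic int enabled; };

static bool emit_event(struct evt *e, const void *payload, size_t len)
{
	void *p;

	if (!atomic_load_explicit(&e->enabled, memory_order_relaxed))
		return false;		/* disabled: cheapest possible exit */

	p = ring_reserve(len);
	if (!p)
		return false;		/* buffer full, drop the event */
	memcpy(p, payload, len);
	ring_commit();
	return true;
}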
*/ -#include #include #include +#include #include #include #include @@ -147,7 +147,7 @@ static int ffa_map_hyp_buffers(u64 ffa_page_count) { struct arm_smccc_1_2_regs res; - arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { .a0 = FFA_FN64_RXTX_MAP, .a1 = hyp_virt_to_phys(hyp_buffers.tx), .a2 = hyp_virt_to_phys(hyp_buffers.rx), @@ -161,7 +161,7 @@ static int ffa_unmap_hyp_buffers(void) { struct arm_smccc_1_2_regs res; - arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { .a0 = FFA_RXTX_UNMAP, .a1 = HOST_FFA_ID, }, &res); @@ -172,7 +172,7 @@ static int ffa_unmap_hyp_buffers(void) static void ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res, u32 handle_lo, u32 handle_hi, u32 fraglen, u32 endpoint_id) { - arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { .a0 = FFA_MEM_FRAG_TX, .a1 = handle_lo, .a2 = handle_hi, @@ -184,7 +184,7 @@ static void ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res, u32 handle_lo, static void ffa_mem_frag_rx(struct arm_smccc_1_2_regs *res, u32 handle_lo, u32 handle_hi, u32 fragoff) { - arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { .a0 = FFA_MEM_FRAG_RX, .a1 = handle_lo, .a2 = handle_hi, @@ -196,7 +196,7 @@ static void ffa_mem_frag_rx(struct arm_smccc_1_2_regs *res, u32 handle_lo, static void ffa_mem_xfer(struct arm_smccc_1_2_regs *res, u64 func_id, u32 len, u32 fraglen) { - arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { .a0 = func_id, .a1 = len, .a2 = fraglen, @@ -206,7 +206,7 @@ static void ffa_mem_xfer(struct arm_smccc_1_2_regs *res, u64 func_id, u32 len, static void ffa_mem_reclaim(struct arm_smccc_1_2_regs *res, u32 handle_lo, u32 handle_hi, u32 flags) { - arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { .a0 = FFA_MEM_RECLAIM, .a1 = handle_lo, .a2 = handle_hi, @@ -216,7 +216,7 @@ static void ffa_mem_reclaim(struct arm_smccc_1_2_regs *res, u32 handle_lo, static void ffa_retrieve_req(struct arm_smccc_1_2_regs *res, u32 len) { - arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { .a0 = FFA_FN64_MEM_RETRIEVE_REQ, .a1 = len, .a2 = len, @@ -225,7 +225,7 @@ static void ffa_retrieve_req(struct arm_smccc_1_2_regs *res, u32 len) static void ffa_rx_release(struct arm_smccc_1_2_regs *res) { - arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { .a0 = FFA_RX_RELEASE, }, res); } @@ -728,7 +728,7 @@ static int hyp_ffa_post_init(void) size_t min_rxtx_sz; struct arm_smccc_1_2_regs res; - arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){ + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){ .a0 = FFA_ID_GET, }, &res); if (res.a0 != FFA_SUCCESS) @@ -737,7 +737,7 @@ static int hyp_ffa_post_init(void) if (res.a2 != HOST_FFA_ID) return -EINVAL; - arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){ + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){ .a0 = FFA_FEATURES, .a1 = FFA_FN64_RXTX_MAP, }, &res); @@ -788,7 +788,7 @@ static void do_ffa_version(struct arm_smccc_1_2_regs *res, * first if TEE supports it. 
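/*
 * Illustration only, not part of the patch: a much-simplified model of
 * the FFA_VERSION negotiation here. Majors must match; when they do,
 * both sides converge on the smaller minor. The real code additionally
 * forwards a lower requested version to the TEE first, as the comment
 * above describes; that step is elided in this sketch.
 */
#include <stdint.h>

#define FFA_VER(maj, min)	(((uint32_t)(maj) << 16) | (min))
#define FFA_MAJOR(v)		((v) >> 16)
#define FFA_MINOR(v)		((v) & 0xffff)

static uint32_t ffa_negotiate(uint32_t ours, uint32_t theirs)
{
	if (FFA_MAJOR(ours) != FFA_MAJOR(theirs))
		return UINT32_MAX;	/* treat as NOT_SUPPORTED */

	return FFA_MINOR(theirs) < FFA_MINOR(ours) ? theirs : ours;
}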
*/ if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version)) { - arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { .a0 = FFA_VERSION, .a1 = ffa_req_version, }, res); @@ -824,7 +824,7 @@ static void do_ffa_part_get(struct arm_smccc_1_2_regs *res, goto out_unlock; } - arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { .a0 = FFA_PARTITION_INFO_GET, .a1 = uuid0, .a2 = uuid1, @@ -939,7 +939,7 @@ int hyp_ffa_init(void *pages) if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2) return 0; - arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { + hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) { .a0 = FFA_VERSION, .a1 = FFA_VERSION_1_2, }, &res); diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index eef15b374abb..f337770ec459 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -120,12 +120,11 @@ SYM_FUNC_START(__hyp_do_panic) mov x29, x0 -#ifdef CONFIG_NVHE_EL2_DEBUG +#ifdef PKVM_DISABLE_STAGE2_ON_PANIC /* Ensure host stage-2 is disabled */ mrs x0, hcr_el2 bic x0, x0, #HCR_VM msr_hcr_el2 x0 - isb tlbi vmalls12e1 dsb nsh #endif @@ -291,13 +290,3 @@ SYM_CODE_START(__kvm_hyp_host_forward_smc) ret SYM_CODE_END(__kvm_hyp_host_forward_smc) - -/* - * kvm_host_psci_cpu_entry is called through br instruction, which requires - * bti j instruction as compilers (gcc and llvm) doesn't insert bti j for external - * functions, but bti c instead. - */ -SYM_CODE_START(kvm_host_psci_cpu_entry) - bti j - b __kvm_host_psci_cpu_entry -SYM_CODE_END(kvm_host_psci_cpu_entry) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 0d42eedc7167..64296b31da73 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -173,9 +173,8 @@ SYM_CODE_END(___kvm_hyp_init) * x0: struct kvm_nvhe_init_params PA */ SYM_CODE_START(kvm_hyp_cpu_entry) - mov x1, #1 // is_cpu_on = true + ldr x29, =__kvm_host_psci_cpu_on_entry b __kvm_hyp_init_cpu -SYM_CODE_END(kvm_hyp_cpu_entry) /* * PSCI CPU_SUSPEND / SYSTEM_SUSPEND entry point @@ -183,32 +182,17 @@ SYM_CODE_END(kvm_hyp_cpu_entry) * x0: struct kvm_nvhe_init_params PA */ SYM_CODE_START(kvm_hyp_cpu_resume) - mov x1, #0 // is_cpu_on = false - b __kvm_hyp_init_cpu -SYM_CODE_END(kvm_hyp_cpu_resume) + ldr x29, =__kvm_host_psci_cpu_resume_entry -/* - * Common code for CPU entry points. Initializes EL2 state and - * installs the hypervisor before handing over to a C handler. - * - * x0: struct kvm_nvhe_init_params PA - * x1: bool is_cpu_on - */ -SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) +SYM_INNER_LABEL(__kvm_hyp_init_cpu, SYM_L_LOCAL) mov x28, x0 // Stash arguments - mov x29, x1 /* Check that the core was booted in EL2. */ mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 - b.eq 2f + b.ne 1f - /* The core booted in EL1. KVM cannot be initialized on it. */ -1: wfe - wfi - b 1b - -2: msr SPsel, #1 // We want to use SP_EL{1,2} + msr SPsel, #1 // We want to use SP_EL2 init_el2_hcr 0 @@ -218,11 +202,16 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) mov x0, x28 bl ___kvm_hyp_init // Clobbers x0..x2 - /* Leave idmap. */ - mov x0, x29 - ldr x1, =kvm_host_psci_cpu_entry - br x1 -SYM_CODE_END(__kvm_hyp_init_cpu) + /* Leave idmap -- using BLR is OK, LR is restored from host context */ + blr x29 + + // The core booted in EL1, or the C code unexpectedly returned. + // Either way, KVM cannot be initialized on it. 
+1: wfe + wfi + b 1b +SYM_CODE_END(kvm_hyp_cpu_resume) +SYM_CODE_END(kvm_hyp_cpu_entry) SYM_CODE_START(__kvm_handle_stub_hvc) /* diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index e7790097db93..73f2e0221e70 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -12,12 +12,14 @@ #include #include #include +#include #include #include #include #include #include +#include #include DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); @@ -136,6 +138,8 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2; hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3 = host_vcpu->arch.vgic_cpu.vgic_v3; + + hyp_vcpu->vcpu.arch.pid = host_vcpu->arch.pid; } static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) @@ -169,9 +173,6 @@ static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt) DECLARE_REG(u64, hcr_el2, host_ctxt, 3); struct pkvm_hyp_vcpu *hyp_vcpu; - if (!is_protected_kvm_enabled()) - return; - hyp_vcpu = pkvm_load_hyp_vcpu(handle, vcpu_idx); if (!hyp_vcpu) return; @@ -188,12 +189,8 @@ static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt) static void handle___pkvm_vcpu_put(struct kvm_cpu_context *host_ctxt) { - struct pkvm_hyp_vcpu *hyp_vcpu; + struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); - if (!is_protected_kvm_enabled()) - return; - - hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); if (hyp_vcpu) pkvm_put_hyp_vcpu(hyp_vcpu); } @@ -248,6 +245,26 @@ static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu) &host_vcpu->arch.pkvm_memcache); } +static void handle___pkvm_host_donate_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || !pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + + ret = pkvm_refill_memcache(hyp_vcpu); + if (ret) + goto out; + + ret = __pkvm_host_donate_guest(pfn, gfn, hyp_vcpu); +out: + cpu_reg(host_ctxt, 1) = ret; +} + static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(u64, pfn, host_ctxt, 1); @@ -257,9 +274,6 @@ static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt) struct pkvm_hyp_vcpu *hyp_vcpu; int ret = -EINVAL; - if (!is_protected_kvm_enabled()) - goto out; - hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) goto out; @@ -281,9 +295,6 @@ static void handle___pkvm_host_unshare_guest(struct kvm_cpu_context *host_ctxt) struct pkvm_hyp_vm *hyp_vm; int ret = -EINVAL; - if (!is_protected_kvm_enabled()) - goto out; - hyp_vm = get_np_pkvm_hyp_vm(handle); if (!hyp_vm) goto out; @@ -301,9 +312,6 @@ static void handle___pkvm_host_relax_perms_guest(struct kvm_cpu_context *host_ct struct pkvm_hyp_vcpu *hyp_vcpu; int ret = -EINVAL; - if (!is_protected_kvm_enabled()) - goto out; - hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) goto out; @@ -321,9 +329,6 @@ static void handle___pkvm_host_wrprotect_guest(struct kvm_cpu_context *host_ctxt struct pkvm_hyp_vm *hyp_vm; int ret = -EINVAL; - if (!is_protected_kvm_enabled()) - goto out; - hyp_vm = get_np_pkvm_hyp_vm(handle); if (!hyp_vm) goto out; @@ -343,9 +348,6 @@ static void handle___pkvm_host_test_clear_young_guest(struct kvm_cpu_context *ho struct pkvm_hyp_vm *hyp_vm; int ret = -EINVAL; - if (!is_protected_kvm_enabled()) - 
goto out; - hyp_vm = get_np_pkvm_hyp_vm(handle); if (!hyp_vm) goto out; @@ -362,9 +364,6 @@ static void handle___pkvm_host_mkyoung_guest(struct kvm_cpu_context *host_ctxt) struct pkvm_hyp_vcpu *hyp_vcpu; int ret = -EINVAL; - if (!is_protected_kvm_enabled()) - goto out; - hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) goto out; @@ -424,12 +423,8 @@ static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) static void handle___pkvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); - struct pkvm_hyp_vm *hyp_vm; + struct pkvm_hyp_vm *hyp_vm = get_np_pkvm_hyp_vm(handle); - if (!is_protected_kvm_enabled()) - return; - - hyp_vm = get_np_pkvm_hyp_vm(handle); if (!hyp_vm) return; @@ -486,17 +481,15 @@ static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(phys_addr_t, phys, host_ctxt, 1); DECLARE_REG(unsigned long, size, host_ctxt, 2); - DECLARE_REG(unsigned long, nr_cpus, host_ctxt, 3); - DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 4); - DECLARE_REG(u32, hyp_va_bits, host_ctxt, 5); + DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 3); + DECLARE_REG(u32, hyp_va_bits, host_ctxt, 4); /* * __pkvm_init() will return only if an error occurred, otherwise it * will tail-call in __pkvm_init_finalise() which will have to deal * with the host context directly. */ - cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, nr_cpus, per_cpu_base, - hyp_va_bits); + cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, per_cpu_base, hyp_va_bits); } static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt) @@ -582,11 +575,115 @@ static void handle___pkvm_init_vcpu(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_init_vcpu(handle, host_vcpu, vcpu_hva); } -static void handle___pkvm_teardown_vm(struct kvm_cpu_context *host_ctxt) +static void handle___pkvm_vcpu_in_poison_fault(struct kvm_cpu_context *host_ctxt) +{ + int ret; + struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + + ret = hyp_vcpu ? 
__pkvm_vcpu_in_poison_fault(hyp_vcpu) : -EINVAL; + cpu_reg(host_ctxt, 1) = ret; +} + +static void handle___pkvm_force_reclaim_guest_page(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(phys_addr_t, phys, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_host_force_reclaim_page_guest(phys); +} + +static void handle___pkvm_reclaim_dying_guest_page(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + + cpu_reg(host_ctxt, 1) = __pkvm_reclaim_dying_guest_page(handle, gfn); +} + +static void handle___pkvm_start_teardown_vm(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); - cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle); + cpu_reg(host_ctxt, 1) = __pkvm_start_teardown_vm(handle); +} + +static void handle___pkvm_finalize_teardown_vm(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_finalize_teardown_vm(handle); +} + +static void handle___tracing_load(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned long, desc_hva, host_ctxt, 1); + DECLARE_REG(size_t, desc_size, host_ctxt, 2); + + cpu_reg(host_ctxt, 1) = __tracing_load(desc_hva, desc_size); +} + +static void handle___tracing_unload(struct kvm_cpu_context *host_ctxt) +{ + __tracing_unload(); +} + +static void handle___tracing_enable(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(bool, enable, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __tracing_enable(enable); +} + +static void handle___tracing_swap_reader(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned int, cpu, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __tracing_swap_reader(cpu); +} + +static void handle___tracing_update_clock(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u32, mult, host_ctxt, 1); + DECLARE_REG(u32, shift, host_ctxt, 2); + DECLARE_REG(u64, epoch_ns, host_ctxt, 3); + DECLARE_REG(u64, epoch_cyc, host_ctxt, 4); + + __tracing_update_clock(mult, shift, epoch_ns, epoch_cyc); +} + +static void handle___tracing_reset(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned int, cpu, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __tracing_reset(cpu); +} + +static void handle___tracing_enable_event(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned short, id, host_ctxt, 1); + DECLARE_REG(bool, enable, host_ctxt, 2); + + cpu_reg(host_ctxt, 1) = __tracing_enable_event(id, enable); +} + +static void handle___tracing_write_event(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, id, host_ctxt, 1); + + trace_selftest(id); +} + +static void handle___vgic_v5_save_apr(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct vgic_v5_cpu_if *, cpu_if, host_ctxt, 1); + + __vgic_v5_save_apr(kern_hyp_va(cpu_if)); +} + +static void handle___vgic_v5_restore_vmcr_apr(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct vgic_v5_cpu_if *, cpu_if, host_ctxt, 1); + + __vgic_v5_restore_vmcr_apr(kern_hyp_va(cpu_if)); } typedef void (*hcall_t)(struct kvm_cpu_context *); @@ -603,14 +700,6 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__vgic_v3_get_gic_config), HANDLE_FUNC(__pkvm_prot_finalize), - HANDLE_FUNC(__pkvm_host_share_hyp), - HANDLE_FUNC(__pkvm_host_unshare_hyp), - HANDLE_FUNC(__pkvm_host_share_guest), - HANDLE_FUNC(__pkvm_host_unshare_guest), - HANDLE_FUNC(__pkvm_host_relax_perms_guest), - HANDLE_FUNC(__pkvm_host_wrprotect_guest), - HANDLE_FUNC(__pkvm_host_test_clear_young_guest), - HANDLE_FUNC(__pkvm_host_mkyoung_guest), 
HANDLE_FUNC(__kvm_adjust_pc), HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), @@ -622,20 +711,44 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_timer_set_cntvoff), HANDLE_FUNC(__vgic_v3_save_aprs), HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs), + HANDLE_FUNC(__vgic_v5_save_apr), + HANDLE_FUNC(__vgic_v5_restore_vmcr_apr), + + HANDLE_FUNC(__pkvm_host_share_hyp), + HANDLE_FUNC(__pkvm_host_unshare_hyp), + HANDLE_FUNC(__pkvm_host_donate_guest), + HANDLE_FUNC(__pkvm_host_share_guest), + HANDLE_FUNC(__pkvm_host_unshare_guest), + HANDLE_FUNC(__pkvm_host_relax_perms_guest), + HANDLE_FUNC(__pkvm_host_wrprotect_guest), + HANDLE_FUNC(__pkvm_host_test_clear_young_guest), + HANDLE_FUNC(__pkvm_host_mkyoung_guest), HANDLE_FUNC(__pkvm_reserve_vm), HANDLE_FUNC(__pkvm_unreserve_vm), HANDLE_FUNC(__pkvm_init_vm), HANDLE_FUNC(__pkvm_init_vcpu), - HANDLE_FUNC(__pkvm_teardown_vm), + HANDLE_FUNC(__pkvm_vcpu_in_poison_fault), + HANDLE_FUNC(__pkvm_force_reclaim_guest_page), + HANDLE_FUNC(__pkvm_reclaim_dying_guest_page), + HANDLE_FUNC(__pkvm_start_teardown_vm), + HANDLE_FUNC(__pkvm_finalize_teardown_vm), HANDLE_FUNC(__pkvm_vcpu_load), HANDLE_FUNC(__pkvm_vcpu_put), HANDLE_FUNC(__pkvm_tlb_flush_vmid), + HANDLE_FUNC(__tracing_load), + HANDLE_FUNC(__tracing_unload), + HANDLE_FUNC(__tracing_enable), + HANDLE_FUNC(__tracing_swap_reader), + HANDLE_FUNC(__tracing_update_clock), + HANDLE_FUNC(__tracing_reset), + HANDLE_FUNC(__tracing_enable_event), + HANDLE_FUNC(__tracing_write_event), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(unsigned long, id, host_ctxt, 0); - unsigned long hcall_min = 0; + unsigned long hcall_min = 0, hcall_max = -1; hcall_t hfn; /* @@ -647,14 +760,19 @@ static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) * basis. This is all fine, however, since __pkvm_prot_finalize * returns -EPERM after the first call for a given CPU. */ - if (static_branch_unlikely(&kvm_protected_mode_initialized)) - hcall_min = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize; + if (static_branch_unlikely(&kvm_protected_mode_initialized)) { + hcall_min = __KVM_HOST_SMCCC_FUNC_MIN_PKVM; + } else { + hcall_max = __KVM_HOST_SMCCC_FUNC_MAX_NO_PKVM; + } id &= ~ARM_SMCCC_CALL_HINTS; id -= KVM_HOST_SMCCC_ID(0); - if (unlikely(id < hcall_min || id >= ARRAY_SIZE(host_hcall))) + if (unlikely(id < hcall_min || id > hcall_max || + id >= ARRAY_SIZE(host_hcall))) { goto inval; + } hfn = host_hcall[id]; if (unlikely(!hfn)) @@ -670,14 +788,22 @@ inval: static void default_host_smc_handler(struct kvm_cpu_context *host_ctxt) { + trace_hyp_exit(host_ctxt, HYP_REASON_SMC); __kvm_hyp_host_forward_smc(host_ctxt); + trace_hyp_enter(host_ctxt, HYP_REASON_SMC); } static void handle_host_smc(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(u64, func_id, host_ctxt, 0); + u64 esr = read_sysreg_el2(SYS_ESR); bool handled; + if (esr & ESR_ELx_xVC_IMM_MASK) { + cpu_reg(host_ctxt, 0) = SMCCC_RET_NOT_SUPPORTED; + goto exit_skip_instr; + } + func_id &= ~ARM_SMCCC_CALL_HINTS; handled = kvm_host_psci_handler(host_ctxt, func_id); @@ -686,47 +812,57 @@ static void handle_host_smc(struct kvm_cpu_context *host_ctxt) if (!handled) default_host_smc_handler(host_ctxt); +exit_skip_instr: /* SMC was trapped, move ELR past the current PC. */ kvm_skip_host_instr(); } -/* - * Inject an Undefined Instruction exception into the host. - * - * This is open-coded to allow control over PSTATE construction without - * complicating the generic exception entry helpers. 
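/*
 * Illustration only, not part of the patch: host hypercalls dispatch
 * through a function table gated by an [hcall_min, hcall_max] window.
 * Before pKVM is finalised, IDs above the no-pKVM maximum are
 * rejected; once it is, IDs below the pKVM minimum (the one-shot
 * init-time calls) are rejected instead. Stand-alone sketch with
 * hypothetical handlers:
 */
#include <stddef.h>

typedef int (*hcall_fn)(void);

static int dispatch(const hcall_fn *tbl, size_t nr, size_t id,
		    size_t min, size_t max)
{
	if (id < min || id > max || id >= nr || !tbl[id])
		return -22;	/* -EINVAL: outside the permitted window */

	return tbl[id]();
}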
- */ -static void inject_undef64(void) +void inject_host_exception(u64 esr) { - u64 spsr_mask, vbar, sctlr, old_spsr, new_spsr, esr, offset; + u64 sctlr, spsr_el1, spsr_el2, exc_offset = except_type_sync; + const u64 spsr_mask = PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | + PSR_V_BIT | PSR_DIT_BIT | PSR_PAN_BIT; - spsr_mask = PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT | PSR_DIT_BIT | PSR_PAN_BIT; + spsr_el1 = spsr_el2 = read_sysreg_el2(SYS_SPSR); + switch (spsr_el1 & (PSR_MODE_MASK | PSR_MODE32_BIT)) { + case PSR_MODE_EL0t: + exc_offset += LOWER_EL_AArch64_VECTOR; + break; + case PSR_MODE_EL0t | PSR_MODE32_BIT: + exc_offset += LOWER_EL_AArch32_VECTOR; + break; + default: + exc_offset += CURRENT_EL_SP_ELx_VECTOR; + } + + spsr_el2 &= spsr_mask; + spsr_el2 |= PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | + PSR_MODE_EL1h; - vbar = read_sysreg_el1(SYS_VBAR); sctlr = read_sysreg_el1(SYS_SCTLR); - old_spsr = read_sysreg_el2(SYS_SPSR); - - new_spsr = old_spsr & spsr_mask; - new_spsr |= PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT; - new_spsr |= PSR_MODE_EL1h; - if (!(sctlr & SCTLR_EL1_SPAN)) - new_spsr |= PSR_PAN_BIT; + spsr_el2 |= PSR_PAN_BIT; if (sctlr & SCTLR_ELx_DSSBS) - new_spsr |= PSR_SSBS_BIT; + spsr_el2 |= PSR_SSBS_BIT; if (system_supports_mte()) - new_spsr |= PSR_TCO_BIT; + spsr_el2 |= PSR_TCO_BIT; - esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT) | ESR_ELx_IL; - offset = CURRENT_EL_SP_ELx_VECTOR + except_type_sync; + if (esr_fsc_is_translation_fault(esr)) + write_sysreg_el1(read_sysreg_el2(SYS_FAR), SYS_FAR); write_sysreg_el1(esr, SYS_ESR); write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); - write_sysreg_el1(old_spsr, SYS_SPSR); - write_sysreg_el2(vbar + offset, SYS_ELR); - write_sysreg_el2(new_spsr, SYS_SPSR); + write_sysreg_el1(spsr_el1, SYS_SPSR); + write_sysreg_el2(read_sysreg_el1(SYS_VBAR) + exc_offset, SYS_ELR); + write_sysreg_el2(spsr_el2, SYS_SPSR); +} + +static void inject_host_undef64(void) +{ + inject_host_exception((ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT) | + ESR_ELx_IL); } static bool handle_host_mte(u64 esr) @@ -749,7 +885,7 @@ static bool handle_host_mte(u64 esr) return false; } - inject_undef64(); + inject_host_undef64(); return true; } @@ -757,15 +893,19 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) { u64 esr = read_sysreg_el2(SYS_ESR); + switch (ESR_ELx_EC(esr)) { case ESR_ELx_EC_HVC64: + trace_hyp_enter(host_ctxt, HYP_REASON_HVC); handle_host_hcall(host_ctxt); break; case ESR_ELx_EC_SMC64: + trace_hyp_enter(host_ctxt, HYP_REASON_SMC); handle_host_smc(host_ctxt); break; case ESR_ELx_EC_IABT_LOW: case ESR_ELx_EC_DABT_LOW: + trace_hyp_enter(host_ctxt, HYP_REASON_HOST_ABORT); handle_host_mem_abort(host_ctxt); break; case ESR_ELx_EC_SYS64: @@ -775,4 +915,6 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) default: BUG(); } + + trace_hyp_exit(host_ctxt, HYP_REASON_ERET_HOST); } diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S index d724f6d69302..7a02837203d1 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S @@ -16,6 +16,12 @@ SECTIONS { HYP_SECTION(.text) HYP_SECTION(.data..ro_after_init) HYP_SECTION(.rodata) +#ifdef CONFIG_NVHE_EL2_TRACING + . 
= ALIGN(PAGE_SIZE); + BEGIN_HYP_SECTION(.event_ids) + *(SORT(.hyp.event_ids.*)) + END_HYP_SECTION +#endif /* * .hyp..data..percpu needs to be page aligned to maintain the same diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index d815265bd374..28a471d1927c 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -18,6 +18,7 @@ #include #include #include +#include #define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_AS_S1 | KVM_PGTABLE_S2_IDMAP) @@ -461,8 +462,15 @@ static bool range_is_memory(u64 start, u64 end) static inline int __host_stage2_idmap(u64 start, u64 end, enum kvm_pgtable_prot prot) { + /* + * We don't make permission changes to the host idmap after + * initialisation, so we can squash -EAGAIN to save callers + * having to treat it like success in the case that they try to + * map something that is already mapped. + */ return kvm_pgtable_stage2_map(&host_mmu.pgt, start, end - start, start, - prot, &host_s2_pool, 0); + prot, &host_s2_pool, + KVM_PGTABLE_WALK_IGNORE_EAGAIN); } /* @@ -504,7 +512,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) return ret; if (kvm_pte_valid(pte)) - return -EAGAIN; + return -EEXIST; if (pte) { WARN_ON(addr_is_memory(addr) && @@ -541,24 +549,99 @@ static void __host_update_page_state(phys_addr_t addr, u64 size, enum pkvm_page_ set_host_state(page, state); } -int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) +#define KVM_HOST_DONATION_PTE_OWNER_MASK GENMASK(3, 1) +#define KVM_HOST_DONATION_PTE_EXTRA_MASK GENMASK(59, 4) +static int host_stage2_set_owner_metadata_locked(phys_addr_t addr, u64 size, + u8 owner_id, u64 meta) { + kvm_pte_t annotation; int ret; + if (owner_id == PKVM_ID_HOST) + return -EINVAL; + if (!range_is_memory(addr, addr + size)) return -EPERM; - ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt, - addr, size, &host_s2_pool, owner_id); - if (ret) - return ret; + if (!FIELD_FIT(KVM_HOST_DONATION_PTE_OWNER_MASK, owner_id)) + return -EINVAL; - /* Don't forget to update the vmemmap tracking for the host */ - if (owner_id == PKVM_ID_HOST) - __host_update_page_state(addr, size, PKVM_PAGE_OWNED); - else + if (!FIELD_FIT(KVM_HOST_DONATION_PTE_EXTRA_MASK, meta)) + return -EINVAL; + + annotation = FIELD_PREP(KVM_HOST_DONATION_PTE_OWNER_MASK, owner_id) | + FIELD_PREP(KVM_HOST_DONATION_PTE_EXTRA_MASK, meta); + ret = host_stage2_try(kvm_pgtable_stage2_annotate, &host_mmu.pgt, + addr, size, &host_s2_pool, + KVM_HOST_INVALID_PTE_TYPE_DONATION, annotation); + if (!ret) __host_update_page_state(addr, size, PKVM_NOPAGE); + return ret; +} + +int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) +{ + int ret = -EINVAL; + + switch (owner_id) { + case PKVM_ID_HOST: + if (!range_is_memory(addr, addr + size)) + return -EPERM; + + ret = host_stage2_idmap_locked(addr, size, PKVM_HOST_MEM_PROT); + if (!ret) + __host_update_page_state(addr, size, PKVM_PAGE_OWNED); + break; + case PKVM_ID_HYP: + ret = host_stage2_set_owner_metadata_locked(addr, size, + owner_id, 0); + break; + } + + return ret; +} + +#define KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK GENMASK(15, 0) +/* We need 40 bits for the GFN to cover a 52-bit IPA with 4k pages and LPA2 */ +#define KVM_HOST_PTE_OWNER_GUEST_GFN_MASK GENMASK(55, 16) +static u64 host_stage2_encode_gfn_meta(struct pkvm_hyp_vm *vm, u64 gfn) +{ + pkvm_handle_t handle = vm->kvm.arch.pkvm.handle; + + BUILD_BUG_ON((pkvm_handle_t)-1 > KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK); + 
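/*
 * Illustration only, not part of the patch: the owner metadata packs a
 * 16-bit VM handle and a 40-bit GFN into the software-defined bits of
 * an invalid host stage-2 PTE. Stand-alone encode/decode round-trip
 * using the same layout (handle in bits [15:0], GFN in bits [55:16]):
 */
#include <stdint.h>

#define META_HANDLE_MASK	0xffffull
#define META_GFN_SHIFT		16
#define META_GFN_MASK		(0xffffffffffull << META_GFN_SHIFT)

static uint64_t meta_encode(uint16_t handle, uint64_t gfn)
{
	return (handle & META_HANDLE_MASK) |
	       ((gfn << META_GFN_SHIFT) & META_GFN_MASK);
}

static void meta_decode(uint64_t meta, uint16_t *handle, uint64_t *gfn)
{
	*handle = meta & META_HANDLE_MASK;
	*gfn = (meta & META_GFN_MASK) >> META_GFN_SHIFT;
}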
WARN_ON(!FIELD_FIT(KVM_HOST_PTE_OWNER_GUEST_GFN_MASK, gfn)); + + return FIELD_PREP(KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK, handle) | + FIELD_PREP(KVM_HOST_PTE_OWNER_GUEST_GFN_MASK, gfn); +} + +static int host_stage2_decode_gfn_meta(kvm_pte_t pte, struct pkvm_hyp_vm **vm, + u64 *gfn) +{ + pkvm_handle_t handle; + u64 meta; + + if (WARN_ON(kvm_pte_valid(pte))) + return -EINVAL; + + if (FIELD_GET(KVM_INVALID_PTE_TYPE_MASK, pte) != + KVM_HOST_INVALID_PTE_TYPE_DONATION) { + return -EINVAL; + } + + if (FIELD_GET(KVM_HOST_DONATION_PTE_OWNER_MASK, pte) != PKVM_ID_GUEST) + return -EPERM; + + meta = FIELD_GET(KVM_HOST_DONATION_PTE_EXTRA_MASK, pte); + handle = FIELD_GET(KVM_HOST_PTE_OWNER_GUEST_HANDLE_MASK, meta); + *vm = get_vm_by_handle(handle); + if (!*vm) { + /* We probably raced with teardown; try again */ + return -EAGAIN; + } + + *gfn = FIELD_GET(KVM_HOST_PTE_OWNER_GUEST_GFN_MASK, meta); return 0; } @@ -605,11 +688,43 @@ unlock: return ret; } +static void host_inject_mem_abort(struct kvm_cpu_context *host_ctxt) +{ + u64 ec, esr, spsr; + + esr = read_sysreg_el2(SYS_ESR); + spsr = read_sysreg_el2(SYS_SPSR); + + /* Repaint the ESR to report a same-level fault if taken from EL1 */ + if ((spsr & PSR_MODE_MASK) != PSR_MODE_EL0t) { + ec = ESR_ELx_EC(esr); + if (ec == ESR_ELx_EC_DABT_LOW) + ec = ESR_ELx_EC_DABT_CUR; + else if (ec == ESR_ELx_EC_IABT_LOW) + ec = ESR_ELx_EC_IABT_CUR; + else + WARN_ON(1); + esr &= ~ESR_ELx_EC_MASK; + esr |= ec << ESR_ELx_EC_SHIFT; + } + + /* + * Since S1PTW should only ever be set for stage-2 faults, we're pretty + * much guaranteed that it won't be set in ESR_EL1 by the hardware. So, + * let's use that bit to allow the host abort handler to differentiate + * this abort from normal userspace faults. + * + * Note: although S1PTW is RES0 at EL1, it is guaranteed by the + * architecture to be backed by flops, so it should be safe to use. 
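/*
 * Illustration only, not part of the patch: repainting a lower-EL
 * abort as a same-level one is a +1 on the ESR exception class, per
 * the architectural encodings (IABT: 0x20 -> 0x21, DABT: 0x24 ->
 * 0x25). Stand-alone sketch of the rewrite done above:
 */
#include <stdint.h>

#define EC_SHIFT	26
#define EC_MASK		(0x3full << EC_SHIFT)
#define EC_IABT_LOW	0x20ull
#define EC_DABT_LOW	0x24ull

static uint64_t esr_to_same_level(uint64_t esr)
{
	uint64_t ec = (esr & EC_MASK) >> EC_SHIFT;

	if (ec == EC_IABT_LOW || ec == EC_DABT_LOW)
		ec += 1;	/* ..._LOW becomes ..._CUR */

	return (esr & ~EC_MASK) | (ec << EC_SHIFT);
}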
+ */ + esr |= ESR_ELx_S1PTW; + inject_host_exception(esr); +} + void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) { struct kvm_vcpu_fault_info fault; u64 esr, addr; - int ret = 0; esr = read_sysreg_el2(SYS_ESR); if (!__get_fault_info(esr, &fault)) { @@ -628,8 +743,16 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) BUG_ON(!(fault.hpfar_el2 & HPFAR_EL2_NS)); addr = FIELD_GET(HPFAR_EL2_FIPA, fault.hpfar_el2) << 12; - ret = host_stage2_idmap(addr); - BUG_ON(ret && ret != -EAGAIN); + switch (host_stage2_idmap(addr)) { + case -EPERM: + host_inject_mem_abort(host_ctxt); + fallthrough; + case -EEXIST: + case 0: + break; + default: + BUG(); + } } struct check_walk_data { @@ -707,8 +830,20 @@ static int __hyp_check_page_state_range(phys_addr_t phys, u64 size, enum pkvm_pa return 0; } +static bool guest_pte_is_poisoned(kvm_pte_t pte) +{ + if (kvm_pte_valid(pte)) + return false; + + return FIELD_GET(KVM_INVALID_PTE_TYPE_MASK, pte) == + KVM_GUEST_INVALID_PTE_TYPE_POISONED; +} + static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte, u64 addr) { + if (guest_pte_is_poisoned(pte)) + return PKVM_POISON; + if (!kvm_pte_valid(pte)) return PKVM_NOPAGE; @@ -727,6 +862,77 @@ static int __guest_check_page_state_range(struct pkvm_hyp_vm *vm, u64 addr, return check_page_state_range(&vm->pgt, addr, size, &d); } +static int get_valid_guest_pte(struct pkvm_hyp_vm *vm, u64 ipa, kvm_pte_t *ptep, u64 *physp) +{ + kvm_pte_t pte; + u64 phys; + s8 level; + int ret; + + ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level); + if (ret) + return ret; + if (guest_pte_is_poisoned(pte)) + return -EHWPOISON; + if (!kvm_pte_valid(pte)) + return -ENOENT; + if (level != KVM_PGTABLE_LAST_LEVEL) + return -E2BIG; + + phys = kvm_pte_to_phys(pte); + ret = check_range_allowed_memory(phys, phys + PAGE_SIZE); + if (WARN_ON(ret)) + return ret; + + *ptep = pte; + *physp = phys; + + return 0; +} + +int __pkvm_vcpu_in_poison_fault(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + kvm_pte_t pte; + s8 level; + u64 ipa; + int ret; + + switch (kvm_vcpu_trap_get_class(&hyp_vcpu->vcpu)) { + case ESR_ELx_EC_DABT_LOW: + case ESR_ELx_EC_IABT_LOW: + if (kvm_vcpu_trap_is_translation_fault(&hyp_vcpu->vcpu)) + break; + fallthrough; + default: + return -EINVAL; + } + + /* + * The host has the faulting IPA when it calls us from the guest + * fault handler but we retrieve it ourselves from the FAR so as + * to avoid exposing an "oracle" that could reveal data access + * patterns of the guest after initial donation of its pages. 
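/*
 * Illustration only, not part of the patch: HPFAR_EL2 reports the
 * faulting IPA at page granularity, so the page-offset bits have to
 * come from FAR_EL2, which is what the FAR_TO_FIPA_OFFSET()
 * combination just below boils down to. Stand-alone sketch assuming
 * 4KiB pages:
 */
#include <stdint.h>

#define PG_SHIFT	12
#define PG_MASK		(~((1ull << PG_SHIFT) - 1))

static uint64_t fault_ipa(uint64_t hpfar_page_ipa, uint64_t far)
{
	/* page frame from HPFAR, byte offset within the page from FAR */
	return (hpfar_page_ipa & PG_MASK) | (far & ~PG_MASK);
}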
+ */ + ipa = kvm_vcpu_get_fault_ipa(&hyp_vcpu->vcpu); + ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(&hyp_vcpu->vcpu)); + + guest_lock_component(vm); + ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level); + if (ret) + goto unlock; + + if (level != KVM_PGTABLE_LAST_LEVEL) { + ret = -EINVAL; + goto unlock; + } + + ret = guest_pte_is_poisoned(pte); +unlock: + guest_unlock_component(vm); + return ret; +} + int __pkvm_host_share_hyp(u64 pfn) { u64 phys = hyp_pfn_to_phys(pfn); @@ -753,6 +959,72 @@ unlock: return ret; } +int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 phys, ipa = hyp_pfn_to_phys(gfn); + kvm_pte_t pte; + int ret; + + host_lock_component(); + guest_lock_component(vm); + + ret = get_valid_guest_pte(vm, ipa, &pte, &phys); + if (ret) + goto unlock; + + ret = -EPERM; + if (pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)) != PKVM_PAGE_OWNED) + goto unlock; + if (__host_check_page_state_range(phys, PAGE_SIZE, PKVM_NOPAGE)) + goto unlock; + + ret = 0; + WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys, + pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_SHARED_OWNED), + &vcpu->vcpu.arch.pkvm_memcache, 0)); + WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_BORROWED)); +unlock: + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} + +int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *vcpu, u64 gfn) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 meta, phys, ipa = hyp_pfn_to_phys(gfn); + kvm_pte_t pte; + int ret; + + host_lock_component(); + guest_lock_component(vm); + + ret = get_valid_guest_pte(vm, ipa, &pte, &phys); + if (ret) + goto unlock; + + ret = -EPERM; + if (pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)) != PKVM_PAGE_SHARED_OWNED) + goto unlock; + if (__host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_BORROWED)) + goto unlock; + + ret = 0; + meta = host_stage2_encode_gfn_meta(vm, gfn); + WARN_ON(host_stage2_set_owner_metadata_locked(phys, PAGE_SIZE, + PKVM_ID_GUEST, meta)); + WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys, + pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_OWNED), + &vcpu->vcpu.arch.pkvm_memcache, 0)); +unlock: + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} + int __pkvm_host_unshare_hyp(u64 pfn) { u64 phys = hyp_pfn_to_phys(pfn); @@ -960,6 +1232,176 @@ static int __guest_check_transition_size(u64 phys, u64 ipa, u64 nr_pages, u64 *s return 0; } +static void hyp_poison_page(phys_addr_t phys) +{ + void *addr = hyp_fixmap_map(phys); + + memset(addr, 0, PAGE_SIZE); + /* + * Prefer kvm_flush_dcache_to_poc() over __clean_dcache_guest_page() + * here as the latter may elide the CMO under the assumption that FWB + * will be enabled on CPUs that support it. This is incorrect for the + * host stage-2 and would otherwise lead to a malicious host potentially + * being able to read the contents of newly reclaimed guest pages. + */ + kvm_flush_dcache_to_poc(addr, PAGE_SIZE); + hyp_fixmap_unmap(); +} + +static int host_stage2_get_guest_info(phys_addr_t phys, struct pkvm_hyp_vm **vm, + u64 *gfn) +{ + enum pkvm_page_state state; + kvm_pte_t pte; + s8 level; + int ret; + + if (!addr_is_memory(phys)) + return -EFAULT; + + state = get_host_state(hyp_phys_to_page(phys)); + switch (state) { + case PKVM_PAGE_OWNED: + case PKVM_PAGE_SHARED_OWNED: + case PKVM_PAGE_SHARED_BORROWED: + /* The access should no longer fault; try again. 
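+		 * Another CPU may have transitioned the page back to a
+		 * host-accessible state between the fault and this check,
+		 * in which case retrying the access is all that is needed.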
*/ + return -EAGAIN; + case PKVM_NOPAGE: + break; + default: + return -EPERM; + } + + ret = kvm_pgtable_get_leaf(&host_mmu.pgt, phys, &pte, &level); + if (ret) + return ret; + + if (WARN_ON(level != KVM_PGTABLE_LAST_LEVEL)) + return -EINVAL; + + return host_stage2_decode_gfn_meta(pte, vm, gfn); +} + +int __pkvm_host_force_reclaim_page_guest(phys_addr_t phys) +{ + struct pkvm_hyp_vm *vm; + u64 gfn, ipa, pa; + kvm_pte_t pte; + int ret; + + phys &= PAGE_MASK; + + hyp_spin_lock(&vm_table_lock); + host_lock_component(); + + ret = host_stage2_get_guest_info(phys, &vm, &gfn); + if (ret) + goto unlock_host; + + ipa = hyp_pfn_to_phys(gfn); + guest_lock_component(vm); + ret = get_valid_guest_pte(vm, ipa, &pte, &pa); + if (ret) + goto unlock_guest; + + WARN_ON(pa != phys); + if (guest_get_page_state(pte, ipa) != PKVM_PAGE_OWNED) { + ret = -EPERM; + goto unlock_guest; + } + + /* We really shouldn't be allocating, so don't pass a memcache */ + ret = kvm_pgtable_stage2_annotate(&vm->pgt, ipa, PAGE_SIZE, NULL, + KVM_GUEST_INVALID_PTE_TYPE_POISONED, + 0); + if (ret) + goto unlock_guest; + + hyp_poison_page(phys); + WARN_ON(host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HOST)); +unlock_guest: + guest_unlock_component(vm); +unlock_host: + host_unlock_component(); + hyp_spin_unlock(&vm_table_lock); + + return ret; +} + +int __pkvm_host_reclaim_page_guest(u64 gfn, struct pkvm_hyp_vm *vm) +{ + u64 ipa = hyp_pfn_to_phys(gfn); + kvm_pte_t pte; + u64 phys; + int ret; + + host_lock_component(); + guest_lock_component(vm); + + ret = get_valid_guest_pte(vm, ipa, &pte, &phys); + if (ret) + goto unlock; + + switch (guest_get_page_state(pte, ipa)) { + case PKVM_PAGE_OWNED: + WARN_ON(__host_check_page_state_range(phys, PAGE_SIZE, PKVM_NOPAGE)); + hyp_poison_page(phys); + break; + case PKVM_PAGE_SHARED_OWNED: + WARN_ON(__host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_BORROWED)); + break; + default: + ret = -EPERM; + goto unlock; + } + + WARN_ON(kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE)); + WARN_ON(host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HOST)); + +unlock: + guest_unlock_component(vm); + host_unlock_component(); + + /* + * -EHWPOISON implies that the page was forcefully reclaimed already + * so return success for the GUP pin to be dropped. + */ + return ret && ret != -EHWPOISON ? 
ret : 0; +} + +int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 phys = hyp_pfn_to_phys(pfn); + u64 ipa = hyp_pfn_to_phys(gfn); + u64 meta; + int ret; + + host_lock_component(); + guest_lock_component(vm); + + ret = __host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + + ret = __guest_check_page_state_range(vm, ipa, PAGE_SIZE, PKVM_NOPAGE); + if (ret) + goto unlock; + + meta = host_stage2_encode_gfn_meta(vm, gfn); + WARN_ON(host_stage2_set_owner_metadata_locked(phys, PAGE_SIZE, + PKVM_ID_GUEST, meta)); + WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys, + pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_OWNED), + &vcpu->vcpu.arch.pkvm_memcache, 0)); + +unlock: + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} + int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot) { @@ -1206,53 +1648,18 @@ struct pkvm_expected_state { static struct pkvm_expected_state selftest_state; static struct hyp_page *selftest_page; - -static struct pkvm_hyp_vm selftest_vm = { - .kvm = { - .arch = { - .mmu = { - .arch = &selftest_vm.kvm.arch, - .pgt = &selftest_vm.pgt, - }, - }, - }, -}; - -static struct pkvm_hyp_vcpu selftest_vcpu = { - .vcpu = { - .arch = { - .hw_mmu = &selftest_vm.kvm.arch.mmu, - }, - .kvm = &selftest_vm.kvm, - }, -}; - -static void init_selftest_vm(void *virt) -{ - struct hyp_page *p = hyp_virt_to_page(virt); - int i; - - selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; - WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt)); - - for (i = 0; i < pkvm_selftest_pages(); i++) { - if (p[i].refcount) - continue; - p[i].refcount = 1; - hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i])); - } -} +static struct pkvm_hyp_vcpu *selftest_vcpu; static u64 selftest_ipa(void) { - return BIT(selftest_vm.pgt.ia_bits - 1); + return BIT(selftest_vcpu->vcpu.arch.hw_mmu->pgt->ia_bits - 1); } static void assert_page_state(void) { void *virt = hyp_page_to_virt(selftest_page); u64 size = PAGE_SIZE << selftest_page->order; - struct pkvm_hyp_vcpu *vcpu = &selftest_vcpu; + struct pkvm_hyp_vcpu *vcpu = selftest_vcpu; u64 phys = hyp_virt_to_phys(virt); u64 ipa[2] = { selftest_ipa(), selftest_ipa() + PAGE_SIZE }; struct pkvm_hyp_vm *vm; @@ -1267,10 +1674,10 @@ static void assert_page_state(void) WARN_ON(__hyp_check_page_state_range(phys, size, selftest_state.hyp)); hyp_unlock_component(); - guest_lock_component(&selftest_vm); + guest_lock_component(vm); WARN_ON(__guest_check_page_state_range(vm, ipa[0], size, selftest_state.guest[0])); WARN_ON(__guest_check_page_state_range(vm, ipa[1], size, selftest_state.guest[1])); - guest_unlock_component(&selftest_vm); + guest_unlock_component(vm); } #define assert_transition_res(res, fn, ...) 
\ @@ -1283,14 +1690,15 @@ void pkvm_ownership_selftest(void *base) { enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_RWX; void *virt = hyp_alloc_pages(&host_s2_pool, 0); - struct pkvm_hyp_vcpu *vcpu = &selftest_vcpu; - struct pkvm_hyp_vm *vm = &selftest_vm; + struct pkvm_hyp_vcpu *vcpu; u64 phys, size, pfn, gfn; + struct pkvm_hyp_vm *vm; WARN_ON(!virt); selftest_page = hyp_virt_to_page(virt); selftest_page->refcount = 0; - init_selftest_vm(base); + selftest_vcpu = vcpu = init_selftest_vm(base); + vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); size = PAGE_SIZE << selftest_page->order; phys = hyp_virt_to_phys(virt); @@ -1309,6 +1717,7 @@ void pkvm_ownership_selftest(void *base) assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); selftest_state.host = PKVM_PAGE_OWNED; selftest_state.hyp = PKVM_NOPAGE; @@ -1328,6 +1737,7 @@ void pkvm_ownership_selftest(void *base) assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size); assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size); @@ -1340,6 +1750,7 @@ void pkvm_ownership_selftest(void *base) assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); hyp_unpin_shared_mem(virt, virt + size); assert_page_state(); @@ -1359,6 +1770,7 @@ void pkvm_ownership_selftest(void *base) assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); selftest_state.host = PKVM_PAGE_OWNED; @@ -1375,6 +1787,7 @@ void pkvm_ownership_selftest(void *base) assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size); selftest_state.guest[1] = PKVM_PAGE_SHARED_BORROWED; @@ -1388,10 +1801,70 @@ void pkvm_ownership_selftest(void *base) selftest_state.host = PKVM_PAGE_OWNED; assert_transition_res(0, __pkvm_host_unshare_guest, gfn + 1, 1, vm); + selftest_state.host = PKVM_NOPAGE; + selftest_state.guest[0] = PKVM_PAGE_OWNED; + assert_transition_res(0, __pkvm_host_donate_guest, pfn, gfn, vcpu); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 
1); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + + selftest_state.host = PKVM_PAGE_SHARED_BORROWED; + selftest_state.guest[0] = PKVM_PAGE_SHARED_OWNED; + assert_transition_res(0, __pkvm_guest_share_host, vcpu, gfn); + assert_transition_res(-EPERM, __pkvm_guest_share_host, vcpu, gfn); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + + selftest_state.host = PKVM_NOPAGE; + selftest_state.guest[0] = PKVM_PAGE_OWNED; + assert_transition_res(0, __pkvm_guest_unshare_host, vcpu, gfn); + assert_transition_res(-EPERM, __pkvm_guest_unshare_host, vcpu, gfn); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot); + assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1); + assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn); + assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1); + + selftest_state.host = PKVM_PAGE_OWNED; + selftest_state.guest[0] = PKVM_POISON; + assert_transition_res(0, __pkvm_host_force_reclaim_page_guest, phys); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + assert_transition_res(-EHWPOISON, __pkvm_guest_share_host, vcpu, gfn); + assert_transition_res(-EHWPOISON, __pkvm_guest_unshare_host, vcpu, gfn); + + selftest_state.host = PKVM_NOPAGE; + selftest_state.guest[1] = PKVM_PAGE_OWNED; + assert_transition_res(0, __pkvm_host_donate_guest, pfn, gfn + 1, vcpu); + + selftest_state.host = PKVM_PAGE_OWNED; + selftest_state.guest[1] = PKVM_NOPAGE; + assert_transition_res(0, __pkvm_host_reclaim_page_guest, gfn + 1, vm); + assert_transition_res(-EPERM, __pkvm_host_donate_guest, pfn, gfn, vcpu); + assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot); + selftest_state.host = PKVM_NOPAGE; selftest_state.hyp = PKVM_PAGE_OWNED; assert_transition_res(0, __pkvm_host_donate_hyp, pfn, 1); + teardown_selftest_vm(); selftest_page->refcount = 1; hyp_put_page(&host_s2_pool, virt); } diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 218976287d3f..c98cec0c150f 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -244,7 +244,7 @@ static void *fixmap_map_slot(struct hyp_fixmap_slot *slot, phys_addr_t phys) void *hyp_fixmap_map(phys_addr_t phys) { - return fixmap_map_slot(this_cpu_ptr(&fixmap_slots), phys); + return 
fixmap_map_slot(this_cpu_ptr(&fixmap_slots), phys) + offset_in_page(phys); } static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) @@ -366,7 +366,7 @@ void *hyp_fixblock_map(phys_addr_t phys, size_t *size) #ifdef HAS_FIXBLOCK *size = PMD_SIZE; hyp_spin_lock(&hyp_fixblock_lock); - return fixmap_map_slot(&hyp_fixblock_slot, phys); + return fixmap_map_slot(&hyp_fixblock_slot, phys) + offset_in_page(phys); #else *size = PAGE_SIZE; return hyp_fixmap_map(phys); diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 2f029bfe4755..7ed96d64d611 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -4,6 +4,8 @@ * Author: Fuad Tabba */ +#include + #include #include @@ -222,6 +224,7 @@ static struct pkvm_hyp_vm **vm_table; void pkvm_hyp_vm_table_init(void *tbl) { + BUILD_BUG_ON((u64)HANDLE_OFFSET + KVM_MAX_PVMS > (pkvm_handle_t)-1); WARN_ON(vm_table); vm_table = tbl; } @@ -229,10 +232,12 @@ void pkvm_hyp_vm_table_init(void *tbl) /* * Return the hyp vm structure corresponding to the handle. */ -static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle) +struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle) { unsigned int idx = vm_handle_to_idx(handle); + hyp_assert_lock_held(&vm_table_lock); + if (unlikely(idx >= KVM_MAX_PVMS)) return NULL; @@ -255,7 +260,10 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, hyp_spin_lock(&vm_table_lock); hyp_vm = get_vm_by_handle(handle); - if (!hyp_vm || hyp_vm->kvm.created_vcpus <= vcpu_idx) + if (!hyp_vm || hyp_vm->kvm.arch.pkvm.is_dying) + goto unlock; + + if (hyp_vm->kvm.created_vcpus <= vcpu_idx) goto unlock; hyp_vcpu = hyp_vm->vcpus[vcpu_idx]; @@ -719,6 +727,55 @@ void __pkvm_unreserve_vm(pkvm_handle_t handle) hyp_spin_unlock(&vm_table_lock); } +#ifdef CONFIG_NVHE_EL2_DEBUG +static struct pkvm_hyp_vm selftest_vm = { + .kvm = { + .arch = { + .mmu = { + .arch = &selftest_vm.kvm.arch, + .pgt = &selftest_vm.pgt, + }, + }, + }, +}; + +static struct pkvm_hyp_vcpu selftest_vcpu = { + .vcpu = { + .arch = { + .hw_mmu = &selftest_vm.kvm.arch.mmu, + }, + .kvm = &selftest_vm.kvm, + }, +}; + +struct pkvm_hyp_vcpu *init_selftest_vm(void *virt) +{ + struct hyp_page *p = hyp_virt_to_page(virt); + int i; + + selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; + WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt)); + + for (i = 0; i < pkvm_selftest_pages(); i++) { + if (p[i].refcount) + continue; + p[i].refcount = 1; + hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i])); + } + + selftest_vm.kvm.arch.pkvm.handle = __pkvm_reserve_vm(); + insert_vm_table_entry(selftest_vm.kvm.arch.pkvm.handle, &selftest_vm); + return &selftest_vcpu; +} + +void teardown_selftest_vm(void) +{ + hyp_spin_lock(&vm_table_lock); + remove_vm_table_entry(selftest_vm.kvm.arch.pkvm.handle); + hyp_spin_unlock(&vm_table_lock); +} +#endif /* CONFIG_NVHE_EL2_DEBUG */ + /* * Initialize the hypervisor copy of the VM state using host-donated memory. 
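 * The backing pages are donated by the host, which loses access to them
 * for the lifetime of the VM.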
* @@ -859,7 +916,54 @@ teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size) unmap_donated_memory_noclear(addr, size); } -int __pkvm_teardown_vm(pkvm_handle_t handle) +int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 gfn) +{ + struct pkvm_hyp_vm *hyp_vm = get_pkvm_hyp_vm(handle); + int ret = -EINVAL; + + if (!hyp_vm) + return ret; + + if (hyp_vm->kvm.arch.pkvm.is_dying) + ret = __pkvm_host_reclaim_page_guest(gfn, hyp_vm); + + put_pkvm_hyp_vm(hyp_vm); + return ret; +} + +static struct pkvm_hyp_vm *get_pkvm_unref_hyp_vm_locked(pkvm_handle_t handle) +{ + struct pkvm_hyp_vm *hyp_vm; + + hyp_assert_lock_held(&vm_table_lock); + + hyp_vm = get_vm_by_handle(handle); + if (!hyp_vm || hyp_page_count(hyp_vm)) + return NULL; + + return hyp_vm; +} + +int __pkvm_start_teardown_vm(pkvm_handle_t handle) +{ + struct pkvm_hyp_vm *hyp_vm; + int ret = 0; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_pkvm_unref_hyp_vm_locked(handle); + if (!hyp_vm || hyp_vm->kvm.arch.pkvm.is_dying) { + ret = -EINVAL; + goto unlock; + } + + hyp_vm->kvm.arch.pkvm.is_dying = true; +unlock: + hyp_spin_unlock(&vm_table_lock); + + return ret; +} + +int __pkvm_finalize_teardown_vm(pkvm_handle_t handle) { struct kvm_hyp_memcache *mc, *stage2_mc; struct pkvm_hyp_vm *hyp_vm; @@ -869,14 +973,9 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) int err; hyp_spin_lock(&vm_table_lock); - hyp_vm = get_vm_by_handle(handle); - if (!hyp_vm) { - err = -ENOENT; - goto err_unlock; - } - - if (WARN_ON(hyp_page_count(hyp_vm))) { - err = -EBUSY; + hyp_vm = get_pkvm_unref_hyp_vm_locked(handle); + if (!hyp_vm || !hyp_vm->kvm.arch.pkvm.is_dying) { + err = -EINVAL; goto err_unlock; } @@ -922,3 +1021,121 @@ err_unlock: hyp_spin_unlock(&vm_table_lock); return err; } + +static u64 __pkvm_memshare_page_req(struct kvm_vcpu *vcpu, u64 ipa) +{ + u64 elr; + + /* Fake up a data abort (level 3 translation fault on write) */ + vcpu->arch.fault.esr_el2 = (ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT) | + ESR_ELx_WNR | ESR_ELx_FSC_FAULT | + FIELD_PREP(ESR_ELx_FSC_LEVEL, 3); + + /* Shuffle the IPA around into the HPFAR */ + vcpu->arch.fault.hpfar_el2 = (HPFAR_EL2_NS | (ipa >> 8)) & HPFAR_MASK; + + /* This is a virtual address. 0's good. Let's go with 0. */ + vcpu->arch.fault.far_el2 = 0; + + /* Rewind the ELR so we return to the HVC once the IPA is mapped */ + elr = read_sysreg(elr_el2); + elr -= 4; + write_sysreg(elr, elr_el2); + + return ARM_EXCEPTION_TRAP; +} + +static bool pkvm_memshare_call(u64 *ret, struct kvm_vcpu *vcpu, u64 *exit_code) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + u64 ipa = smccc_get_arg1(vcpu); + + if (!PAGE_ALIGNED(ipa)) + goto out_guest; + + hyp_vcpu = container_of(vcpu, struct pkvm_hyp_vcpu, vcpu); + switch (__pkvm_guest_share_host(hyp_vcpu, hyp_phys_to_pfn(ipa))) { + case 0: + ret[0] = SMCCC_RET_SUCCESS; + goto out_guest; + case -ENOENT: + /* + * Convert the exception into a data abort so that the page + * being shared is mapped into the guest next time. + */ + *exit_code = __pkvm_memshare_page_req(vcpu, ipa); + goto out_host; + } + +out_guest: + return true; +out_host: + return false; +} + +static void pkvm_memunshare_call(u64 *ret, struct kvm_vcpu *vcpu) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + u64 ipa = smccc_get_arg1(vcpu); + + if (!PAGE_ALIGNED(ipa)) + return; + + hyp_vcpu = container_of(vcpu, struct pkvm_hyp_vcpu, vcpu); + if (!__pkvm_guest_unshare_host(hyp_vcpu, hyp_phys_to_pfn(ipa))) + ret[0] = SMCCC_RET_SUCCESS; +} + +/* + * Handler for protected VM HVC calls. 
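+ *
+ * A protected guest issues these as standard SMCCC vendor-hyp calls. As a
+ * sketch of the guest side (where "ipa" stands for the page-aligned guest
+ * address being shared; the unused arguments must be zero, as enforced by
+ * the checks below):
+ *
+ *	struct arm_smccc_res res;
+ *
+ *	arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID,
+ *			     ipa, 0, 0, &res);
+ *	if (res.a0 != SMCCC_RET_SUCCESS)
+ *		return -EPERM;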
+ * + * Returns true if the hypervisor has handled the exit (and control + * should return to the guest) or false if it hasn't (and the handling + * should be performed by the host). + */ +bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 val[4] = { SMCCC_RET_INVALID_PARAMETER }; + bool handled = true; + + switch (smccc_get_function(vcpu)) { + case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: + val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_HYP_MEMINFO); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_SHARE); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_UNSHARE); + break; + case ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID: + if (smccc_get_arg1(vcpu) || + smccc_get_arg2(vcpu) || + smccc_get_arg3(vcpu)) { + break; + } + + val[0] = PAGE_SIZE; + break; + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID: + if (smccc_get_arg2(vcpu) || + smccc_get_arg3(vcpu)) { + break; + } + + handled = pkvm_memshare_call(val, vcpu, exit_code); + break; + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID: + if (smccc_get_arg2(vcpu) || + smccc_get_arg3(vcpu)) { + break; + } + + pkvm_memunshare_call(val, vcpu); + break; + default: + /* Punt everything else back to the host, for now. */ + handled = false; + } + + if (handled) + smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]); + return handled; +} diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index c3e196fb8b18..e20db999e328 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -6,11 +6,12 @@ #include #include +#include #include -#include #include #include +#include #include #include @@ -65,7 +66,7 @@ static unsigned long psci_call(unsigned long fn, unsigned long arg0, { struct arm_smccc_res res; - arm_smccc_1_1_smc(fn, arg0, arg1, arg2, &res); + hyp_smccc_1_1_smc(fn, arg0, arg1, arg2, &res); return res.a0; } @@ -200,30 +201,42 @@ static int psci_system_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt) __hyp_pa(init_params), 0); } -asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on) +static void __noreturn __kvm_host_psci_cpu_entry(unsigned long pc, unsigned long r0) { - struct psci_boot_args *boot_args; - struct kvm_cpu_context *host_ctxt; + struct kvm_cpu_context *host_ctxt = host_data_ptr(host_ctxt); - host_ctxt = host_data_ptr(host_ctxt); + trace_hyp_enter(host_ctxt, HYP_REASON_PSCI); - if (is_cpu_on) - boot_args = this_cpu_ptr(&cpu_on_args); - else - boot_args = this_cpu_ptr(&suspend_args); - - cpu_reg(host_ctxt, 0) = boot_args->r0; - write_sysreg_el2(boot_args->pc, SYS_ELR); - - if (is_cpu_on) - release_boot_args(boot_args); + cpu_reg(host_ctxt, 0) = r0; + write_sysreg_el2(pc, SYS_ELR); write_sysreg_el1(INIT_SCTLR_EL1_MMU_OFF, SYS_SCTLR); write_sysreg(INIT_PSTATE_EL1, SPSR_EL2); + trace_hyp_exit(host_ctxt, HYP_REASON_PSCI); __host_enter(host_ctxt); } +asmlinkage void __noreturn __kvm_host_psci_cpu_on_entry(void) +{ + struct psci_boot_args *boot_args = this_cpu_ptr(&cpu_on_args); + unsigned long pc, r0; + + pc = READ_ONCE(boot_args->pc); + r0 = READ_ONCE(boot_args->r0); + + release_boot_args(boot_args); + + __kvm_host_psci_cpu_entry(pc, r0); +} + +asmlinkage void __noreturn __kvm_host_psci_cpu_resume_entry(void) +{ + struct psci_boot_args *boot_args = this_cpu_ptr(&suspend_args); + + __kvm_host_psci_cpu_entry(boot_args->pc, boot_args->r0); +} + static unsigned long psci_0_1_handler(u64 func_id, struct kvm_cpu_context *host_ctxt) { if (is_psci_0_1(cpu_off, func_id) || is_psci_0_1(migrate, func_id)) diff --git 
a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 90bd014e952f..d8e5b563fd3d 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -341,8 +341,7 @@ out: __host_enter(host_ctxt); } -int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, - unsigned long *per_cpu_base, u32 hyp_va_bits) +int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long *per_cpu_base, u32 hyp_va_bits) { struct kvm_nvhe_init_params *params; void *virt = hyp_phys_to_virt(phys); @@ -355,7 +354,6 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, return -EINVAL; hyp_spin_lock_init(&pkvm_pgd_lock); - hyp_nr_cpus = nr_cpus; ret = divide_memory_pool(virt, size); if (ret) diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c index 5b6eeab1a774..7c832d60d22b 100644 --- a/arch/arm64/kvm/hyp/nvhe/stacktrace.c +++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c @@ -34,7 +34,7 @@ static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc) stacktrace_info->pc = pc; } -#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE +#ifdef CONFIG_PKVM_STACKTRACE #include DEFINE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], pkvm_stacktrace); @@ -134,11 +134,11 @@ static void pkvm_save_backtrace(unsigned long fp, unsigned long pc) unwind(&state, pkvm_save_backtrace_entry, &idx); } -#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */ +#else /* !CONFIG_PKVM_STACKTRACE */ static void pkvm_save_backtrace(unsigned long fp, unsigned long pc) { } -#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */ +#endif /* CONFIG_PKVM_STACKTRACE */ /* * kvm_nvhe_prepare_backtrace - prepare to dump the nVHE backtrace diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 779089e42681..8d1df3d33595 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -7,7 +7,6 @@ #include #include -#include #include #include #include @@ -21,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +44,9 @@ struct fgt_masks hfgwtr2_masks; struct fgt_masks hfgitr2_masks; struct fgt_masks hdfgrtr2_masks; struct fgt_masks hdfgwtr2_masks; +struct fgt_masks ich_hfgrtr_masks; +struct fgt_masks ich_hfgwtr_masks; +struct fgt_masks ich_hfgitr_masks; extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); @@ -110,6 +113,12 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) /* Save VGICv3 state on non-VHE systems */ static void __hyp_vgic_save_state(struct kvm_vcpu *vcpu) { + if (vgic_is_v5(kern_hyp_va(vcpu->kvm))) { + __vgic_v5_save_state(&vcpu->arch.vgic_cpu.vgic_v5); + __vgic_v5_save_ppi_state(&vcpu->arch.vgic_cpu.vgic_v5); + return; + } + if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { __vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3); __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3); @@ -119,6 +128,12 @@ static void __hyp_vgic_save_state(struct kvm_vcpu *vcpu) /* Restore VGICv3 state on non-VHE systems */ static void __hyp_vgic_restore_state(struct kvm_vcpu *vcpu) { + if (vgic_is_v5(kern_hyp_va(vcpu->kvm))) { + __vgic_v5_restore_state(&vcpu->arch.vgic_cpu.vgic_v5); + __vgic_v5_restore_ppi_state(&vcpu->arch.vgic_cpu.vgic_v5); + return; + } + if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3); __vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3); @@ -190,6 +205,7 @@ static const exit_handler_fn hyp_exit_handlers[] = { static const 
exit_handler_fn pvm_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_HVC64] = kvm_handle_pvm_hvc64, [ESR_ELx_EC_SYS64] = kvm_handle_pvm_sys64, [ESR_ELx_EC_SVE] = kvm_handle_pvm_restricted, [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, @@ -278,7 +294,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) * We're about to restore some new MMU state. Make sure * ongoing page-table walks that have started before we * trapped to EL2 have completed. This also synchronises the - * above disabling of BRBE, SPE and TRBE. + * above disabling of BRBE. * * See DDI0487I.a D8.1.5 "Out-of-context translation regimes", * rule R_LFHQG and subsequent information statements. @@ -308,10 +324,13 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __debug_switch_to_guest(vcpu); do { + trace_hyp_exit(host_ctxt, HYP_REASON_ERET_GUEST); + /* Jump in the fire! */ exit_code = __guest_enter(vcpu); /* And we're baaack! */ + trace_hyp_enter(host_ctxt, HYP_REASON_GUEST_EXIT); } while (fixup_guest_exit(vcpu, &exit_code)); __sysreg_save_state_nvhe(guest_ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 06d28621722e..8c3fbb413a06 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -20,6 +20,7 @@ */ u64 id_aa64pfr0_el1_sys_val; u64 id_aa64pfr1_el1_sys_val; +u64 id_aa64pfr2_el1_sys_val; u64 id_aa64isar0_el1_sys_val; u64 id_aa64isar1_el1_sys_val; u64 id_aa64isar2_el1_sys_val; @@ -108,6 +109,11 @@ static const struct pvm_ftr_bits pvmid_aa64pfr1[] = { FEAT_END }; +static const struct pvm_ftr_bits pvmid_aa64pfr2[] = { + MAX_FEAT(ID_AA64PFR2_EL1, GCIE, NI), + FEAT_END +}; + static const struct pvm_ftr_bits pvmid_aa64mmfr0[] = { MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, PARANGE, 40), MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, ASIDBITS, 16), @@ -221,6 +227,8 @@ static u64 pvm_calc_id_reg(const struct kvm_vcpu *vcpu, u32 id) return get_restricted_features(vcpu, id_aa64pfr0_el1_sys_val, pvmid_aa64pfr0); case SYS_ID_AA64PFR1_EL1: return get_restricted_features(vcpu, id_aa64pfr1_el1_sys_val, pvmid_aa64pfr1); + case SYS_ID_AA64PFR2_EL1: + return get_restricted_features(vcpu, id_aa64pfr2_el1_sys_val, pvmid_aa64pfr2); case SYS_ID_AA64ISAR0_EL1: return id_aa64isar0_el1_sys_val; case SYS_ID_AA64ISAR1_EL1: @@ -392,6 +400,14 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* Cache maintenance by set/way operations are restricted. */ /* Debug and Trace Registers are restricted. 
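	 * The RAZ_WI entries below make them read-as-zero / write-ignore for
	 * the protected guest, so no debug state is exchanged with the host.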
*/ + RAZ_WI(SYS_DBGBVRn_EL1(0)), + RAZ_WI(SYS_DBGBCRn_EL1(0)), + RAZ_WI(SYS_DBGWVRn_EL1(0)), + RAZ_WI(SYS_DBGWCRn_EL1(0)), + RAZ_WI(SYS_MDSCR_EL1), + RAZ_WI(SYS_OSLAR_EL1), + RAZ_WI(SYS_OSLSR_EL1), + RAZ_WI(SYS_OSDLR_EL1), /* Group 1 ID registers */ HOST_HANDLED(SYS_REVIDR_EL1), @@ -431,7 +447,7 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* CRm=4 */ AARCH64(SYS_ID_AA64PFR0_EL1), AARCH64(SYS_ID_AA64PFR1_EL1), - ID_UNALLOCATED(4,2), + AARCH64(SYS_ID_AA64PFR2_EL1), ID_UNALLOCATED(4,3), AARCH64(SYS_ID_AA64ZFR0_EL1), ID_UNALLOCATED(4,5), diff --git a/arch/arm64/kvm/hyp/nvhe/trace.c b/arch/arm64/kvm/hyp/nvhe/trace.c new file mode 100644 index 000000000000..a6ca27b18e15 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/trace.c @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Google LLC + * Author: Vincent Donnefort + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "simple_ring_buffer.c" + +static DEFINE_PER_CPU(struct simple_rb_per_cpu, __simple_rbs); + +static struct hyp_trace_buffer { + struct simple_rb_per_cpu __percpu *simple_rbs; + void *bpages_backing_start; + size_t bpages_backing_size; + hyp_spinlock_t lock; +} trace_buffer = { + .simple_rbs = &__simple_rbs, + .lock = __HYP_SPIN_LOCK_UNLOCKED, +}; + +static bool hyp_trace_buffer_loaded(struct hyp_trace_buffer *trace_buffer) +{ + return trace_buffer->bpages_backing_size > 0; +} + +void *tracing_reserve_entry(unsigned long length) +{ + return simple_ring_buffer_reserve(this_cpu_ptr(trace_buffer.simple_rbs), length, + trace_clock()); +} + +void tracing_commit_entry(void) +{ + simple_ring_buffer_commit(this_cpu_ptr(trace_buffer.simple_rbs)); +} + +static int __admit_host_mem(void *start, u64 size) +{ + if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(size) || !size) + return -EINVAL; + + if (!is_protected_kvm_enabled()) + return 0; + + return __pkvm_host_donate_hyp(hyp_virt_to_pfn(start), size >> PAGE_SHIFT); +} + +static void __release_host_mem(void *start, u64 size) +{ + if (!is_protected_kvm_enabled()) + return; + + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(start), size >> PAGE_SHIFT)); +} + +static int hyp_trace_buffer_load_bpage_backing(struct hyp_trace_buffer *trace_buffer, + struct hyp_trace_desc *desc) +{ + void *start = (void *)kern_hyp_va(desc->bpages_backing_start); + size_t size = desc->bpages_backing_size; + int ret; + + ret = __admit_host_mem(start, size); + if (ret) + return ret; + + memset(start, 0, size); + + trace_buffer->bpages_backing_start = start; + trace_buffer->bpages_backing_size = size; + + return 0; +} + +static void hyp_trace_buffer_unload_bpage_backing(struct hyp_trace_buffer *trace_buffer) +{ + void *start = trace_buffer->bpages_backing_start; + size_t size = trace_buffer->bpages_backing_size; + + if (!size) + return; + + memset(start, 0, size); + + __release_host_mem(start, size); + + trace_buffer->bpages_backing_start = 0; + trace_buffer->bpages_backing_size = 0; +} + +static void *__pin_shared_page(unsigned long kern_va) +{ + void *va = kern_hyp_va((void *)kern_va); + + if (!is_protected_kvm_enabled()) + return va; + + return hyp_pin_shared_mem(va, va + PAGE_SIZE) ? 
NULL : va; +} + +static void __unpin_shared_page(void *va) +{ + if (!is_protected_kvm_enabled()) + return; + + hyp_unpin_shared_mem(va, va + PAGE_SIZE); +} + +static void hyp_trace_buffer_unload(struct hyp_trace_buffer *trace_buffer) +{ + int cpu; + + hyp_assert_lock_held(&trace_buffer->lock); + + if (!hyp_trace_buffer_loaded(trace_buffer)) + return; + + for (cpu = 0; cpu < hyp_nr_cpus; cpu++) + simple_ring_buffer_unload_mm(per_cpu_ptr(trace_buffer->simple_rbs, cpu), + __unpin_shared_page); + + hyp_trace_buffer_unload_bpage_backing(trace_buffer); +} + +static int hyp_trace_buffer_load(struct hyp_trace_buffer *trace_buffer, + struct hyp_trace_desc *desc) +{ + struct simple_buffer_page *bpages; + struct ring_buffer_desc *rb_desc; + int ret, cpu; + + hyp_assert_lock_held(&trace_buffer->lock); + + if (hyp_trace_buffer_loaded(trace_buffer)) + return -EINVAL; + + ret = hyp_trace_buffer_load_bpage_backing(trace_buffer, desc); + if (ret) + return ret; + + bpages = trace_buffer->bpages_backing_start; + for_each_ring_buffer_desc(rb_desc, cpu, &desc->trace_buffer_desc) { + ret = simple_ring_buffer_init_mm(per_cpu_ptr(trace_buffer->simple_rbs, cpu), + bpages, rb_desc, __pin_shared_page, + __unpin_shared_page); + if (ret) + break; + + bpages += rb_desc->nr_page_va; + } + + if (ret) + hyp_trace_buffer_unload(trace_buffer); + + return ret; +} + +static bool hyp_trace_desc_validate(struct hyp_trace_desc *desc, size_t desc_size) +{ + struct ring_buffer_desc *rb_desc; + unsigned int cpu; + size_t nr_bpages; + void *desc_end; + + /* + * Both desc_size and bpages_backing_size are untrusted host-provided + * values. We rely on __pkvm_host_donate_hyp() to enforce their validity. + */ + desc_end = (void *)desc + desc_size; + nr_bpages = desc->bpages_backing_size / sizeof(struct simple_buffer_page); + + for_each_ring_buffer_desc(rb_desc, cpu, &desc->trace_buffer_desc) { + /* Can we read nr_page_va? */ + if ((void *)rb_desc + struct_size(rb_desc, page_va, 0) > desc_end) + return false; + + /* Overflow desc? */ + if ((void *)rb_desc + struct_size(rb_desc, page_va, rb_desc->nr_page_va) > desc_end) + return false; + + /* Overflow bpages backing memory? */ + if (nr_bpages < rb_desc->nr_page_va) + return false; + + if (cpu >= hyp_nr_cpus) + return false; + + if (cpu != rb_desc->cpu) + return false; + + nr_bpages -= rb_desc->nr_page_va; + } + + return true; +} + +int __tracing_load(unsigned long desc_hva, size_t desc_size) +{ + struct hyp_trace_desc *desc = (struct hyp_trace_desc *)kern_hyp_va(desc_hva); + int ret; + + ret = __admit_host_mem(desc, desc_size); + if (ret) + return ret; + + if (!hyp_trace_desc_validate(desc, desc_size)) + goto err_release_desc; + + hyp_spin_lock(&trace_buffer.lock); + + ret = hyp_trace_buffer_load(&trace_buffer, desc); + + hyp_spin_unlock(&trace_buffer.lock); + +err_release_desc: + __release_host_mem(desc, desc_size); + return ret; +} + +void __tracing_unload(void) +{ + hyp_spin_lock(&trace_buffer.lock); + hyp_trace_buffer_unload(&trace_buffer); + hyp_spin_unlock(&trace_buffer.lock); +} + +int __tracing_enable(bool enable) +{ + int cpu, ret = enable ? 
-EINVAL : 0; + + hyp_spin_lock(&trace_buffer.lock); + + if (!hyp_trace_buffer_loaded(&trace_buffer)) + goto unlock; + + for (cpu = 0; cpu < hyp_nr_cpus; cpu++) + simple_ring_buffer_enable_tracing(per_cpu_ptr(trace_buffer.simple_rbs, cpu), + enable); + + ret = 0; + +unlock: + hyp_spin_unlock(&trace_buffer.lock); + + return ret; +} + +int __tracing_swap_reader(unsigned int cpu) +{ + int ret = -ENODEV; + + if (cpu >= hyp_nr_cpus) + return -EINVAL; + + hyp_spin_lock(&trace_buffer.lock); + + if (hyp_trace_buffer_loaded(&trace_buffer)) + ret = simple_ring_buffer_swap_reader_page( + per_cpu_ptr(trace_buffer.simple_rbs, cpu)); + + hyp_spin_unlock(&trace_buffer.lock); + + return ret; +} + +void __tracing_update_clock(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc) +{ + int cpu; + + /* After this loop, all CPUs are observing the new bank... */ + for (cpu = 0; cpu < hyp_nr_cpus; cpu++) { + struct simple_rb_per_cpu *simple_rb = per_cpu_ptr(trace_buffer.simple_rbs, cpu); + + while (READ_ONCE(simple_rb->status) == SIMPLE_RB_WRITING) + ; + } + + /* ...we can now override the old one and swap. */ + trace_clock_update(mult, shift, epoch_ns, epoch_cyc); +} + +int __tracing_reset(unsigned int cpu) +{ + int ret = -ENODEV; + + if (cpu >= hyp_nr_cpus) + return -EINVAL; + + hyp_spin_lock(&trace_buffer.lock); + + if (hyp_trace_buffer_loaded(&trace_buffer)) + ret = simple_ring_buffer_reset(per_cpu_ptr(trace_buffer.simple_rbs, cpu)); + + hyp_spin_unlock(&trace_buffer.lock); + + return ret; +} diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 9b480f947da2..84c7a1df845d 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -114,11 +114,6 @@ static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, s8 level) return pte; } -static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id) -{ - return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id); -} - static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) @@ -581,7 +576,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) struct stage2_map_data { const u64 phys; kvm_pte_t attr; - u8 owner_id; + kvm_pte_t pte_annot; kvm_pte_t *anchor; kvm_pte_t *childp; @@ -798,7 +793,11 @@ static bool stage2_pte_is_counted(kvm_pte_t pte) static bool stage2_pte_is_locked(kvm_pte_t pte) { - return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED); + if (kvm_pte_valid(pte)) + return false; + + return FIELD_GET(KVM_INVALID_PTE_TYPE_MASK, pte) == + KVM_INVALID_PTE_TYPE_LOCKED; } static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new) @@ -829,6 +828,7 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + kvm_pte_t locked_pte; if (stage2_pte_is_locked(ctx->old)) { /* @@ -839,7 +839,9 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx, return false; } - if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED)) + locked_pte = FIELD_PREP(KVM_INVALID_PTE_TYPE_MASK, + KVM_INVALID_PTE_TYPE_LOCKED); + if (!stage2_try_set_pte(ctx, locked_pte)) return false; if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) { @@ -964,7 +966,7 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, if (!data->annotation) new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); else - new = kvm_init_invalid_leaf_owner(data->owner_id); + new = data->pte_annot; /* * Skip updating the PTE if we 
are trying to recreate the exact @@ -1118,16 +1120,18 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, return ret; } -int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, - void *mc, u8 owner_id) +int kvm_pgtable_stage2_annotate(struct kvm_pgtable *pgt, u64 addr, u64 size, + void *mc, enum kvm_invalid_pte_type type, + kvm_pte_t pte_annot) { int ret; struct stage2_map_data map_data = { .mmu = pgt->mmu, .memcache = mc, - .owner_id = owner_id, .force_pte = true, .annotation = true, + .pte_annot = pte_annot | + FIELD_PREP(KVM_INVALID_PTE_TYPE_MASK, type), }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, @@ -1136,7 +1140,10 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, .arg = &map_data, }; - if (owner_id > KVM_MAX_OWNER_ID) + if (pte_annot & ~KVM_INVALID_PTE_ANNOT_MASK) + return -EINVAL; + + if (!type || type == KVM_INVALID_PTE_TYPE_LOCKED) return -EINVAL; ret = kvm_pgtable_walk(pgt, addr, size, &walker); diff --git a/arch/arm64/kvm/hyp/vgic-v5-sr.c b/arch/arm64/kvm/hyp/vgic-v5-sr.c new file mode 100644 index 000000000000..47e6bcd43702 --- /dev/null +++ b/arch/arm64/kvm/hyp/vgic-v5-sr.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025, 2026 - Arm Ltd + */ + +#include + +#include + +void __vgic_v5_save_apr(struct vgic_v5_cpu_if *cpu_if) +{ + cpu_if->vgic_apr = read_sysreg_s(SYS_ICH_APR_EL2); +} + +static void __vgic_v5_compat_mode_disable(void) +{ + sysreg_clear_set_s(SYS_ICH_VCTLR_EL2, ICH_VCTLR_EL2_V3, 0); + isb(); +} + +void __vgic_v5_restore_vmcr_apr(struct vgic_v5_cpu_if *cpu_if) +{ + __vgic_v5_compat_mode_disable(); + + write_sysreg_s(cpu_if->vgic_vmcr, SYS_ICH_VMCR_EL2); + write_sysreg_s(cpu_if->vgic_apr, SYS_ICH_APR_EL2); +} + +void __vgic_v5_save_ppi_state(struct vgic_v5_cpu_if *cpu_if) +{ + /* + * The following code assumes that the bitmap storage that we have for + * PPIs is either 64 (architected PPIs, only) or 128 bits (architected & + * impdef PPIs). 
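+	 *
+	 * bitmap_write() and bitmap_read() then move whole 64-bit register
+	 * values in and out of that storage, e.g. bitmap_write(map, val, 0, 64)
+	 * stores all 64 bits of val at the start of the map.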
+ */ + BUILD_BUG_ON(VGIC_V5_NR_PRIVATE_IRQS % 64); + + bitmap_write(host_data_ptr(vgic_v5_ppi_state)->activer_exit, + read_sysreg_s(SYS_ICH_PPI_ACTIVER0_EL2), 0, 64); + bitmap_write(host_data_ptr(vgic_v5_ppi_state)->pendr, + read_sysreg_s(SYS_ICH_PPI_PENDR0_EL2), 0, 64); + + cpu_if->vgic_ppi_priorityr[0] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR0_EL2); + cpu_if->vgic_ppi_priorityr[1] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR1_EL2); + cpu_if->vgic_ppi_priorityr[2] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR2_EL2); + cpu_if->vgic_ppi_priorityr[3] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR3_EL2); + cpu_if->vgic_ppi_priorityr[4] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR4_EL2); + cpu_if->vgic_ppi_priorityr[5] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR5_EL2); + cpu_if->vgic_ppi_priorityr[6] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR6_EL2); + cpu_if->vgic_ppi_priorityr[7] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR7_EL2); + + if (VGIC_V5_NR_PRIVATE_IRQS == 128) { + bitmap_write(host_data_ptr(vgic_v5_ppi_state)->activer_exit, + read_sysreg_s(SYS_ICH_PPI_ACTIVER1_EL2), 64, 64); + bitmap_write(host_data_ptr(vgic_v5_ppi_state)->pendr, + read_sysreg_s(SYS_ICH_PPI_PENDR1_EL2), 64, 64); + + cpu_if->vgic_ppi_priorityr[8] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR8_EL2); + cpu_if->vgic_ppi_priorityr[9] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR9_EL2); + cpu_if->vgic_ppi_priorityr[10] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR10_EL2); + cpu_if->vgic_ppi_priorityr[11] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR11_EL2); + cpu_if->vgic_ppi_priorityr[12] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR12_EL2); + cpu_if->vgic_ppi_priorityr[13] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR13_EL2); + cpu_if->vgic_ppi_priorityr[14] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR14_EL2); + cpu_if->vgic_ppi_priorityr[15] = read_sysreg_s(SYS_ICH_PPI_PRIORITYR15_EL2); + } + + /* Now that we are done, disable DVI */ + write_sysreg_s(0, SYS_ICH_PPI_DVIR0_EL2); + write_sysreg_s(0, SYS_ICH_PPI_DVIR1_EL2); +} + +void __vgic_v5_restore_ppi_state(struct vgic_v5_cpu_if *cpu_if) +{ + DECLARE_BITMAP(pendr, VGIC_V5_NR_PRIVATE_IRQS); + + /* We assume 64 or 128 PPIs - see above comment */ + BUILD_BUG_ON(VGIC_V5_NR_PRIVATE_IRQS % 64); + + /* Enable DVI so that the guest's interrupt config takes over */ + write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_dvir, 0, 64), + SYS_ICH_PPI_DVIR0_EL2); + + write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_activer, 0, 64), + SYS_ICH_PPI_ACTIVER0_EL2); + write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_enabler, 0, 64), + SYS_ICH_PPI_ENABLER0_EL2); + + /* Update the pending state of the NON-DVI'd PPIs, only */ + bitmap_andnot(pendr, host_data_ptr(vgic_v5_ppi_state)->pendr, + cpu_if->vgic_ppi_dvir, VGIC_V5_NR_PRIVATE_IRQS); + write_sysreg_s(bitmap_read(pendr, 0, 64), SYS_ICH_PPI_PENDR0_EL2); + + write_sysreg_s(cpu_if->vgic_ppi_priorityr[0], + SYS_ICH_PPI_PRIORITYR0_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[1], + SYS_ICH_PPI_PRIORITYR1_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[2], + SYS_ICH_PPI_PRIORITYR2_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[3], + SYS_ICH_PPI_PRIORITYR3_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[4], + SYS_ICH_PPI_PRIORITYR4_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[5], + SYS_ICH_PPI_PRIORITYR5_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[6], + SYS_ICH_PPI_PRIORITYR6_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[7], + SYS_ICH_PPI_PRIORITYR7_EL2); + + if (VGIC_V5_NR_PRIVATE_IRQS == 128) { + /* Enable DVI so that the guest's interrupt config takes over */ + write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_dvir, 64, 64), + 
SYS_ICH_PPI_DVIR1_EL2); + + write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_activer, 64, 64), + SYS_ICH_PPI_ACTIVER1_EL2); + write_sysreg_s(bitmap_read(cpu_if->vgic_ppi_enabler, 64, 64), + SYS_ICH_PPI_ENABLER1_EL2); + write_sysreg_s(bitmap_read(pendr, 64, 64), + SYS_ICH_PPI_PENDR1_EL2); + + write_sysreg_s(cpu_if->vgic_ppi_priorityr[8], + SYS_ICH_PPI_PRIORITYR8_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[9], + SYS_ICH_PPI_PRIORITYR9_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[10], + SYS_ICH_PPI_PRIORITYR10_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[11], + SYS_ICH_PPI_PRIORITYR11_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[12], + SYS_ICH_PPI_PRIORITYR12_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[13], + SYS_ICH_PPI_PRIORITYR13_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[14], + SYS_ICH_PPI_PRIORITYR14_EL2); + write_sysreg_s(cpu_if->vgic_ppi_priorityr[15], + SYS_ICH_PPI_PRIORITYR15_EL2); + } else { + write_sysreg_s(0, SYS_ICH_PPI_DVIR1_EL2); + + write_sysreg_s(0, SYS_ICH_PPI_ACTIVER1_EL2); + write_sysreg_s(0, SYS_ICH_PPI_ENABLER1_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PENDR1_EL2); + + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR8_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR9_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR10_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR11_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR12_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR13_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR14_EL2); + write_sysreg_s(0, SYS_ICH_PPI_PRIORITYR15_EL2); + } +} + +void __vgic_v5_save_state(struct vgic_v5_cpu_if *cpu_if) +{ + cpu_if->vgic_vmcr = read_sysreg_s(SYS_ICH_VMCR_EL2); + cpu_if->vgic_icsr = read_sysreg_s(SYS_ICC_ICSR_EL1); +} + +void __vgic_v5_restore_state(struct vgic_v5_cpu_if *cpu_if) +{ + write_sysreg_s(cpu_if->vgic_icsr, SYS_ICC_ICSR_EL1); +} diff --git a/arch/arm64/kvm/hyp/vhe/Makefile b/arch/arm64/kvm/hyp/vhe/Makefile index afc4aed9231a..9695328bbd96 100644 --- a/arch/arm64/kvm/hyp/vhe/Makefile +++ b/arch/arm64/kvm/hyp/vhe/Makefile @@ -10,4 +10,4 @@ CFLAGS_switch.o += -Wno-override-init obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ - ../fpsimd.o ../hyp-entry.o ../exception.o + ../fpsimd.o ../hyp-entry.o ../exception.o ../vgic-v5-sr.o diff --git a/arch/arm64/kvm/hyp_trace.c b/arch/arm64/kvm/hyp_trace.c new file mode 100644 index 000000000000..8b7f2bf2fba8 --- /dev/null +++ b/arch/arm64/kvm/hyp_trace.c @@ -0,0 +1,442 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Google LLC + * Author: Vincent Donnefort + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "hyp_trace.h" + +/* Same 10min used by clocksource when width is more than 32-bits */ +#define CLOCK_MAX_CONVERSION_S 600 +/* + * Time to give for the clock init. Long enough to get a good mult/shift + * estimation. Short enough to not delay the tracing start too much. + */ +#define CLOCK_INIT_MS 100 +/* + * Time between clock checks. Must be small enough to catch clock deviation when + * it is still tiny. 
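+ *
+ * The conversion itself is plain clocksource arithmetic on the hypervisor
+ * side, ns = epoch_ns + (((cycles - epoch_cyc) * mult) >> shift); the
+ * worker below re-estimates mult/shift against the kernel boot clock and
+ * fast-forwards the epoch before the 64-bit multiplication can overflow.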
+ */ +#define CLOCK_UPDATE_MS 500 + +static struct hyp_trace_clock { + u64 cycles; + u64 cyc_overflow64; + u64 boot; + u32 mult; + u32 shift; + struct delayed_work work; + struct completion ready; + struct mutex lock; + bool running; +} hyp_clock; + +static void __hyp_clock_work(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct hyp_trace_clock *hyp_clock; + struct system_time_snapshot snap; + u64 rate, delta_cycles; + u64 boot, delta_boot; + + hyp_clock = container_of(dwork, struct hyp_trace_clock, work); + + ktime_get_snapshot(&snap); + boot = ktime_to_ns(snap.boot); + + delta_boot = boot - hyp_clock->boot; + delta_cycles = snap.cycles - hyp_clock->cycles; + + /* Compare hyp clock with the kernel boot clock */ + if (hyp_clock->mult) { + u64 err, cur = delta_cycles; + + if (WARN_ON_ONCE(cur >= hyp_clock->cyc_overflow64)) { + __uint128_t tmp = (__uint128_t)cur * hyp_clock->mult; + + cur = tmp >> hyp_clock->shift; + } else { + cur *= hyp_clock->mult; + cur >>= hyp_clock->shift; + } + cur += hyp_clock->boot; + + err = abs_diff(cur, boot); + /* No deviation, only update epoch if necessary */ + if (!err) { + if (delta_cycles >= (hyp_clock->cyc_overflow64 >> 1)) + goto fast_forward; + + goto resched; + } + + /* Warn if the error is above tracing precision (1us) */ + if (err > NSEC_PER_USEC) + pr_warn_ratelimited("hyp trace clock off by %lluus\n", + err / NSEC_PER_USEC); + } + + rate = div64_u64(delta_cycles * NSEC_PER_SEC, delta_boot); + + clocks_calc_mult_shift(&hyp_clock->mult, &hyp_clock->shift, + rate, NSEC_PER_SEC, CLOCK_MAX_CONVERSION_S); + + /* Add a comfortable 50% margin */ + hyp_clock->cyc_overflow64 = (U64_MAX / hyp_clock->mult) >> 1; + +fast_forward: + hyp_clock->cycles = snap.cycles; + hyp_clock->boot = boot; + kvm_call_hyp_nvhe(__tracing_update_clock, hyp_clock->mult, + hyp_clock->shift, hyp_clock->boot, hyp_clock->cycles); + complete(&hyp_clock->ready); + +resched: + schedule_delayed_work(&hyp_clock->work, + msecs_to_jiffies(CLOCK_UPDATE_MS)); +} + +static void hyp_trace_clock_enable(struct hyp_trace_clock *hyp_clock, bool enable) +{ + struct system_time_snapshot snap; + + if (hyp_clock->running == enable) + return; + + if (!enable) { + cancel_delayed_work_sync(&hyp_clock->work); + hyp_clock->running = false; + } + + ktime_get_snapshot(&snap); + + hyp_clock->boot = ktime_to_ns(snap.boot); + hyp_clock->cycles = snap.cycles; + hyp_clock->mult = 0; + + init_completion(&hyp_clock->ready); + INIT_DELAYED_WORK(&hyp_clock->work, __hyp_clock_work); + schedule_delayed_work(&hyp_clock->work, msecs_to_jiffies(CLOCK_INIT_MS)); + wait_for_completion(&hyp_clock->ready); + hyp_clock->running = true; +} + +/* Access to this struct within the trace_remote_callbacks are protected by the trace_remote lock */ +static struct hyp_trace_buffer { + struct hyp_trace_desc *desc; + size_t desc_size; +} trace_buffer; + +static int __map_hyp(void *start, size_t size) +{ + if (is_protected_kvm_enabled()) + return 0; + + return create_hyp_mappings(start, start + size, PAGE_HYP); +} + +static int __share_page(unsigned long va) +{ + return kvm_share_hyp((void *)va, (void *)va + 1); +} + +static void __unshare_page(unsigned long va) +{ + kvm_unshare_hyp((void *)va, (void *)va + 1); +} + +static int hyp_trace_buffer_alloc_bpages_backing(struct hyp_trace_buffer *trace_buffer, size_t size) +{ + int nr_bpages = (PAGE_ALIGN(size) / PAGE_SIZE) + 1; + size_t backing_size; + void *start; + + backing_size = PAGE_ALIGN(sizeof(struct simple_buffer_page) * nr_bpages * + 
num_possible_cpus()); + + start = alloc_pages_exact(backing_size, GFP_KERNEL_ACCOUNT); + if (!start) + return -ENOMEM; + + trace_buffer->desc->bpages_backing_start = (unsigned long)start; + trace_buffer->desc->bpages_backing_size = backing_size; + + return __map_hyp(start, backing_size); +} + +static void hyp_trace_buffer_free_bpages_backing(struct hyp_trace_buffer *trace_buffer) +{ + free_pages_exact((void *)trace_buffer->desc->bpages_backing_start, + trace_buffer->desc->bpages_backing_size); +} + +static void hyp_trace_buffer_unshare_hyp(struct hyp_trace_buffer *trace_buffer, int last_cpu) +{ + struct ring_buffer_desc *rb_desc; + int cpu, p; + + for_each_ring_buffer_desc(rb_desc, cpu, &trace_buffer->desc->trace_buffer_desc) { + if (cpu > last_cpu) + break; + + __share_page(rb_desc->meta_va); + for (p = 0; p < rb_desc->nr_page_va; p++) + __unshare_page(rb_desc->page_va[p]); + } +} + +static int hyp_trace_buffer_share_hyp(struct hyp_trace_buffer *trace_buffer) +{ + struct ring_buffer_desc *rb_desc; + int cpu, p, ret = 0; + + for_each_ring_buffer_desc(rb_desc, cpu, &trace_buffer->desc->trace_buffer_desc) { + ret = __share_page(rb_desc->meta_va); + if (ret) + break; + + for (p = 0; p < rb_desc->nr_page_va; p++) { + ret = __share_page(rb_desc->page_va[p]); + if (ret) + break; + } + + if (ret) { + for (p--; p >= 0; p--) + __unshare_page(rb_desc->page_va[p]); + break; + } + } + + if (ret) + hyp_trace_buffer_unshare_hyp(trace_buffer, cpu--); + + return ret; +} + +static struct trace_buffer_desc *hyp_trace_load(unsigned long size, void *priv) +{ + struct hyp_trace_buffer *trace_buffer = priv; + struct hyp_trace_desc *desc; + size_t desc_size; + int ret; + + if (WARN_ON(trace_buffer->desc)) + return ERR_PTR(-EINVAL); + + desc_size = trace_buffer_desc_size(size, num_possible_cpus()); + if (desc_size == SIZE_MAX) + return ERR_PTR(-E2BIG); + + desc_size = PAGE_ALIGN(desc_size); + desc = (struct hyp_trace_desc *)alloc_pages_exact(desc_size, GFP_KERNEL); + if (!desc) + return ERR_PTR(-ENOMEM); + + ret = __map_hyp(desc, desc_size); + if (ret) + goto err_free_desc; + + trace_buffer->desc = desc; + + ret = hyp_trace_buffer_alloc_bpages_backing(trace_buffer, size); + if (ret) + goto err_free_desc; + + ret = trace_remote_alloc_buffer(&desc->trace_buffer_desc, desc_size, size, + cpu_possible_mask); + if (ret) + goto err_free_backing; + + ret = hyp_trace_buffer_share_hyp(trace_buffer); + if (ret) + goto err_free_buffer; + + ret = kvm_call_hyp_nvhe(__tracing_load, (unsigned long)desc, desc_size); + if (ret) + goto err_unload_pages; + + return &desc->trace_buffer_desc; + +err_unload_pages: + hyp_trace_buffer_unshare_hyp(trace_buffer, INT_MAX); + +err_free_buffer: + trace_remote_free_buffer(&desc->trace_buffer_desc); + +err_free_backing: + hyp_trace_buffer_free_bpages_backing(trace_buffer); + +err_free_desc: + free_pages_exact(desc, desc_size); + trace_buffer->desc = NULL; + + return ERR_PTR(ret); +} + +static void hyp_trace_unload(struct trace_buffer_desc *desc, void *priv) +{ + struct hyp_trace_buffer *trace_buffer = priv; + + if (WARN_ON(desc != &trace_buffer->desc->trace_buffer_desc)) + return; + + kvm_call_hyp_nvhe(__tracing_unload); + hyp_trace_buffer_unshare_hyp(trace_buffer, INT_MAX); + trace_remote_free_buffer(desc); + hyp_trace_buffer_free_bpages_backing(trace_buffer); + free_pages_exact(trace_buffer->desc, trace_buffer->desc_size); + trace_buffer->desc = NULL; +} + +static int hyp_trace_enable_tracing(bool enable, void *priv) +{ + hyp_trace_clock_enable(&hyp_clock, enable); + + return 
+
+static int hyp_trace_swap_reader_page(unsigned int cpu, void *priv)
+{
+	return kvm_call_hyp_nvhe(__tracing_swap_reader, cpu);
+}
+
+static int hyp_trace_reset(unsigned int cpu, void *priv)
+{
+	return kvm_call_hyp_nvhe(__tracing_reset, cpu);
+}
+
+static int hyp_trace_enable_event(unsigned short id, bool enable, void *priv)
+{
+	struct hyp_event_id *event_id = lm_alias(&__hyp_event_ids_start[id]);
+	struct page *page;
+	atomic_t *enabled;
+	void *map;
+
+	if (is_protected_kvm_enabled())
+		return kvm_call_hyp_nvhe(__tracing_enable_event, id, enable);
+
+	enabled = &event_id->enabled;
+	page = virt_to_page(enabled);
+	map = vmap(&page, 1, VM_MAP, PAGE_KERNEL);
+	if (!map)
+		return -ENOMEM;
+
+	enabled = map + offset_in_page(enabled);
+	atomic_set(enabled, enable);
+
+	vunmap(map);
+
+	return 0;
+}
+
+static int hyp_trace_clock_show(struct seq_file *m, void *v)
+{
+	seq_puts(m, "[boot]\n");
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(hyp_trace_clock);
+
+static ssize_t hyp_trace_write_event_write(struct file *f, const char __user *ubuf,
+					   size_t cnt, loff_t *pos)
+{
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+	if (ret)
+		return ret;
+
+	kvm_call_hyp_nvhe(__tracing_write_event, val);
+
+	return cnt;
+}
+
+static const struct file_operations hyp_trace_write_event_fops = {
+	.write	= hyp_trace_write_event_write,
+};
+
+static int hyp_trace_init_tracefs(struct dentry *d, void *priv)
+{
+	if (!tracefs_create_file("write_event", 0200, d, NULL, &hyp_trace_write_event_fops))
+		return -ENOMEM;
+
+	return tracefs_create_file("trace_clock", 0440, d, NULL, &hyp_trace_clock_fops) ?
+	       0 : -ENOMEM;
+}
+
+static struct trace_remote_callbacks trace_remote_callbacks = {
+	.init			= hyp_trace_init_tracefs,
+	.load_trace_buffer	= hyp_trace_load,
+	.unload_trace_buffer	= hyp_trace_unload,
+	.enable_tracing		= hyp_trace_enable_tracing,
+	.swap_reader_page	= hyp_trace_swap_reader_page,
+	.reset			= hyp_trace_reset,
+	.enable_event		= hyp_trace_enable_event,
+};
+
+static const char *__hyp_enter_exit_reason_str(u8 reason);
+
+#include
+
+static const char *__hyp_enter_exit_reason_str(u8 reason)
+{
+	static const char strs[][12] = {
+		"smc",
+		"hvc",
+		"psci",
+		"host_abort",
+		"guest_exit",
+		"eret_host",
+		"eret_guest",
+		"unknown",
+	};
+
+	return strs[min(reason, HYP_REASON_UNKNOWN)];
+}
+
+static void __init hyp_trace_init_events(void)
+{
+	struct hyp_event_id *hyp_event_id = __hyp_event_ids_start;
+	struct remote_event *event = __hyp_events_start;
+	int id = 0;
+
+	/* Events on both sides (kernel and hypervisor) are sorted the same way */
+	for (; event < __hyp_events_end; event++, hyp_event_id++, id++)
+		event->id = hyp_event_id->id = id;
+}
+
+int __init kvm_hyp_trace_init(void)
+{
+	int cpu;
+
+	if (is_kernel_in_hyp_mode())
+		return 0;
+
+	for_each_possible_cpu(cpu) {
+		const struct arch_timer_erratum_workaround *wa =
+			per_cpu(timer_unstable_counter_workaround, cpu);
+
+		if (IS_ENABLED(CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND) &&
+		    wa && wa->read_cntvct_el0) {
+			pr_warn("hyp trace can't handle CNTVCT workaround '%s'\n", wa->desc);
+			return -EOPNOTSUPP;
+		}
+	}
+
+	hyp_trace_init_events();
+
+	return trace_remote_register("hypervisor", &trace_remote_callbacks, &trace_buffer,
+				     __hyp_events_start, __hyp_events_end - __hyp_events_start);
+}
diff --git a/arch/arm64/kvm/hyp_trace.h b/arch/arm64/kvm/hyp_trace.h
new file mode 100644
index 000000000000..c991b1ec65f1
--- /dev/null
+++ b/arch/arm64/kvm/hyp_trace.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier:
GPL-2.0 */ + +#ifndef __ARM64_KVM_HYP_TRACE_H__ +#define __ARM64_KVM_HYP_TRACE_H__ + +#ifdef CONFIG_NVHE_EL2_TRACING +int kvm_hyp_trace_init(void); +#else +static inline int kvm_hyp_trace_init(void) { return 0; } +#endif +#endif diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 17d64a1e11e5..d089c107d9b7 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -340,6 +340,9 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, bool may_block) { + if (kvm_vm_is_protected(kvm_s2_mmu_to_kvm(mmu))) + return; + __unmap_stage2_range(mmu, start, size, may_block); } @@ -878,9 +881,6 @@ static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) u64 mmfr0, mmfr1; u32 phys_shift; - if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) - return -EINVAL; - phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); if (is_protected_kvm_enabled()) { phys_shift = kvm_ipa_limit; @@ -1013,6 +1013,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t out_destroy_pgtable: kvm_stage2_destroy(pgt); + mmu->pgt = NULL; out_free_pgtable: kfree(pgt); return err; @@ -1400,10 +1401,10 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, */ static long transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long hva, kvm_pfn_t *pfnp, - phys_addr_t *ipap) + unsigned long hva, kvm_pfn_t *pfnp, gfn_t *gfnp) { kvm_pfn_t pfn = *pfnp; + gfn_t gfn = *gfnp; /* * Make sure the adjustment is done only for THP pages. Also make @@ -1419,7 +1420,8 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, if (sz < PMD_SIZE) return PAGE_SIZE; - *ipap &= PMD_MASK; + gfn &= ~(PTRS_PER_PMD - 1); + *gfnp = gfn; pfn &= ~(PTRS_PER_PMD - 1); *pfnp = pfn; @@ -1512,25 +1514,22 @@ static bool kvm_vma_is_cacheable(struct vm_area_struct *vma) } } -static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache, - void **memcache) +static void *get_mmu_memcache(struct kvm_vcpu *vcpu) { - int min_pages; - if (!is_protected_kvm_enabled()) - *memcache = &vcpu->arch.mmu_page_cache; + return &vcpu->arch.mmu_page_cache; else - *memcache = &vcpu->arch.pkvm_memcache; + return &vcpu->arch.pkvm_memcache; +} - if (!topup_memcache) - return 0; - - min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); +static int topup_mmu_memcache(struct kvm_vcpu *vcpu, void *memcache) +{ + int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); if (!is_protected_kvm_enabled()) - return kvm_mmu_topup_memory_cache(*memcache, min_pages); + return kvm_mmu_topup_memory_cache(memcache, min_pages); - return topup_hyp_memcache(*memcache, min_pages); + return topup_hyp_memcache(memcache, min_pages); } /* @@ -1543,54 +1542,63 @@ static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache, * TLB invalidation from the guest and used to limit the invalidation scope if a * TTL hint or a range isn't provided. 
*/ -static void adjust_nested_fault_perms(struct kvm_s2_trans *nested, - enum kvm_pgtable_prot *prot, - bool *writable) +static enum kvm_pgtable_prot adjust_nested_fault_perms(struct kvm_s2_trans *nested, + enum kvm_pgtable_prot prot) { - *writable &= kvm_s2_trans_writable(nested); + if (!kvm_s2_trans_writable(nested)) + prot &= ~KVM_PGTABLE_PROT_W; if (!kvm_s2_trans_readable(nested)) - *prot &= ~KVM_PGTABLE_PROT_R; + prot &= ~KVM_PGTABLE_PROT_R; - *prot |= kvm_encode_nested_level(nested); + return prot | kvm_encode_nested_level(nested); } -static void adjust_nested_exec_perms(struct kvm *kvm, - struct kvm_s2_trans *nested, - enum kvm_pgtable_prot *prot) +static enum kvm_pgtable_prot adjust_nested_exec_perms(struct kvm *kvm, + struct kvm_s2_trans *nested, + enum kvm_pgtable_prot prot) { if (!kvm_s2_trans_exec_el0(kvm, nested)) - *prot &= ~KVM_PGTABLE_PROT_UX; + prot &= ~KVM_PGTABLE_PROT_UX; if (!kvm_s2_trans_exec_el1(kvm, nested)) - *prot &= ~KVM_PGTABLE_PROT_PX; + prot &= ~KVM_PGTABLE_PROT_PX; + + return prot; } -static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, - struct kvm_s2_trans *nested, - struct kvm_memory_slot *memslot, bool is_perm) +struct kvm_s2_fault_desc { + struct kvm_vcpu *vcpu; + phys_addr_t fault_ipa; + struct kvm_s2_trans *nested; + struct kvm_memory_slot *memslot; + unsigned long hva; +}; + +static int gmem_abort(const struct kvm_s2_fault_desc *s2fd) { - bool write_fault, exec_fault, writable; + bool write_fault, exec_fault; enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; - struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt; + struct kvm_pgtable *pgt = s2fd->vcpu->arch.hw_mmu->pgt; unsigned long mmu_seq; struct page *page; - struct kvm *kvm = vcpu->kvm; + struct kvm *kvm = s2fd->vcpu->kvm; void *memcache; kvm_pfn_t pfn; gfn_t gfn; int ret; - ret = prepare_mmu_memcache(vcpu, true, &memcache); + memcache = get_mmu_memcache(s2fd->vcpu); + ret = topup_mmu_memcache(s2fd->vcpu, memcache); if (ret) return ret; - if (nested) - gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT; + if (s2fd->nested) + gfn = kvm_s2_trans_output(s2fd->nested) >> PAGE_SHIFT; else - gfn = fault_ipa >> PAGE_SHIFT; + gfn = s2fd->fault_ipa >> PAGE_SHIFT; - write_fault = kvm_is_write_fault(vcpu); - exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); + write_fault = kvm_is_write_fault(s2fd->vcpu); + exec_fault = kvm_vcpu_trap_is_exec_fault(s2fd->vcpu); VM_WARN_ON_ONCE(write_fault && exec_fault); @@ -1598,26 +1606,24 @@ static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). 
*/ smp_rmb(); - ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL); + ret = kvm_gmem_get_pfn(kvm, s2fd->memslot, gfn, &pfn, &page, NULL); if (ret) { - kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE, + kvm_prepare_memory_fault_exit(s2fd->vcpu, s2fd->fault_ipa, PAGE_SIZE, write_fault, exec_fault, false); return ret; } - writable = !(memslot->flags & KVM_MEM_READONLY); - - if (nested) - adjust_nested_fault_perms(nested, &prot, &writable); - - if (writable) + if (!(s2fd->memslot->flags & KVM_MEM_READONLY)) prot |= KVM_PGTABLE_PROT_W; + if (s2fd->nested) + prot = adjust_nested_fault_perms(s2fd->nested, prot); + if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) prot |= KVM_PGTABLE_PROT_X; - if (nested) - adjust_nested_exec_perms(kvm, nested, &prot); + if (s2fd->nested) + prot = adjust_nested_exec_perms(kvm, s2fd->nested, prot); kvm_fault_lock(kvm); if (mmu_invalidate_retry(kvm, mmu_seq)) { @@ -1625,85 +1631,122 @@ static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, goto out_unlock; } - ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE, + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, s2fd->fault_ipa, PAGE_SIZE, __pfn_to_phys(pfn), prot, memcache, flags); out_unlock: - kvm_release_faultin_page(kvm, page, !!ret, writable); + kvm_release_faultin_page(kvm, page, !!ret, prot & KVM_PGTABLE_PROT_W); kvm_fault_unlock(kvm); - if (writable && !ret) - mark_page_dirty_in_slot(kvm, memslot, gfn); + if ((prot & KVM_PGTABLE_PROT_W) && !ret) + mark_page_dirty_in_slot(kvm, s2fd->memslot, gfn); return ret != -EAGAIN ? ret : 0; } -static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, - struct kvm_s2_trans *nested, - struct kvm_memory_slot *memslot, unsigned long hva, - bool fault_is_perm) +struct kvm_s2_fault_vma_info { + unsigned long mmu_seq; + long vma_pagesize; + vm_flags_t vm_flags; + unsigned long max_map_size; + struct page *page; + kvm_pfn_t pfn; + gfn_t gfn; + bool device; + bool mte_allowed; + bool is_vma_cacheable; + bool map_writable; + bool map_non_cacheable; +}; + +static int pkvm_mem_abort(const struct kvm_s2_fault_desc *s2fd) { - int ret = 0; - bool topup_memcache; - bool write_fault, writable; - bool exec_fault, mte_allowed, is_vma_cacheable; - bool s2_force_noncacheable = false, vfio_allow_any_uc = false; - unsigned long mmu_seq; - phys_addr_t ipa = fault_ipa; + unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE; + struct kvm_vcpu *vcpu = s2fd->vcpu; + struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt; + struct mm_struct *mm = current->mm; struct kvm *kvm = vcpu->kvm; - struct vm_area_struct *vma; - short vma_shift; - void *memcache; - gfn_t gfn; - kvm_pfn_t pfn; - bool logging_active = memslot_is_logging(memslot); - bool force_pte = logging_active; - long vma_pagesize, fault_granule; - enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; - struct kvm_pgtable *pgt; + void *hyp_memcache; struct page *page; - vm_flags_t vm_flags; - enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED; + int ret; - if (fault_is_perm) - fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); - write_fault = kvm_is_write_fault(vcpu); - exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); - VM_WARN_ON_ONCE(write_fault && exec_fault); + hyp_memcache = get_mmu_memcache(vcpu); + ret = topup_mmu_memcache(vcpu, hyp_memcache); + if (ret) + return -ENOMEM; - /* - * Permission faults just need to update the existing leaf entry, - * and so normally don't require allocations from the memcache. 
The - * only exception to this is when dirty logging is enabled at runtime - * and a write fault needs to collapse a block entry into a table. - */ - topup_memcache = !fault_is_perm || (logging_active && write_fault); - ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache); + ret = account_locked_vm(mm, 1, true); if (ret) return ret; - /* - * Let's check if we will get back a huge page backed by hugetlbfs, or - * get block mapping for device MMIO region. - */ - mmap_read_lock(current->mm); - vma = vma_lookup(current->mm, hva); - if (unlikely(!vma)) { - kvm_err("Failed to find VMA for hva 0x%lx\n", hva); - mmap_read_unlock(current->mm); - return -EFAULT; + mmap_read_lock(mm); + ret = pin_user_pages(s2fd->hva, 1, flags, &page); + mmap_read_unlock(mm); + + if (ret == -EHWPOISON) { + kvm_send_hwpoison_signal(s2fd->hva, PAGE_SHIFT); + ret = 0; + goto dec_account; + } else if (ret != 1) { + ret = -EFAULT; + goto dec_account; + } else if (!folio_test_swapbacked(page_folio(page))) { + /* + * We really can't deal with page-cache pages returned by GUP + * because (a) we may trigger writeback of a page for which we + * no longer have access and (b) page_mkclean() won't find the + * stage-2 mapping in the rmap so we can get out-of-whack with + * the filesystem when marking the page dirty during unpinning + * (see cc5095747edf ("ext4: don't BUG if someone dirty pages + * without asking ext4 first")). + * + * Ideally we'd just restrict ourselves to anonymous pages, but + * we also want to allow memfd (i.e. shmem) pages, so check for + * pages backed by swap in the knowledge that the GUP pin will + * prevent try_to_unmap() from succeeding. + */ + ret = -EIO; + goto unpin; } - if (force_pte) + write_lock(&kvm->mmu_lock); + ret = pkvm_pgtable_stage2_map(pgt, s2fd->fault_ipa, PAGE_SIZE, + page_to_phys(page), KVM_PGTABLE_PROT_RWX, + hyp_memcache, 0); + write_unlock(&kvm->mmu_lock); + if (ret) { + if (ret == -EAGAIN) + ret = 0; + goto unpin; + } + + return 0; +unpin: + unpin_user_pages(&page, 1); +dec_account: + account_locked_vm(mm, 1, false); + return ret; +} + +static short kvm_s2_resolve_vma_size(const struct kvm_s2_fault_desc *s2fd, + struct kvm_s2_fault_vma_info *s2vi, + struct vm_area_struct *vma) +{ + short vma_shift; + + if (memslot_is_logging(s2fd->memslot)) { + s2vi->max_map_size = PAGE_SIZE; vma_shift = PAGE_SHIFT; - else - vma_shift = get_vma_page_shift(vma, hva); + } else { + s2vi->max_map_size = PUD_SIZE; + vma_shift = get_vma_page_shift(vma, s2fd->hva); + } switch (vma_shift) { #ifndef __PAGETABLE_PMD_FOLDED case PUD_SHIFT: - if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) + if (fault_supports_stage2_huge_mapping(s2fd->memslot, s2fd->hva, PUD_SIZE)) break; fallthrough; #endif @@ -1711,12 +1754,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, vma_shift = PMD_SHIFT; fallthrough; case PMD_SHIFT: - if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) + if (fault_supports_stage2_huge_mapping(s2fd->memslot, s2fd->hva, PMD_SIZE)) break; fallthrough; case CONT_PTE_SHIFT: vma_shift = PAGE_SHIFT; - force_pte = true; + s2vi->max_map_size = PAGE_SIZE; fallthrough; case PAGE_SHIFT: break; @@ -1724,21 +1767,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, WARN_ONCE(1, "Unknown vma_shift %d", vma_shift); } - vma_pagesize = 1UL << vma_shift; - - if (nested) { + if (s2fd->nested) { unsigned long max_map_size; - max_map_size = force_pte ? 
PAGE_SIZE : PUD_SIZE; - - ipa = kvm_s2_trans_output(nested); + max_map_size = min(s2vi->max_map_size, PUD_SIZE); /* * If we're about to create a shadow stage 2 entry, then we * can only create a block mapping if the guest stage 2 page * table uses at least as big a mapping. */ - max_map_size = min(kvm_s2_trans_size(nested), max_map_size); + max_map_size = min(kvm_s2_trans_size(s2fd->nested), max_map_size); /* * Be careful that if the mapping size falls between @@ -1749,30 +1788,46 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE) max_map_size = PAGE_SIZE; - force_pte = (max_map_size == PAGE_SIZE); - vma_pagesize = min_t(long, vma_pagesize, max_map_size); - vma_shift = __ffs(vma_pagesize); + s2vi->max_map_size = max_map_size; + vma_shift = min_t(short, vma_shift, __ffs(max_map_size)); } + return vma_shift; +} + +static bool kvm_s2_fault_is_perm(const struct kvm_s2_fault_desc *s2fd) +{ + return kvm_vcpu_trap_is_permission_fault(s2fd->vcpu); +} + +static int kvm_s2_fault_get_vma_info(const struct kvm_s2_fault_desc *s2fd, + struct kvm_s2_fault_vma_info *s2vi) +{ + struct vm_area_struct *vma; + struct kvm *kvm = s2fd->vcpu->kvm; + + mmap_read_lock(current->mm); + vma = vma_lookup(current->mm, s2fd->hva); + if (unlikely(!vma)) { + kvm_err("Failed to find VMA for hva 0x%lx\n", s2fd->hva); + mmap_read_unlock(current->mm); + return -EFAULT; + } + + s2vi->vma_pagesize = BIT(kvm_s2_resolve_vma_size(s2fd, s2vi, vma)); + /* * Both the canonical IPA and fault IPA must be aligned to the * mapping size to ensure we find the right PFN and lay down the * mapping in the right place. */ - fault_ipa = ALIGN_DOWN(fault_ipa, vma_pagesize); - ipa = ALIGN_DOWN(ipa, vma_pagesize); + s2vi->gfn = ALIGN_DOWN(s2fd->fault_ipa, s2vi->vma_pagesize) >> PAGE_SHIFT; - gfn = ipa >> PAGE_SHIFT; - mte_allowed = kvm_vma_mte_allowed(vma); + s2vi->mte_allowed = kvm_vma_mte_allowed(vma); - vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; + s2vi->vm_flags = vma->vm_flags; - vm_flags = vma->vm_flags; - - is_vma_cacheable = kvm_vma_is_cacheable(vma); - - /* Don't use the VMA after the unlock -- it may have vanished */ - vma = NULL; + s2vi->is_vma_cacheable = kvm_vma_is_cacheable(vma); /* * Read mmu_invalidate_seq so that KVM can detect if the results of @@ -1782,24 +1837,50 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs * with the smp_wmb() in kvm_mmu_invalidate_end(). */ - mmu_seq = kvm->mmu_invalidate_seq; + s2vi->mmu_seq = kvm->mmu_invalidate_seq; mmap_read_unlock(current->mm); - pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, - &writable, &page); - if (pfn == KVM_PFN_ERR_HWPOISON) { - kvm_send_hwpoison_signal(hva, vma_shift); - return 0; - } - if (is_error_noslot_pfn(pfn)) + return 0; +} + +static gfn_t get_canonical_gfn(const struct kvm_s2_fault_desc *s2fd, + const struct kvm_s2_fault_vma_info *s2vi) +{ + phys_addr_t ipa; + + if (!s2fd->nested) + return s2vi->gfn; + + ipa = kvm_s2_trans_output(s2fd->nested); + return ALIGN_DOWN(ipa, s2vi->vma_pagesize) >> PAGE_SHIFT; +} + +static int kvm_s2_fault_pin_pfn(const struct kvm_s2_fault_desc *s2fd, + struct kvm_s2_fault_vma_info *s2vi) +{ + int ret; + + ret = kvm_s2_fault_get_vma_info(s2fd, s2vi); + if (ret) + return ret; + + s2vi->pfn = __kvm_faultin_pfn(s2fd->memslot, get_canonical_gfn(s2fd, s2vi), + kvm_is_write_fault(s2fd->vcpu) ? 
FOLL_WRITE : 0, + &s2vi->map_writable, &s2vi->page); + if (unlikely(is_error_noslot_pfn(s2vi->pfn))) { + if (s2vi->pfn == KVM_PFN_ERR_HWPOISON) { + kvm_send_hwpoison_signal(s2fd->hva, __ffs(s2vi->vma_pagesize)); + return 0; + } return -EFAULT; + } /* * Check if this is non-struct page memory PFN, and cannot support * CMOs. It could potentially be unsafe to access as cacheable. */ - if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) { - if (is_vma_cacheable) { + if (s2vi->vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(s2vi->pfn)) { + if (s2vi->is_vma_cacheable) { /* * Whilst the VMA owner expects cacheable mapping to this * PFN, hardware also has to support the FWB and CACHE DIC @@ -1812,8 +1893,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * S2FWB and CACHE DIC are mandatory to avoid the need for * cache maintenance. */ - if (!kvm_supports_cacheable_pfnmap()) - ret = -EFAULT; + if (!kvm_supports_cacheable_pfnmap()) { + kvm_release_faultin_page(s2fd->vcpu->kvm, s2vi->page, true, false); + return -EFAULT; + } } else { /* * If the page was identified as device early by looking at @@ -1825,21 +1908,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * In both cases, we don't let transparent_hugepage_adjust() * change things at the last minute. */ - s2_force_noncacheable = true; + s2vi->map_non_cacheable = true; } - } else if (logging_active && !write_fault) { - /* - * Only actually map the page as writable if this was a write - * fault. - */ - writable = false; + + s2vi->device = true; } - if (exec_fault && s2_force_noncacheable) - ret = -ENOEXEC; + return 1; +} - if (ret) - goto out_put_page; +static int kvm_s2_fault_compute_prot(const struct kvm_s2_fault_desc *s2fd, + const struct kvm_s2_fault_vma_info *s2vi, + enum kvm_pgtable_prot *prot) +{ + struct kvm *kvm = s2fd->vcpu->kvm; + + if (kvm_vcpu_trap_is_exec_fault(s2fd->vcpu) && s2vi->map_non_cacheable) + return -ENOEXEC; /* * Guest performs atomic/exclusive operations on memory with unsupported @@ -1847,99 +1932,167 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * and trigger the exception here. Since the memslot is valid, inject * the fault back to the guest. */ - if (esr_fsc_is_excl_atomic_fault(kvm_vcpu_get_esr(vcpu))) { - kvm_inject_dabt_excl_atomic(vcpu, kvm_vcpu_get_hfar(vcpu)); - ret = 1; - goto out_put_page; + if (esr_fsc_is_excl_atomic_fault(kvm_vcpu_get_esr(s2fd->vcpu))) { + kvm_inject_dabt_excl_atomic(s2fd->vcpu, kvm_vcpu_get_hfar(s2fd->vcpu)); + return 1; } - if (nested) - adjust_nested_fault_perms(nested, &prot, &writable); + *prot = KVM_PGTABLE_PROT_R; + + if (s2vi->map_writable && (s2vi->device || + !memslot_is_logging(s2fd->memslot) || + kvm_is_write_fault(s2fd->vcpu))) + *prot |= KVM_PGTABLE_PROT_W; + + if (s2fd->nested) + *prot = adjust_nested_fault_perms(s2fd->nested, *prot); + + if (kvm_vcpu_trap_is_exec_fault(s2fd->vcpu)) + *prot |= KVM_PGTABLE_PROT_X; + + if (s2vi->map_non_cacheable) + *prot |= (s2vi->vm_flags & VM_ALLOW_ANY_UNCACHED) ? 
+ KVM_PGTABLE_PROT_NORMAL_NC : KVM_PGTABLE_PROT_DEVICE; + else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) + *prot |= KVM_PGTABLE_PROT_X; + + if (s2fd->nested) + *prot = adjust_nested_exec_perms(kvm, s2fd->nested, *prot); + + if (!kvm_s2_fault_is_perm(s2fd) && !s2vi->map_non_cacheable && kvm_has_mte(kvm)) { + /* Check the VMM hasn't introduced a new disallowed VMA */ + if (!s2vi->mte_allowed) + return -EFAULT; + } + + return 0; +} + +static int kvm_s2_fault_map(const struct kvm_s2_fault_desc *s2fd, + const struct kvm_s2_fault_vma_info *s2vi, + enum kvm_pgtable_prot prot, + void *memcache) +{ + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED; + bool writable = prot & KVM_PGTABLE_PROT_W; + struct kvm *kvm = s2fd->vcpu->kvm; + struct kvm_pgtable *pgt; + long perm_fault_granule; + long mapping_size; + kvm_pfn_t pfn; + gfn_t gfn; + int ret; kvm_fault_lock(kvm); - pgt = vcpu->arch.hw_mmu->pgt; - if (mmu_invalidate_retry(kvm, mmu_seq)) { - ret = -EAGAIN; + pgt = s2fd->vcpu->arch.hw_mmu->pgt; + ret = -EAGAIN; + if (mmu_invalidate_retry(kvm, s2vi->mmu_seq)) goto out_unlock; - } + + perm_fault_granule = (kvm_s2_fault_is_perm(s2fd) ? + kvm_vcpu_trap_get_perm_fault_granule(s2fd->vcpu) : 0); + mapping_size = s2vi->vma_pagesize; + pfn = s2vi->pfn; + gfn = s2vi->gfn; /* * If we are not forced to use page mapping, check if we are * backed by a THP and thus use block mapping if possible. */ - if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) { - if (fault_is_perm && fault_granule > PAGE_SIZE) - vma_pagesize = fault_granule; - else - vma_pagesize = transparent_hugepage_adjust(kvm, memslot, - hva, &pfn, - &fault_ipa); - - if (vma_pagesize < 0) { - ret = vma_pagesize; - goto out_unlock; - } - } - - if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) { - /* Check the VMM hasn't introduced a new disallowed VMA */ - if (mte_allowed) { - sanitise_mte_tags(kvm, pfn, vma_pagesize); + if (mapping_size == PAGE_SIZE && + !(s2vi->max_map_size == PAGE_SIZE || s2vi->map_non_cacheable)) { + if (perm_fault_granule > PAGE_SIZE) { + mapping_size = perm_fault_granule; } else { - ret = -EFAULT; - goto out_unlock; + mapping_size = transparent_hugepage_adjust(kvm, s2fd->memslot, + s2fd->hva, &pfn, + &gfn); + if (mapping_size < 0) { + ret = mapping_size; + goto out_unlock; + } } } - if (writable) - prot |= KVM_PGTABLE_PROT_W; - - if (exec_fault) - prot |= KVM_PGTABLE_PROT_X; - - if (s2_force_noncacheable) { - if (vfio_allow_any_uc) - prot |= KVM_PGTABLE_PROT_NORMAL_NC; - else - prot |= KVM_PGTABLE_PROT_DEVICE; - } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) { - prot |= KVM_PGTABLE_PROT_X; - } - - if (nested) - adjust_nested_exec_perms(kvm, nested, &prot); + if (!perm_fault_granule && !s2vi->map_non_cacheable && kvm_has_mte(kvm)) + sanitise_mte_tags(kvm, pfn, mapping_size); /* * Under the premise of getting a FSC_PERM fault, we just need to relax - * permissions only if vma_pagesize equals fault_granule. Otherwise, + * permissions only if mapping_size equals perm_fault_granule. Otherwise, * kvm_pgtable_stage2_map() should be called to change block size. */ - if (fault_is_perm && vma_pagesize == fault_granule) { + if (mapping_size == perm_fault_granule) { /* * Drop the SW bits in favour of those stored in the * PTE, which will be preserved. 
*/ prot &= ~KVM_NV_GUEST_MAP_SZ; - ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags); + ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, gfn_to_gpa(gfn), + prot, flags); } else { - ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize, - __pfn_to_phys(pfn), prot, - memcache, flags); + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, gfn_to_gpa(gfn), mapping_size, + __pfn_to_phys(pfn), prot, + memcache, flags); } out_unlock: - kvm_release_faultin_page(kvm, page, !!ret, writable); + kvm_release_faultin_page(kvm, s2vi->page, !!ret, writable); kvm_fault_unlock(kvm); - /* Mark the page dirty only if the fault is handled successfully */ - if (writable && !ret) - mark_page_dirty_in_slot(kvm, memslot, gfn); + /* + * Mark the page dirty only if the fault is handled successfully, + * making sure we adjust the canonical IPA if the mapping size has + * been updated (via a THP upgrade, for example). + */ + if (writable && !ret) { + phys_addr_t ipa = gfn_to_gpa(get_canonical_gfn(s2fd, s2vi)); + ipa &= ~(mapping_size - 1); + mark_page_dirty_in_slot(kvm, s2fd->memslot, gpa_to_gfn(ipa)); + } - return ret != -EAGAIN ? ret : 0; + if (ret != -EAGAIN) + return ret; + return 0; +} -out_put_page: - kvm_release_page_unused(page); - return ret; +static int user_mem_abort(const struct kvm_s2_fault_desc *s2fd) +{ + bool perm_fault = kvm_vcpu_trap_is_permission_fault(s2fd->vcpu); + struct kvm_s2_fault_vma_info s2vi = {}; + enum kvm_pgtable_prot prot; + void *memcache; + int ret; + + /* + * Permission faults just need to update the existing leaf entry, + * and so normally don't require allocations from the memcache. The + * only exception to this is when dirty logging is enabled at runtime + * and a write fault needs to collapse a block entry into a table. + */ + memcache = get_mmu_memcache(s2fd->vcpu); + if (!perm_fault || (memslot_is_logging(s2fd->memslot) && + kvm_is_write_fault(s2fd->vcpu))) { + ret = topup_mmu_memcache(s2fd->vcpu, memcache); + if (ret) + return ret; + } + + /* + * Let's check if we will get back a huge page backed by hugetlbfs, or + * get block mapping for device MMIO region. + */ + ret = kvm_s2_fault_pin_pfn(s2fd, &s2vi); + if (ret != 1) + return ret; + + ret = kvm_s2_fault_compute_prot(s2fd, &s2vi, &prot); + if (ret) { + kvm_release_page_unused(s2vi.page); + return ret; + } + + return kvm_s2_fault_map(s2fd, &s2vi, prot, memcache); } /* Resolve the access fault by making the page young again. 
*/ @@ -2202,15 +2355,27 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) goto out_unlock; } - VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) && - !write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu)); + const struct kvm_s2_fault_desc s2fd = { + .vcpu = vcpu, + .fault_ipa = fault_ipa, + .nested = nested, + .memslot = memslot, + .hva = hva, + }; + + if (kvm_vm_is_protected(vcpu->kvm)) { + ret = pkvm_mem_abort(&s2fd); + } else { + VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) && + !write_fault && + !kvm_vcpu_trap_is_exec_fault(vcpu)); + + if (kvm_slot_has_gmem(memslot)) + ret = gmem_abort(&s2fd); + else + ret = user_mem_abort(&s2fd); + } - if (kvm_slot_has_gmem(memslot)) - ret = gmem_abort(vcpu, fault_ipa, nested, memslot, - esr_fsc_is_permission_fault(esr)); - else - ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva, - esr_fsc_is_permission_fault(esr)); if (ret == 0) ret = 1; out: @@ -2223,7 +2388,7 @@ out_unlock: bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - if (!kvm->arch.mmu.pgt) + if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm)) return false; __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT, @@ -2238,7 +2403,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { u64 size = (range->end - range->start) << PAGE_SHIFT; - if (!kvm->arch.mmu.pgt) + if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm)) return false; return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, @@ -2254,7 +2419,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { u64 size = (range->end - range->start) << PAGE_SHIFT; - if (!kvm->arch.mmu.pgt) + if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm)) return false; return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, @@ -2411,6 +2576,19 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, hva_t hva, reg_end; int ret = 0; + if (kvm_vm_is_protected(kvm)) { + /* Cannot modify memslots once a pVM has run. 
*/ + if (pkvm_hyp_vm_is_created(kvm) && + (change == KVM_MR_DELETE || change == KVM_MR_MOVE)) { + return -EPERM; + } + + if (new && + new->flags & (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)) { + return -EPERM; + } + } + if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && change != KVM_MR_FLAGS_ONLY) return 0; diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 2c43097248b2..883b6c1008fb 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -735,8 +735,10 @@ static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu) kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size; /* Make sure we don't forget to do the laundry */ - if (kvm_s2_mmu_valid(s2_mmu)) + if (kvm_s2_mmu_valid(s2_mmu)) { + kvm_nested_s2_ptdump_remove_debugfs(s2_mmu); s2_mmu->pending_unmap = true; + } /* * The virtual VMID (modulo CnP) will be used as a key when matching @@ -750,6 +752,8 @@ static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu) s2_mmu->tlb_vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2); s2_mmu->nested_stage2_enabled = vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_VM; + kvm_nested_s2_ptdump_create_debugfs(s2_mmu); + out: atomic_inc(&s2_mmu->refcnt); @@ -1558,6 +1562,11 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) ID_AA64PFR1_EL1_MTE); break; + case SYS_ID_AA64PFR2_EL1: + /* GICv5 is not yet supported for NV */ + val &= ~ID_AA64PFR2_EL1_GCIE; + break; + case SYS_ID_AA64MMFR0_EL1: /* Hide ExS, Secure Memory */ val &= ~(ID_AA64MMFR0_EL1_EXS | diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index d7a0f69a9982..053e4f733e4b 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -88,7 +88,7 @@ void __init kvm_hyp_reserve(void) static void __pkvm_destroy_hyp_vm(struct kvm *kvm) { if (pkvm_hyp_vm_is_created(kvm)) { - WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, + WARN_ON(kvm_call_hyp_nvhe(__pkvm_finalize_teardown_vm, kvm->arch.pkvm.handle)); } else if (kvm->arch.pkvm.handle) { /* @@ -192,10 +192,16 @@ int pkvm_create_hyp_vm(struct kvm *kvm) { int ret = 0; + /* + * Synchronise with kvm_arch_prepare_memory_region(), as we + * prevent memslot modifications on a pVM that has been run. 
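+	 * Holding slots_lock here also closes the window where a memslot
+	 * update could race with the pkvm_hyp_vm_is_created() check on the
+	 * memslot side.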
+ */ + mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->arch.config_lock); if (!pkvm_hyp_vm_is_created(kvm)) ret = __pkvm_create_hyp_vm(kvm); mutex_unlock(&kvm->arch.config_lock); + mutex_unlock(&kvm->slots_lock); return ret; } @@ -219,9 +225,10 @@ void pkvm_destroy_hyp_vm(struct kvm *kvm) mutex_unlock(&kvm->arch.config_lock); } -int pkvm_init_host_vm(struct kvm *kvm) +int pkvm_init_host_vm(struct kvm *kvm, unsigned long type) { int ret; + bool protected = type & KVM_VM_TYPE_ARM_PROTECTED; if (pkvm_hyp_vm_is_created(kvm)) return -EINVAL; @@ -236,6 +243,11 @@ int pkvm_init_host_vm(struct kvm *kvm) return ret; kvm->arch.pkvm.handle = ret; + kvm->arch.pkvm.is_protected = protected; + if (protected) { + pr_warn_once("kvm: protected VMs are experimental and for development only, tainting kernel\n"); + add_taint(TAINT_USER, LOCKDEP_STILL_OK); + } return 0; } @@ -322,15 +334,38 @@ int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, return 0; } -static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 end) +static int __pkvm_pgtable_stage2_reclaim(struct kvm_pgtable *pgt, u64 start, u64 end) { struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); pkvm_handle_t handle = kvm->arch.pkvm.handle; struct pkvm_mapping *mapping; int ret; - if (!handle) - return 0; + for_each_mapping_in_range_safe(pgt, start, end, mapping) { + struct page *page; + + ret = kvm_call_hyp_nvhe(__pkvm_reclaim_dying_guest_page, + handle, mapping->gfn); + if (WARN_ON(ret)) + continue; + + page = pfn_to_page(mapping->pfn); + WARN_ON_ONCE(mapping->nr_pages != 1); + unpin_user_pages_dirty_lock(&page, 1, true); + account_locked_vm(current->mm, 1, false); + pkvm_mapping_remove(mapping, &pgt->pkvm_mappings); + kfree(mapping); + } + + return 0; +} + +static int __pkvm_pgtable_stage2_unshare(struct kvm_pgtable *pgt, u64 start, u64 end) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + struct pkvm_mapping *mapping; + int ret; for_each_mapping_in_range_safe(pgt, start, end, mapping) { ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn, @@ -347,7 +382,21 @@ static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 e void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, u64 addr, u64 size) { - __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size); + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + + if (!handle) + return; + + if (pkvm_hyp_vm_is_created(kvm) && !kvm->arch.pkvm.is_dying) { + WARN_ON(kvm_call_hyp_nvhe(__pkvm_start_teardown_vm, handle)); + kvm->arch.pkvm.is_dying = true; + } + + if (kvm_vm_is_protected(kvm)) + __pkvm_pgtable_stage2_reclaim(pgt, addr, addr + size); + else + __pkvm_pgtable_stage2_unshare(pgt, addr, addr + size); } void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt) @@ -365,31 +414,58 @@ int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_hyp_memcache *cache = mc; u64 gfn = addr >> PAGE_SHIFT; u64 pfn = phys >> PAGE_SHIFT; + u64 end = addr + size; int ret; - if (size != PAGE_SIZE && size != PMD_SIZE) - return -EINVAL; - lockdep_assert_held_write(&kvm->mmu_lock); + mapping = pkvm_mapping_iter_first(&pgt->pkvm_mappings, addr, end - 1); - /* - * Calling stage2_map() on top of existing mappings is either happening because of a race - * with another vCPU, or because we're changing between page and block mappings. 
As per - * user_mem_abort(), same-size permission faults are handled in the relax_perms() path. - */ - mapping = pkvm_mapping_iter_first(&pgt->pkvm_mappings, addr, addr + size - 1); - if (mapping) { - if (size == (mapping->nr_pages * PAGE_SIZE)) - return -EAGAIN; + if (kvm_vm_is_protected(kvm)) { + /* Protected VMs are mapped using RWX page-granular mappings */ + if (WARN_ON_ONCE(size != PAGE_SIZE)) + return -EINVAL; - /* Remove _any_ pkvm_mapping overlapping with the range, bigger or smaller. */ - ret = __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size); - if (ret) - return ret; - mapping = NULL; + if (WARN_ON_ONCE(prot != KVM_PGTABLE_PROT_RWX)) + return -EINVAL; + + /* + * We either raced with another vCPU or the guest PTE + * has been poisoned by an erroneous host access. + */ + if (mapping) { + ret = kvm_call_hyp_nvhe(__pkvm_vcpu_in_poison_fault); + return ret ? -EFAULT : -EAGAIN; + } + + ret = kvm_call_hyp_nvhe(__pkvm_host_donate_guest, pfn, gfn); + } else { + if (WARN_ON_ONCE(size != PAGE_SIZE && size != PMD_SIZE)) + return -EINVAL; + + /* + * We either raced with another vCPU or we're changing between + * page and block mappings. As per user_mem_abort(), same-size + * permission faults are handled in the relax_perms() path. + */ + if (mapping) { + if (size == (mapping->nr_pages * PAGE_SIZE)) + return -EAGAIN; + + /* + * Remove _any_ pkvm_mapping overlapping with the range, + * bigger or smaller. + */ + ret = __pkvm_pgtable_stage2_unshare(pgt, addr, end); + if (ret) + return ret; + + mapping = NULL; + } + + ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, + size / PAGE_SIZE, prot); } - ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, size / PAGE_SIZE, prot); if (WARN_ON(ret)) return ret; @@ -404,9 +480,14 @@ int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) { - lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(pgt->mmu)->mmu_lock); + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); - return __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size); + if (WARN_ON(kvm_vm_is_protected(kvm))) + return -EPERM; + + lockdep_assert_held_write(&kvm->mmu_lock); + + return __pkvm_pgtable_stage2_unshare(pgt, addr, addr + size); } int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) @@ -416,6 +497,9 @@ int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) struct pkvm_mapping *mapping; int ret = 0; + if (WARN_ON(kvm_vm_is_protected(kvm))) + return -EPERM; + lockdep_assert_held(&kvm->mmu_lock); for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) { ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn, @@ -447,6 +531,9 @@ bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 struct pkvm_mapping *mapping; bool young = false; + if (WARN_ON(kvm_vm_is_protected(kvm))) + return false; + lockdep_assert_held(&kvm->mmu_lock); for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn, @@ -458,12 +545,18 @@ bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 int pkvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags) { + if (WARN_ON(kvm_vm_is_protected(kvm_s2_mmu_to_kvm(pgt->mmu)))) + return -EPERM; + return kvm_call_hyp_nvhe(__pkvm_host_relax_perms_guest, addr >> PAGE_SHIFT, prot); } void 
pkvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_walk_flags flags) { + if (WARN_ON(kvm_vm_is_protected(kvm_s2_mmu_to_kvm(pgt->mmu)))) + return; + WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_mkyoung_guest, addr >> PAGE_SHIFT)); } @@ -485,3 +578,15 @@ int pkvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, WARN_ON_ONCE(1); return -EINVAL; } + +/* + * Forcefully reclaim a page from the guest, zeroing its contents and + * poisoning the stage-2 pte so that pages can no longer be mapped at + * the same IPA. The page remains pinned until the guest is destroyed. + */ +bool pkvm_force_reclaim_guest_page(phys_addr_t phys) +{ + int ret = kvm_call_hyp_nvhe(__pkvm_force_reclaim_guest_page, phys); + + return !ret || ret == -EAGAIN; +} diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 93cc9bbb5cec..e1860acae641 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -939,7 +939,8 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) * number against the dimensions of the vgic and make sure * it's valid. */ - if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq)) + if (!irq_is_ppi(vcpu->kvm, irq) && + !vgic_valid_spi(vcpu->kvm, irq)) return -EINVAL; } else if (kvm_arm_pmu_irq_initialized(vcpu)) { return -EINVAL; @@ -961,8 +962,13 @@ static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) if (!vgic_initialized(vcpu->kvm)) return -ENODEV; - if (!kvm_arm_pmu_irq_initialized(vcpu)) - return -ENXIO; + if (!kvm_arm_pmu_irq_initialized(vcpu)) { + if (!vgic_is_v5(vcpu->kvm)) + return -ENXIO; + + /* Use the architected irq number for GICv5. */ + vcpu->arch.pmu.irq_num = KVM_ARMV8_PMU_GICV5_IRQ; + } ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num, &vcpu->arch.pmu); @@ -987,11 +993,15 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq) unsigned long i; struct kvm_vcpu *vcpu; + /* On GICv5, the PMUIRQ is architecturally mandated to be PPI 23 */ + if (vgic_is_v5(kvm) && irq != KVM_ARMV8_PMU_GICV5_IRQ) + return false; + kvm_for_each_vcpu(i, vcpu, kvm) { if (!kvm_arm_pmu_irq_initialized(vcpu)) continue; - if (irq_is_ppi(irq)) { + if (irq_is_ppi(vcpu->kvm, irq)) { if (vcpu->arch.pmu.irq_num != irq) return false; } else { @@ -1142,7 +1152,7 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) return -EFAULT; /* The PMU overflow interrupt can be a PPI or a valid SPI. 
*/ - if (!(irq_is_ppi(irq) || irq_is_spi(irq))) + if (!(irq_is_ppi(vcpu->kvm, irq) || irq_is_spi(vcpu->kvm, irq))) return -EINVAL; if (!pmu_irq_is_valid(kvm, irq)) diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c index 6a8836207a79..c9140e22abcf 100644 --- a/arch/arm64/kvm/ptdump.c +++ b/arch/arm64/kvm/ptdump.c @@ -10,19 +10,20 @@ #include #include +#include #include #include #include #define MARKERS_LEN 2 #define KVM_PGTABLE_MAX_LEVELS (KVM_PGTABLE_LAST_LEVEL + 1) +#define S2FNAMESZ sizeof("0x0123456789abcdef-0x0123456789abcdef-s2-disabled") struct kvm_ptdump_guest_state { - struct kvm *kvm; + struct kvm_s2_mmu *mmu; struct ptdump_pg_state parser_state; struct addr_marker ipa_marker[MARKERS_LEN]; struct ptdump_pg_level level[KVM_PGTABLE_MAX_LEVELS]; - struct ptdump_range range[MARKERS_LEN]; }; static const struct ptdump_prot_bits stage2_pte_bits[] = { @@ -112,10 +113,9 @@ static int kvm_ptdump_build_levels(struct ptdump_pg_level *level, u32 start_lvl) return 0; } -static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm *kvm) +static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm_s2_mmu *mmu) { struct kvm_ptdump_guest_state *st; - struct kvm_s2_mmu *mmu = &kvm->arch.mmu; struct kvm_pgtable *pgtable = mmu->pgt; int ret; @@ -131,17 +131,8 @@ static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm *kvm) st->ipa_marker[0].name = "Guest IPA"; st->ipa_marker[1].start_address = BIT(pgtable->ia_bits); - st->range[0].end = BIT(pgtable->ia_bits); - - st->kvm = kvm; - st->parser_state = (struct ptdump_pg_state) { - .marker = &st->ipa_marker[0], - .level = -1, - .pg_level = &st->level[0], - .ptdump.range = &st->range[0], - .start_address = 0, - }; + st->mmu = mmu; return st; } @@ -149,16 +140,20 @@ static int kvm_ptdump_guest_show(struct seq_file *m, void *unused) { int ret; struct kvm_ptdump_guest_state *st = m->private; - struct kvm *kvm = st->kvm; - struct kvm_s2_mmu *mmu = &kvm->arch.mmu; - struct ptdump_pg_state *parser_state = &st->parser_state; + struct kvm_s2_mmu *mmu = st->mmu; + struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); struct kvm_pgtable_walker walker = (struct kvm_pgtable_walker) { .cb = kvm_ptdump_visitor, - .arg = parser_state, + .arg = &st->parser_state, .flags = KVM_PGTABLE_WALK_LEAF, }; - parser_state->seq = m; + st->parser_state = (struct ptdump_pg_state) { + .marker = &st->ipa_marker[0], + .level = -1, + .pg_level = &st->level[0], + .seq = m, + }; write_lock(&kvm->mmu_lock); ret = kvm_pgtable_walk(mmu->pgt, 0, BIT(mmu->pgt->ia_bits), &walker); @@ -169,14 +164,15 @@ static int kvm_ptdump_guest_show(struct seq_file *m, void *unused) static int kvm_ptdump_guest_open(struct inode *m, struct file *file) { - struct kvm *kvm = m->i_private; + struct kvm_s2_mmu *mmu = m->i_private; + struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); struct kvm_ptdump_guest_state *st; int ret; if (!kvm_get_kvm_safe(kvm)) return -ENOENT; - st = kvm_ptdump_parser_create(kvm); + st = kvm_ptdump_parser_create(mmu); if (IS_ERR(st)) { ret = PTR_ERR(st); goto err_with_kvm_ref; @@ -194,7 +190,7 @@ err_with_kvm_ref: static int kvm_ptdump_guest_close(struct inode *m, struct file *file) { - struct kvm *kvm = m->i_private; + struct kvm *kvm = kvm_s2_mmu_to_kvm(m->i_private); void *st = ((struct seq_file *)file->private_data)->private; kfree(st); @@ -229,14 +225,15 @@ static int kvm_pgtable_levels_show(struct seq_file *m, void *unused) static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file, int (*show)(struct seq_file *, void *)) { - struct kvm 
*kvm = m->i_private; + struct kvm_s2_mmu *mmu = m->i_private; + struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); struct kvm_pgtable *pgtable; int ret; if (!kvm_get_kvm_safe(kvm)) return -ENOENT; - pgtable = kvm->arch.mmu.pgt; + pgtable = mmu->pgt; ret = single_open(file, show, pgtable); if (ret < 0) @@ -256,7 +253,7 @@ static int kvm_pgtable_levels_open(struct inode *m, struct file *file) static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file) { - struct kvm *kvm = m->i_private; + struct kvm *kvm = kvm_s2_mmu_to_kvm(m->i_private); kvm_put_kvm(kvm); return single_release(m, file); @@ -276,12 +273,36 @@ static const struct file_operations kvm_pgtable_levels_fops = { .release = kvm_pgtable_debugfs_close, }; +void kvm_nested_s2_ptdump_create_debugfs(struct kvm_s2_mmu *mmu) +{ + struct dentry *dent; + char file_name[S2FNAMESZ]; + + snprintf(file_name, sizeof(file_name), "0x%016llx-0x%016llx-s2-%sabled", + mmu->tlb_vttbr, + mmu->tlb_vtcr, + mmu->nested_stage2_enabled ? "en" : "dis"); + + dent = debugfs_create_file(file_name, 0400, + mmu->arch->debugfs_nv_dentry, mmu, + &kvm_ptdump_guest_fops); + + mmu->shadow_pt_debugfs_dentry = dent; +} + +void kvm_nested_s2_ptdump_remove_debugfs(struct kvm_s2_mmu *mmu) +{ + debugfs_remove(mmu->shadow_pt_debugfs_dentry); +} + void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) { debugfs_create_file("stage2_page_tables", 0400, kvm->debugfs_dentry, - kvm, &kvm_ptdump_guest_fops); - debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, kvm, - &kvm_pgtable_range_fops); + &kvm->arch.mmu, &kvm_ptdump_guest_fops); + debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, + &kvm->arch.mmu, &kvm_pgtable_range_fops); debugfs_create_file("stage2_levels", 0400, kvm->debugfs_dentry, - kvm, &kvm_pgtable_levels_fops); + &kvm->arch.mmu, &kvm_pgtable_levels_fops); + if (cpus_have_final_cap(ARM64_HAS_NESTED_VIRT)) + kvm->arch.debugfs_nv_dentry = debugfs_create_dir("nested", kvm->debugfs_dentry); } diff --git a/arch/arm64/kvm/stacktrace.c b/arch/arm64/kvm/stacktrace.c index af5eec681127..9724c320126b 100644 --- a/arch/arm64/kvm/stacktrace.c +++ b/arch/arm64/kvm/stacktrace.c @@ -197,7 +197,7 @@ static void hyp_dump_backtrace(unsigned long hyp_offset) kvm_nvhe_dump_backtrace_end(); } -#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE +#ifdef CONFIG_PKVM_STACKTRACE DECLARE_KVM_NVHE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], pkvm_stacktrace); @@ -225,12 +225,12 @@ static void pkvm_dump_backtrace(unsigned long hyp_offset) kvm_nvhe_dump_backtrace_entry((void *)hyp_offset, stacktrace[i]); kvm_nvhe_dump_backtrace_end(); } -#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */ +#else /* !CONFIG_PKVM_STACKTRACE */ static void pkvm_dump_backtrace(unsigned long hyp_offset) { - kvm_err("Cannot dump pKVM nVHE stacktrace: !CONFIG_PROTECTED_NVHE_STACKTRACE\n"); + kvm_err("Cannot dump pKVM nVHE stacktrace: !CONFIG_PKVM_STACKTRACE\n"); } -#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */ +#endif /* CONFIG_PKVM_STACKTRACE */ /* * kvm_nvhe_dump_backtrace - Dump KVM nVHE hypervisor backtrace. 
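
[ Illustration, not part of the patch: the ptdump hunks above can store a
  struct kvm_s2_mmu pointer in i_private and still reach the owning VM
  because the stage-2 MMU keeps a back-pointer to the VM's embedded
  kvm_arch. A minimal sketch of the idiom, with example_* names standing
  in for the real arm64 types (the actual helper used above is
  kvm_s2_mmu_to_kvm()):

	#include <linux/container_of.h>

	struct example_kvm_arch { int dummy; };

	struct example_kvm {
		struct example_kvm_arch arch;
	};

	struct example_s2_mmu {
		/* back-pointer into the owning VM's embedded arch struct */
		struct example_kvm_arch *arch;
	};

	static inline struct example_kvm *
	example_s2_mmu_to_kvm(struct example_s2_mmu *mmu)
	{
		return container_of(mmu->arch, struct example_kvm, arch);
	}
  ]
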
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 1b4cacb6e918..6a96cb7ba9a3 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -681,6 +681,91 @@ static bool access_gic_dir(struct kvm_vcpu *vcpu,
 	return true;
 }
 
+static bool access_gicv5_idr0(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+			      const struct sys_reg_desc *r)
+{
+	if (p->is_write)
+		return undef_access(vcpu, p, r);
+
+	/*
+	 * Expose KVM's priority- and ID-bits to the guest, but not GCIE_LEGACY.
+	 *
+	 * Note: for GICv5 we mimic the way that the num_pri_bits and
+	 * num_id_bits fields are used with GICv3:
+	 * - num_pri_bits stores the actual number of priority bits, whereas the
+	 *   register field stores num_pri_bits - 1.
+	 * - num_id_bits stores the raw field value, which is 0b0000 for 16 bits
+	 *   and 0b0001 for 24 bits.
+	 */
+	p->regval = FIELD_PREP(ICC_IDR0_EL1_PRI_BITS, vcpu->arch.vgic_cpu.num_pri_bits - 1) |
+		    FIELD_PREP(ICC_IDR0_EL1_ID_BITS, vcpu->arch.vgic_cpu.num_id_bits);
+
+	return true;
+}
+
+static bool access_gicv5_iaffid(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+				const struct sys_reg_desc *r)
+{
+	if (p->is_write)
+		return undef_access(vcpu, p, r);
+
+	/*
+	 * For GICv5 VMs, the IAFFID value is the same as the VPE ID. The VPE ID
+	 * is the same as the VCPU's ID.
+	 */
+	p->regval = FIELD_PREP(ICC_IAFFIDR_EL1_IAFFID, vcpu->vcpu_id);
+
+	return true;
+}
+
+static bool access_gicv5_ppi_enabler(struct kvm_vcpu *vcpu,
+				     struct sys_reg_params *p,
+				     const struct sys_reg_desc *r)
+{
+	unsigned long *mask = vcpu->kvm->arch.vgic.gicv5_vm.vgic_ppi_mask;
+	struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+	int i;
+
+	/* We never expect to get here with a read! */
+	if (WARN_ON_ONCE(!p->is_write))
+		return undef_access(vcpu, p, r);
+
+	/*
+	 * If we're only handling architected PPIs and the guest writes to the
+	 * enable for the non-architected PPIs, we just return as there's
+	 * nothing to do at all. We don't even allocate the storage for them in
+	 * this case.
+	 */
+	if (VGIC_V5_NR_PRIVATE_IRQS == 64 && p->Op2 % 2)
+		return true;
+
+	/*
+	 * Merge the raw guest write into our bitmap at an offset of either 0 or
+	 * 64, then AND it with our PPI mask.
+	 */
+	bitmap_write(cpu_if->vgic_ppi_enabler, p->regval, 64 * (p->Op2 % 2), 64);
+	bitmap_and(cpu_if->vgic_ppi_enabler, cpu_if->vgic_ppi_enabler, mask,
+		   VGIC_V5_NR_PRIVATE_IRQS);
+
+	/*
+	 * Sync the change in enable states to the vgic_irqs. Walking every PPI
+	 * in the mask is fine, as we don't expose many to the guest.
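+	 * Each enabled bit is updated under the IRQ's irq_lock, and the
+	 * reference taken by vgic_get_vcpu_irq() is dropped immediately
+	 * afterwards.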
+ */ + for_each_set_bit(i, mask, VGIC_V5_NR_PRIVATE_IRQS) { + u32 intid = vgic_v5_make_ppi(i); + struct vgic_irq *irq; + + irq = vgic_get_vcpu_irq(vcpu, intid); + + scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) + irq->enabled = test_bit(i, cpu_if->vgic_ppi_enabler); + + vgic_put_irq(vcpu->kvm, irq); + } + + return true; +} + static bool trap_raz_wi(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -1758,6 +1843,7 @@ static u8 pmuver_to_perfmon(u8 pmuver) static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val); static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val); +static u64 sanitise_id_aa64pfr2_el1(const struct kvm_vcpu *vcpu, u64 val); static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val); /* Read a sanitised cpufeature ID register by sys_reg_desc */ @@ -1783,10 +1869,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val = sanitise_id_aa64pfr1_el1(vcpu, val); break; case SYS_ID_AA64PFR2_EL1: - val &= ID_AA64PFR2_EL1_FPMR | - (kvm_has_mte(vcpu->kvm) ? - ID_AA64PFR2_EL1_MTEFAR | ID_AA64PFR2_EL1_MTESTOREONLY : - 0); + val = sanitise_id_aa64pfr2_el1(vcpu, val); break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) @@ -1985,7 +2068,7 @@ static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val) val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV3, IMP); } - if (vgic_is_v3(vcpu->kvm)) { + if (vgic_host_has_gicv3()) { val &= ~ID_AA64PFR0_EL1_GIC_MASK; val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP); } @@ -2027,6 +2110,23 @@ static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val) return val; } +static u64 sanitise_id_aa64pfr2_el1(const struct kvm_vcpu *vcpu, u64 val) +{ + val &= ID_AA64PFR2_EL1_FPMR | + ID_AA64PFR2_EL1_MTEFAR | + ID_AA64PFR2_EL1_MTESTOREONLY; + + if (!kvm_has_mte(vcpu->kvm)) { + val &= ~ID_AA64PFR2_EL1_MTEFAR; + val &= ~ID_AA64PFR2_EL1_MTESTOREONLY; + } + + if (vgic_host_has_gicv5()) + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR2_EL1, GCIE, IMP); + + return val; +} + static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) { val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8); @@ -2177,14 +2277,6 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, (vcpu_has_nv(vcpu) && !FIELD_GET(ID_AA64PFR0_EL1_EL2, user_val))) return -EINVAL; - /* - * If we are running on a GICv5 host and support FEAT_GCIE_LEGACY, then - * we support GICv3. Fail attempts to do anything but set that to IMP. - */ - if (vgic_is_v3_compat(vcpu->kvm) && - FIELD_GET(ID_AA64PFR0_EL1_GIC_MASK, user_val) != ID_AA64PFR0_EL1_GIC_IMP) - return -EINVAL; - return set_id_reg(vcpu, rd, user_val); } @@ -2224,6 +2316,12 @@ static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu, return set_id_reg(vcpu, rd, user_val); } +static int set_id_aa64pfr2_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + return set_id_reg(vcpu, rd, user_val); +} + /* * Allow userspace to de-feature a stage-2 translation granule but prevent it * from claiming the impossible. 
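
[ Illustration, not part of the patch: a VMM can observe the sanitised
  ID_AA64PFR2_EL1 value with KVM_GET_ONE_REG. The sketch below assumes the
  standard uapi encoding for AArch64 system registers (ID_AA64PFR2_EL1 is
  op0=3, op1=0, CRn=0, CRm=4, op2=2) and a hypothetical helper name:

	#include <sys/ioctl.h>
	#include <linux/kvm.h>	/* struct kvm_one_reg, ARM64_SYS_REG() */

	static int read_id_aa64pfr2(int vcpu_fd, __u64 *val)
	{
		struct kvm_one_reg reg = {
			.id	= ARM64_SYS_REG(3, 0, 0, 4, 2),
			.addr	= (__u64)(unsigned long)val,
		};

		return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
	}

  On a GICv5 host the GCIE field reads as implemented until an in-kernel
  GIC is created, at which point the value is fixed up via
  kvm_vgic_finalize_idregs() as described below. ]
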
@@ -3205,10 +3303,11 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64PFR1_EL1_RES0 | ID_AA64PFR1_EL1_MPAM_frac | ID_AA64PFR1_EL1_MTE)), - ID_WRITABLE(ID_AA64PFR2_EL1, - ID_AA64PFR2_EL1_FPMR | - ID_AA64PFR2_EL1_MTEFAR | - ID_AA64PFR2_EL1_MTESTOREONLY), + ID_FILTERED(ID_AA64PFR2_EL1, id_aa64pfr2_el1, + (ID_AA64PFR2_EL1_FPMR | + ID_AA64PFR2_EL1_MTEFAR | + ID_AA64PFR2_EL1_MTESTOREONLY | + ID_AA64PFR2_EL1_GCIE)), ID_UNALLOCATED(4,3), ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0), ID_HIDDEN(ID_AA64SMFR0_EL1), @@ -3391,6 +3490,10 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, + { SYS_DESC(SYS_ICC_IDR0_EL1), access_gicv5_idr0 }, + { SYS_DESC(SYS_ICC_IAFFIDR_EL1), access_gicv5_iaffid }, + { SYS_DESC(SYS_ICC_PPI_ENABLER0_EL1), access_gicv5_ppi_enabler }, + { SYS_DESC(SYS_ICC_PPI_ENABLER1_EL1), access_gicv5_ppi_enabler }, { SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir }, { SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, @@ -5647,6 +5750,8 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) compute_fgu(kvm, HFGRTR2_GROUP); compute_fgu(kvm, HFGITR2_GROUP); compute_fgu(kvm, HDFGRTR2_GROUP); + compute_fgu(kvm, ICH_HFGRTR_GROUP); + compute_fgu(kvm, ICH_HFGITR_GROUP); set_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags); out: @@ -5667,25 +5772,60 @@ int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu) guard(mutex)(&kvm->arch.config_lock); - /* - * This hacks into the ID registers, so only perform it when the - * first vcpu runs, or the kvm_set_vm_id_reg() helper will scream. - */ - if (!irqchip_in_kernel(kvm) && !kvm_vm_has_ran_once(kvm)) { - u64 val; - - val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; - kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val); - val = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; - kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, val); - } - if (vcpu_has_nv(vcpu)) { int ret = kvm_init_nv_sysregs(vcpu); if (ret) return ret; } + if (kvm_vm_has_ran_once(kvm)) + return 0; + + /* + * This hacks into the ID registers, so only perform it when the + * first vcpu runs, or the kvm_set_vm_id_reg() helper will scream. + */ + if (!irqchip_in_kernel(kvm)) { + u64 val; + + val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val); + val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR2_EL1) & ~ID_AA64PFR2_EL1_GCIE; + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR2_EL1, val); + val = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; + kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, val); + } else { + /* + * Certain userspace software - QEMU - samples the system + * register state without creating an irqchip, then blindly + * restores the state prior to running the final guest. This + * means that it restores the virtualization & emulation + * capabilities of the host system, rather than something that + * reflects the final guest state. Moreover, it checks that the + * state was "correctly" restored (i.e., verbatim), bailing if + * it isn't, so masking off invalid state isn't an option. + * + * On GICv5 hardware that supports FEAT_GCIE_LEGACY we can run + * both GICv3- and GICv5-based guests. Therefore, we initially + * present both ID_AA64PFR0.GIC and ID_AA64PFR2.GCIE as IMP to + * reflect that userspace can create EITHER a vGICv3 or a + * vGICv5. 
This is an architecturally invalid combination, of + * course. Once an in-kernel GIC is created, the sysreg state is + * updated to reflect the actual, valid configuration. + * + * Setting both the GIC and GCIE features to IMP unsurprisingly + * results in guests falling over, and hence we need to fix up + * this mess in KVM. Before running for the first time we yet + * again ensure that the GIC and GCIE fields accurately reflect + * the actual hardware the guest should see. + * + * This hack allows legacy QEMU-based GICv3 guests to run + * unmodified on compatible GICv5 hosts, and avoids the inverse + * problem for GICv5-based guests in the future. + */ + kvm_vgic_finalize_idregs(kvm); + } + return 0; } diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index e9b8b5fc480c..933983bb2005 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -66,12 +66,11 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type); * or through the generic KVM_CREATE_DEVICE API ioctl. * irqchip_in_kernel() tells you if this function succeeded or not. * @kvm: kvm struct pointer - * @type: KVM_DEV_TYPE_ARM_VGIC_V[23] + * @type: KVM_DEV_TYPE_ARM_VGIC_V[235] */ int kvm_vgic_create(struct kvm *kvm, u32 type) { struct kvm_vcpu *vcpu; - u64 aa64pfr0, pfr1; unsigned long i; int ret; @@ -132,8 +131,11 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) if (type == KVM_DEV_TYPE_ARM_VGIC_V2) kvm->max_vcpus = VGIC_V2_MAX_CPUS; - else + else if (type == KVM_DEV_TYPE_ARM_VGIC_V3) kvm->max_vcpus = VGIC_V3_MAX_CPUS; + else if (type == KVM_DEV_TYPE_ARM_VGIC_V5) + kvm->max_vcpus = min(VGIC_V5_MAX_CPUS, + kvm_vgic_global_state.max_gic_vcpus); if (atomic_read(&kvm->online_vcpus) > kvm->max_vcpus) { ret = -E2BIG; @@ -145,19 +147,20 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) kvm->arch.vgic.implementation_rev = KVM_VGIC_IMP_REV_LATEST; kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; - aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; - pfr1 = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; - - if (type == KVM_DEV_TYPE_ARM_VGIC_V2) { + switch (type) { + case KVM_DEV_TYPE_ARM_VGIC_V2: kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; - } else { + break; + case KVM_DEV_TYPE_ARM_VGIC_V3: INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions); - aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP); - pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC, GICv3); + break; } - kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0); - kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1); + /* + * We've now created the GIC. Update the system register state + * to accurately reflect what we've created. + */ + kvm_vgic_finalize_idregs(kvm); kvm_for_each_vcpu(i, vcpu, kvm) { ret = vgic_allocate_private_irqs_locked(vcpu, type); @@ -179,6 +182,15 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) if (type == KVM_DEV_TYPE_ARM_VGIC_V3) kvm->arch.vgic.nassgicap = system_supports_direct_sgis(); + /* + * We now know that we have a GICv5. The Arch Timer PPI interrupts may + * have been initialised at this stage, but will have done so assuming + * that we have an older GIC, meaning that the IntIDs won't be + * correct. We init them again, and this time they will be correct. 
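+	 *
+	 * As a sketch, the re-init boils down to recomputing the per-VM
+	 * PPI numbers through the GICv5 INTID encoding (see the
+	 * get_vgic_ppi() helper later in this series), roughly:
+	 *
+	 *	timer_vm_data->ppi[i] = get_vgic_ppi(kvm, default_ppi[i]);
+	 *
+	 * with the actual plumbing living in kvm_timer_init_vm().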
+	 */
+	if (type == KVM_DEV_TYPE_ARM_VGIC_V5)
+		kvm_timer_init_vm(kvm);
+
 out_unlock:
 	mutex_unlock(&kvm->arch.config_lock);
 	kvm_unlock_all_vcpus(kvm);
@@ -259,9 +271,65 @@ int kvm_vgic_vcpu_nv_init(struct kvm_vcpu *vcpu)
 	return ret;
 }
 
+static void vgic_allocate_private_irq(struct kvm_vcpu *vcpu, int i, u32 type)
+{
+	struct vgic_irq *irq = &vcpu->arch.vgic_cpu.private_irqs[i];
+
+	INIT_LIST_HEAD(&irq->ap_list);
+	raw_spin_lock_init(&irq->irq_lock);
+	irq->vcpu = NULL;
+	irq->target_vcpu = vcpu;
+	refcount_set(&irq->refcount, 0);
+
+	irq->intid = i;
+	if (vgic_irq_is_sgi(i)) {
+		/* SGIs */
+		irq->enabled = 1;
+		irq->config = VGIC_CONFIG_EDGE;
+	} else {
+		/* PPIs */
+		irq->config = VGIC_CONFIG_LEVEL;
+	}
+
+	switch (type) {
+	case KVM_DEV_TYPE_ARM_VGIC_V3:
+		irq->group = 1;
+		irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+		break;
+	case KVM_DEV_TYPE_ARM_VGIC_V2:
+		irq->group = 0;
+		irq->targets = BIT(vcpu->vcpu_id);
+		break;
+	}
+}
+
+static void vgic_v5_allocate_private_irq(struct kvm_vcpu *vcpu, int i, u32 type)
+{
+	struct vgic_irq *irq = &vcpu->arch.vgic_cpu.private_irqs[i];
+	u32 intid = vgic_v5_make_ppi(i);
+
+	INIT_LIST_HEAD(&irq->ap_list);
+	raw_spin_lock_init(&irq->irq_lock);
+	irq->vcpu = NULL;
+	irq->target_vcpu = vcpu;
+	refcount_set(&irq->refcount, 0);
+
+	irq->intid = intid;
+
+	/* The only architected edge-triggered PPI is the SW_PPI */
+	if (i == GICV5_ARCH_PPI_SW_PPI)
+		irq->config = VGIC_CONFIG_EDGE;
+	else
+		irq->config = VGIC_CONFIG_LEVEL;
+
+	/* Register the GICv5-specific PPI ops */
+	vgic_v5_set_ppi_ops(vcpu, intid);
+}
+
 static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type)
 {
 	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	u32 num_private_irqs;
 	int i;
 
 	lockdep_assert_held(&vcpu->kvm->arch.config_lock);
@@ -269,8 +337,13 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type)
 	if (vgic_cpu->private_irqs)
 		return 0;
 
+	if (vgic_is_v5(vcpu->kvm))
+		num_private_irqs = VGIC_V5_NR_PRIVATE_IRQS;
+	else
+		num_private_irqs = VGIC_NR_PRIVATE_IRQS;
+
 	vgic_cpu->private_irqs = kzalloc_objs(struct vgic_irq,
-					      VGIC_NR_PRIVATE_IRQS,
+					      num_private_irqs,
 					      GFP_KERNEL_ACCOUNT);
 
 	if (!vgic_cpu->private_irqs)
@@ -280,34 +353,11 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type)
 	 * Enable and configure all SGIs to be edge-triggered and
 	 * configure all PPIs as level-triggered.
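	 *
	 * (For vGICv5 the loop index is the raw PPI number; vgic_v5_make_ppi()
	 * wraps it into a typed INTID, conceptually:
	 *
	 *	intid = FIELD_PREP(GICV5_HWIRQ_TYPE, GICV5_HWIRQ_TYPE_PPI) |
	 *		FIELD_PREP(GICV5_HWIRQ_ID, i);
	 *
	 * mirroring the get_vgic_ppi() helper added later in this series.)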
*/ - for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { - struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; - - INIT_LIST_HEAD(&irq->ap_list); - raw_spin_lock_init(&irq->irq_lock); - irq->intid = i; - irq->vcpu = NULL; - irq->target_vcpu = vcpu; - refcount_set(&irq->refcount, 0); - if (vgic_irq_is_sgi(i)) { - /* SGIs */ - irq->enabled = 1; - irq->config = VGIC_CONFIG_EDGE; - } else { - /* PPIs */ - irq->config = VGIC_CONFIG_LEVEL; - } - - switch (type) { - case KVM_DEV_TYPE_ARM_VGIC_V3: - irq->group = 1; - irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu); - break; - case KVM_DEV_TYPE_ARM_VGIC_V2: - irq->group = 0; - irq->targets = BIT(vcpu->vcpu_id); - break; - } + for (i = 0; i < num_private_irqs; i++) { + if (vgic_is_v5(vcpu->kvm)) + vgic_v5_allocate_private_irq(vcpu, i, type); + else + vgic_allocate_private_irq(vcpu, i, type); } return 0; @@ -366,7 +416,11 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) static void kvm_vgic_vcpu_reset(struct kvm_vcpu *vcpu) { - if (kvm_vgic_global_state.type == VGIC_V2) + const struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V5) + vgic_v5_reset(vcpu); + else if (kvm_vgic_global_state.type == VGIC_V2) vgic_v2_reset(vcpu); else vgic_v3_reset(vcpu); @@ -397,22 +451,28 @@ int vgic_init(struct kvm *kvm) if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) return -EBUSY; - /* freeze the number of spis */ - if (!dist->nr_spis) - dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS; + if (!vgic_is_v5(kvm)) { + /* freeze the number of spis */ + if (!dist->nr_spis) + dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS; - ret = kvm_vgic_dist_init(kvm, dist->nr_spis); - if (ret) - goto out; - - /* - * Ensure vPEs are allocated if direct IRQ injection (e.g. vSGIs, - * vLPIs) is supported. - */ - if (vgic_supports_direct_irqs(kvm)) { - ret = vgic_v4_init(kvm); + ret = kvm_vgic_dist_init(kvm, dist->nr_spis); if (ret) - goto out; + return ret; + + /* + * Ensure vPEs are allocated if direct IRQ injection (e.g. vSGIs, + * vLPIs) is supported. 
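+		 *
+		 * (GICv5 VMs never reach this point: vgic_init() takes the
+		 * vgic_v5_init() branch below instead, so no SPIs, no
+		 * distributor iodev and no vPEs are set up for them. The
+		 * resulting init flow is:
+		 *
+		 *	vgic_is_v5(kvm) ? vgic_v5_init()
+		 *			: dist init [+ vgic_v4_init() if
+		 *			  direct IRQs are supported].)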
+ */ + if (vgic_supports_direct_irqs(kvm)) { + ret = vgic_v4_init(kvm); + if (ret) + return ret; + } + } else { + ret = vgic_v5_init(kvm); + if (ret) + return ret; } kvm_for_each_vcpu(idx, vcpu, kvm) @@ -420,12 +480,12 @@ int vgic_init(struct kvm *kvm) ret = kvm_vgic_setup_default_irq_routing(kvm); if (ret) - goto out; + return ret; vgic_debug_init(kvm); dist->initialized = true; -out: - return ret; + + return 0; } static void kvm_vgic_dist_destroy(struct kvm *kvm) @@ -569,6 +629,7 @@ int vgic_lazy_init(struct kvm *kvm) int kvm_vgic_map_resources(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; + bool needs_dist = true; enum vgic_type type; gpa_t dist_base; int ret = 0; @@ -587,21 +648,29 @@ int kvm_vgic_map_resources(struct kvm *kvm) if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) { ret = vgic_v2_map_resources(kvm); type = VGIC_V2; - } else { + } else if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { ret = vgic_v3_map_resources(kvm); type = VGIC_V3; + } else { + ret = vgic_v5_map_resources(kvm); + type = VGIC_V5; + needs_dist = false; } if (ret) goto out; - dist_base = dist->vgic_dist_base; - mutex_unlock(&kvm->arch.config_lock); + if (needs_dist) { + dist_base = dist->vgic_dist_base; + mutex_unlock(&kvm->arch.config_lock); - ret = vgic_register_dist_iodev(kvm, dist_base, type); - if (ret) { - kvm_err("Unable to register VGIC dist MMIO regions\n"); - goto out_slots; + ret = vgic_register_dist_iodev(kvm, dist_base, type); + if (ret) { + kvm_err("Unable to register VGIC dist MMIO regions\n"); + goto out_slots; + } + } else { + mutex_unlock(&kvm->arch.config_lock); } smp_store_release(&dist->ready, true); @@ -617,6 +686,35 @@ out_slots: return ret; } +void kvm_vgic_finalize_idregs(struct kvm *kvm) +{ + u32 type = kvm->arch.vgic.vgic_model; + u64 aa64pfr0, aa64pfr2, pfr1; + + aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; + aa64pfr2 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR2_EL1) & ~ID_AA64PFR2_EL1_GCIE; + pfr1 = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; + + switch (type) { + case KVM_DEV_TYPE_ARM_VGIC_V2: + break; + case KVM_DEV_TYPE_ARM_VGIC_V3: + aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP); + if (kvm_supports_32bit_el0()) + pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC, GICv3); + break; + case KVM_DEV_TYPE_ARM_VGIC_V5: + aa64pfr2 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR2_EL1, GCIE, IMP); + break; + default: + WARN_ONCE(1, "Unknown VGIC type!!!\n"); + } + + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0); + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR2_EL1, aa64pfr2); + kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1); +} + /* GENERIC PROBE */ void kvm_vgic_cpu_up(void) diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index 3d1a776b716d..a96c77dccf35 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -336,6 +336,10 @@ int kvm_register_vgic_device(unsigned long type) break; ret = kvm_vgic_register_its_device(); break; + case KVM_DEV_TYPE_ARM_VGIC_V5: + ret = kvm_register_device_ops(&kvm_arm_vgic_v5_ops, + KVM_DEV_TYPE_ARM_VGIC_V5); + break; } return ret; @@ -639,7 +643,7 @@ static int vgic_v3_set_attr(struct kvm_device *dev, if (vgic_initialized(dev->kvm)) return -EBUSY; - if (!irq_is_ppi(val)) + if (!irq_is_ppi(dev->kvm, val)) return -EINVAL; dev->kvm->arch.vgic.mi_intid = val; @@ -715,3 +719,104 @@ struct kvm_device_ops kvm_arm_vgic_v3_ops = { .get_attr = vgic_v3_get_attr, .has_attr = vgic_v3_has_attr, }; + +static int 
vgic_v5_get_userspace_ppis(struct kvm_device *dev,
+				      struct kvm_device_attr *attr)
+{
+	struct vgic_v5_vm *gicv5_vm = &dev->kvm->arch.vgic.gicv5_vm;
+	u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+	int ret;
+
+	guard(mutex)(&dev->kvm->arch.config_lock);
+
+	/*
+	 * We either support 64 or 128 PPIs. In the former case, we need to
+	 * return 0s for the second 64 bits as we have no storage backing those.
+	 */
+	ret = put_user(bitmap_read(gicv5_vm->userspace_ppis, 0, 64), uaddr);
+	if (ret)
+		return ret;
+	uaddr++;
+
+	if (VGIC_V5_NR_PRIVATE_IRQS == 128)
+		ret = put_user(bitmap_read(gicv5_vm->userspace_ppis, 64, 64), uaddr);
+	else
+		ret = put_user(0, uaddr);
+
+	return ret;
+}
+
+static int vgic_v5_set_attr(struct kvm_device *dev,
+			    struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_ADDR:
+	case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
+	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+		return -ENXIO;
+	case KVM_DEV_ARM_VGIC_GRP_CTRL:
+		switch (attr->attr) {
+		case KVM_DEV_ARM_VGIC_CTRL_INIT:
+			return vgic_set_common_attr(dev, attr);
+		case KVM_DEV_ARM_VGIC_USERSPACE_PPIS:
+		default:
+			return -ENXIO;
+		}
+	default:
+		return -ENXIO;
+	}
+}
+
+static int vgic_v5_get_attr(struct kvm_device *dev,
+			    struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_ADDR:
+	case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
+	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+		return -ENXIO;
+	case KVM_DEV_ARM_VGIC_GRP_CTRL:
+		switch (attr->attr) {
+		case KVM_DEV_ARM_VGIC_CTRL_INIT:
+			return vgic_get_common_attr(dev, attr);
+		case KVM_DEV_ARM_VGIC_USERSPACE_PPIS:
+			return vgic_v5_get_userspace_ppis(dev, attr);
+		default:
+			return -ENXIO;
+		}
+	default:
+		return -ENXIO;
+	}
+}
+
+static int vgic_v5_has_attr(struct kvm_device *dev,
+			    struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_ADDR:
+	case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
+	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+		return -ENXIO;
+	case KVM_DEV_ARM_VGIC_GRP_CTRL:
+		switch (attr->attr) {
+		case KVM_DEV_ARM_VGIC_CTRL_INIT:
+			return 0;
+		case KVM_DEV_ARM_VGIC_USERSPACE_PPIS:
+			return 0;
+		default:
+			return -ENXIO;
+		}
+	default:
+		return -ENXIO;
+	}
+}
+
+struct kvm_device_ops kvm_arm_vgic_v5_ops = {
+	.name = "kvm-arm-vgic-v5",
+	.create = vgic_create,
+	.destroy = vgic_destroy,
+	.set_attr = vgic_v5_set_attr,
+	.get_attr = vgic_v5_get_attr,
+	.has_attr = vgic_v5_has_attr,
+};
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c
index a573b1f0c6cb..74d76dec9730 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio.c
@@ -842,18 +842,46 @@ vgic_find_mmio_region(const struct vgic_register_region *regions,
 
 void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
 {
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_set_vmcr(vcpu, vmcr);
-	else
+	const struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+	switch (dist->vgic_model) {
+	case KVM_DEV_TYPE_ARM_VGIC_V5:
+		vgic_v5_set_vmcr(vcpu, vmcr);
+		break;
+	case KVM_DEV_TYPE_ARM_VGIC_V3:
 		vgic_v3_set_vmcr(vcpu, vmcr);
+		break;
+	case KVM_DEV_TYPE_ARM_VGIC_V2:
+		if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+			vgic_v3_set_vmcr(vcpu, vmcr);
+		else
+			vgic_v2_set_vmcr(vcpu, vmcr);
+		break;
+	default:
+		BUG();
+	}
 }
 
 void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
 {
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_get_vmcr(vcpu, vmcr);
-	else
+	const struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+	switch (dist->vgic_model) {
+	case KVM_DEV_TYPE_ARM_VGIC_V5:
+		vgic_v5_get_vmcr(vcpu, vmcr);
+		break;
+	case KVM_DEV_TYPE_ARM_VGIC_V3:
 		vgic_v3_get_vmcr(vcpu, vmcr);
+		break;
+	case KVM_DEV_TYPE_ARM_VGIC_V2:
+		if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+			vgic_v3_get_vmcr(vcpu, vmcr);
+		else
+			vgic_v2_get_vmcr(vcpu, vmcr);
+		break;
+	default:
+		BUG();
+	}
 }
 
 /*
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 6a355eca1934..9e841e7afd4a 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -499,7 +499,7 @@ void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu)
 {
 	struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
 
-	if (!vgic_is_v3(vcpu->kvm))
+	if (!vgic_host_has_gicv3())
 		return;
 
 	/* Hide GICv3 sysreg if necessary */
diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c
index 331651087e2c..fdd39ea7f83e 100644
--- a/arch/arm64/kvm/vgic/vgic-v5.c
+++ b/arch/arm64/kvm/vgic/vgic-v5.c
@@ -1,28 +1,52 @@
 // SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025, 2026 Arm Ltd.
+ */
 
 #include
+
+#include
 #include
 
 #include "vgic.h"
 
+static struct vgic_v5_ppi_caps ppi_caps;
+
+/*
+ * Not all PPIs are guaranteed to be implemented for GICv5. Determine which
+ * ones are, and generate a mask.
+ */
+static void vgic_v5_get_implemented_ppis(void)
+{
+	if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF))
+		return;
+
+	/*
+	 * If we have KVM, we have EL2, which means that we have support for the
+	 * EL1 and EL2 Physical & Virtual timers.
+	 */
+	__assign_bit(GICV5_ARCH_PPI_CNTHP, ppi_caps.impl_ppi_mask, 1);
+	__assign_bit(GICV5_ARCH_PPI_CNTV, ppi_caps.impl_ppi_mask, 1);
+	__assign_bit(GICV5_ARCH_PPI_CNTHV, ppi_caps.impl_ppi_mask, 1);
+	__assign_bit(GICV5_ARCH_PPI_CNTP, ppi_caps.impl_ppi_mask, 1);
+
+	/* The SW_PPI should be available */
+	__assign_bit(GICV5_ARCH_PPI_SW_PPI, ppi_caps.impl_ppi_mask, 1);
+
+	/* The PMUIRQ is available if we have the PMU */
+	__assign_bit(GICV5_ARCH_PPI_PMUIRQ, ppi_caps.impl_ppi_mask, system_supports_pmuv3());
+}
+
 /*
  * Probe for a vGICv5 compatible interrupt controller, returning 0 on success.
- * Currently only supports GICv3-based VMs on a GICv5 host, and hence only
- * registers a VGIC_V3 device.
  */
 int vgic_v5_probe(const struct gic_kvm_info *info)
 {
+	bool v5_registered = false;
 	u64 ich_vtr_el2;
 	int ret;
 
-	if (!cpus_have_final_cap(ARM64_HAS_GICV5_LEGACY))
-		return -ENODEV;
-
 	kvm_vgic_global_state.type = VGIC_V5;
-	kvm_vgic_global_state.has_gcie_v3_compat = true;
-
-	/* We only support v3 compat mode - use vGICv3 limits */
-	kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS;
 
 	kvm_vgic_global_state.vcpu_base = 0;
 	kvm_vgic_global_state.vctrl_base = NULL;
@@ -30,6 +54,38 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 	kvm_vgic_global_state.has_gicv4 = false;
 	kvm_vgic_global_state.has_gicv4_1 = false;
 
+	/*
+	 * GICv5 is currently not supported in Protected mode. Skip the
+	 * registration of GICv5 completely to make sure userspace cannot
+	 * create a GICv5-based VM.
+	 */
+	if (is_protected_kvm_enabled()) {
+		kvm_info("GICv5-based guests are not supported with pKVM\n");
+		goto skip_v5;
+	}
+
+	kvm_vgic_global_state.max_gic_vcpus = VGIC_V5_MAX_CPUS;
+
+	vgic_v5_get_implemented_ppis();
+
+	ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V5);
+	if (ret) {
+		kvm_err("Cannot register GICv5 KVM device.\n");
+		goto skip_v5;
+	}
+
+	v5_registered = true;
+	kvm_info("GCIE system register CPU interface\n");
+
+skip_v5:
+	/*
+	 * If we don't support the GICv3 compat mode we're done.
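+	 * The decision matrix on a GICv5 host then works out as:
+	 *
+	 *	pKVM	FEAT_GCIE_LEGACY	registered vGICs
+	 *	 no	 no			 v5 only
+	 *	 no	 yes			 v5 and v3 compat
+	 *	 yes	 no			 none (-ENODEV)
+	 *	 yes	 yes			 v3 compat only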
+	 */
+	if (!cpus_have_final_cap(ARM64_HAS_GICV5_LEGACY)) {
+		if (!v5_registered)
+			return -ENODEV;
+		return 0;
+	}
+
+	kvm_vgic_global_state.has_gcie_v3_compat = true;
 
 	ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config);
 	kvm_vgic_global_state.ich_vtr_el2 = (u32)ich_vtr_el2;
@@ -45,6 +101,10 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 		return ret;
 	}
 
+	/* We potentially limit the max VCPUs further than we need to here */
+	kvm_vgic_global_state.max_gic_vcpus = min(VGIC_V3_MAX_CPUS,
+						  VGIC_V5_MAX_CPUS);
+
 	static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
 	kvm_info("GCIE legacy system register CPU interface\n");
@@ -52,3 +112,424 @@
 	return 0;
 }
+
+void vgic_v5_reset(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * We always present 16 bits of ID space to the guest, irrespective of
+	 * the host allowing more.
+	 */
+	vcpu->arch.vgic_cpu.num_id_bits = ICC_IDR0_EL1_ID_BITS_16BITS;
+
+	/*
+	 * The GICv5 architecture only supports 5 bits of priority in the
+	 * CPUIF (but potentially fewer in the IRS).
+	 */
+	vcpu->arch.vgic_cpu.num_pri_bits = 5;
+}
+
+int vgic_v5_init(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu;
+	unsigned long idx;
+
+	if (vgic_initialized(kvm))
+		return 0;
+
+	kvm_for_each_vcpu(idx, vcpu, kvm) {
+		if (vcpu_has_nv(vcpu)) {
+			kvm_err("Nested GICv5 VMs are currently unsupported\n");
+			return -EINVAL;
+		}
+	}
+
+	/* We only allow userspace to drive the SW_PPI, if it is implemented. */
+	bitmap_zero(kvm->arch.vgic.gicv5_vm.userspace_ppis,
+		    VGIC_V5_NR_PRIVATE_IRQS);
+	__set_bit(GICV5_ARCH_PPI_SW_PPI,
+		  kvm->arch.vgic.gicv5_vm.userspace_ppis);
+	bitmap_and(kvm->arch.vgic.gicv5_vm.userspace_ppis,
+		   kvm->arch.vgic.gicv5_vm.userspace_ppis,
+		   ppi_caps.impl_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS);
+
+	return 0;
+}
+
+int vgic_v5_map_resources(struct kvm *kvm)
+{
+	if (!vgic_initialized(kvm))
+		return -EBUSY;
+
+	return 0;
+}
+
+int vgic_v5_finalize_ppi_state(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu0;
+	int i;
+
+	if (!vgic_is_v5(kvm))
+		return 0;
+
+	guard(mutex)(&kvm->arch.config_lock);
+
+	/*
+	 * If SW_PPI has been advertised, then we know we already
+	 * initialised the whole thing, and we can return early. Yes,
+	 * this is pretty hackish as far as state tracking goes...
+	 */
+	if (test_bit(GICV5_ARCH_PPI_SW_PPI, kvm->arch.vgic.gicv5_vm.vgic_ppi_mask))
+		return 0;
+
+	/* The PPI state for all VCPUs should be the same. Pick the first. */
+	vcpu0 = kvm_get_vcpu(kvm, 0);
+
+	bitmap_zero(kvm->arch.vgic.gicv5_vm.vgic_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS);
+	bitmap_zero(kvm->arch.vgic.gicv5_vm.vgic_ppi_hmr, VGIC_V5_NR_PRIVATE_IRQS);
+
+	for_each_set_bit(i, ppi_caps.impl_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS) {
+		const u32 intid = vgic_v5_make_ppi(i);
+		struct vgic_irq *irq;
+
+		irq = vgic_get_vcpu_irq(vcpu0, intid);
+
+		/* Only expose PPIs that have an owner, plus the SW_PPI */
+		scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
+			if (irq->owner || i == GICV5_ARCH_PPI_SW_PPI) {
+				__assign_bit(i, kvm->arch.vgic.gicv5_vm.vgic_ppi_mask, 1);
+				__assign_bit(i, kvm->arch.vgic.gicv5_vm.vgic_ppi_hmr,
+					     irq->config == VGIC_CONFIG_LEVEL);
+			}
+		}
+
+		vgic_put_irq(vcpu0->kvm, irq);
+	}
+
+	return 0;
+}
+
+static u32 vgic_v5_get_effective_priority_mask(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+	u32 highest_ap, priority_mask, apr;
+
+	/*
+	 * If the guest's CPU has not opted to receive interrupts, then the
+	 * effective running priority is the highest priority.
Just return 0 + * (the highest priority). + */ + if (!FIELD_GET(FEAT_GCIE_ICH_VMCR_EL2_EN, cpu_if->vgic_vmcr)) + return 0; + + /* + * Counting the number of trailing zeros gives the current active + * priority. Explicitly use the 32-bit version here as we have 32 + * priorities. 32 then means that there are no active priorities. + */ + apr = cpu_if->vgic_apr; + highest_ap = apr ? __builtin_ctz(apr) : 32; + + /* + * An interrupt is of sufficient priority if it is equal to or + * greater than the priority mask. Add 1 to the priority mask + * (i.e., lower priority) to match the APR logic before taking + * the min. This gives us the lowest priority that is masked. + */ + priority_mask = FIELD_GET(FEAT_GCIE_ICH_VMCR_EL2_VPMR, cpu_if->vgic_vmcr); + + return min(highest_ap, priority_mask + 1); +} + +/* + * For GICv5, the PPIs are mostly directly managed by the hardware. We (the + * hypervisor) handle the pending, active, enable state save/restore, but don't + * need the PPIs to be queued on a per-VCPU AP list. Therefore, sanity check the + * state, unlock, and return. + */ +bool vgic_v5_ppi_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, + unsigned long flags) + __releases(&irq->irq_lock) +{ + struct kvm_vcpu *vcpu; + + lockdep_assert_held(&irq->irq_lock); + + if (WARN_ON_ONCE(!__irq_is_ppi(KVM_DEV_TYPE_ARM_VGIC_V5, irq->intid))) + goto out_unlock_fail; + + vcpu = irq->target_vcpu; + if (WARN_ON_ONCE(!vcpu)) + goto out_unlock_fail; + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + /* Directly kick the target VCPU to make sure it sees the IRQ */ + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + + return true; + +out_unlock_fail: + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + return false; +} + +/* + * Sets/clears the corresponding bit in the ICH_PPI_DVIR register. + */ +void vgic_v5_set_ppi_dvi(struct kvm_vcpu *vcpu, struct vgic_irq *irq, bool dvi) +{ + struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5; + u32 ppi; + + lockdep_assert_held(&irq->irq_lock); + + ppi = vgic_v5_get_hwirq_id(irq->intid); + __assign_bit(ppi, cpu_if->vgic_ppi_dvir, dvi); +} + +static struct irq_ops vgic_v5_ppi_irq_ops = { + .queue_irq_unlock = vgic_v5_ppi_queue_irq_unlock, + .set_direct_injection = vgic_v5_set_ppi_dvi, +}; + +void vgic_v5_set_ppi_ops(struct kvm_vcpu *vcpu, u32 vintid) +{ + kvm_vgic_set_irq_ops(vcpu, vintid, &vgic_v5_ppi_irq_ops); +} + +/* + * Sync back the PPI priorities to the vgic_irq shadow state for any interrupts + * exposed to the guest (skipping all others). + */ +static void vgic_v5_sync_ppi_priorities(struct kvm_vcpu *vcpu) +{ + struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5; + u64 priorityr; + int i; + + /* + * We have up to 16 PPI Priority regs, but only have a few interrupts + * that the guest is allowed to use. Limit our sync of PPI priorities to + * those actually exposed to the guest by first iterating over the mask + * of exposed PPIs. + */ + for_each_set_bit(i, vcpu->kvm->arch.vgic.gicv5_vm.vgic_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS) { + u32 intid = vgic_v5_make_ppi(i); + struct vgic_irq *irq; + int pri_idx, pri_reg, pri_bit; + u8 priority; + + /* + * Determine which priority register and the field within it to + * extract. 
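+		 * For example, PPI 11 yields pri_reg = 1 and pri_bit = 24,
+		 * i.e. bits [28:24] of ICH_PPI_PRIORITYR1_EL2 (its Priority3
+		 * field in the sysreg description below).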
+		 */
+		pri_reg = i / 8;
+		pri_idx = i % 8;
+		pri_bit = pri_idx * 8;
+
+		priorityr = cpu_if->vgic_ppi_priorityr[pri_reg];
+		priority = field_get(GENMASK(pri_bit + 4, pri_bit), priorityr);
+
+		irq = vgic_get_vcpu_irq(vcpu, intid);
+
+		scoped_guard(raw_spinlock_irqsave, &irq->irq_lock)
+			irq->priority = priority;
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+
+bool vgic_v5_has_pending_ppi(struct kvm_vcpu *vcpu)
+{
+	unsigned int priority_mask;
+	int i;
+
+	priority_mask = vgic_v5_get_effective_priority_mask(vcpu);
+
+	/*
+	 * If the combined priority mask is 0, nothing can be signalled! In the
+	 * case where the guest has disabled interrupt delivery for the vcpu
+	 * (via ICV_CR0_EL1.EN->ICH_VMCR_EL2.EN), we calculate the priority mask
+	 * as 0 too (the highest possible priority).
+	 */
+	if (!priority_mask)
+		return false;
+
+	for_each_set_bit(i, vcpu->kvm->arch.vgic.gicv5_vm.vgic_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS) {
+		u32 intid = vgic_v5_make_ppi(i);
+		bool has_pending = false;
+		struct vgic_irq *irq;
+
+		irq = vgic_get_vcpu_irq(vcpu, intid);
+
+		scoped_guard(raw_spinlock_irqsave, &irq->irq_lock)
+			if (irq->enabled && irq->priority < priority_mask)
+				has_pending = irq->hw ? vgic_get_phys_line_level(irq) : irq_is_pending(irq);
+
+		vgic_put_irq(vcpu->kvm, irq);
+
+		if (has_pending)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Detect any PPI state changes, and propagate the state to KVM's
+ * shadow structures.
+ */
+void vgic_v5_fold_ppi_state(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+	unsigned long *activer, *pendr;
+	int i;
+
+	activer = host_data_ptr(vgic_v5_ppi_state)->activer_exit;
+	pendr = host_data_ptr(vgic_v5_ppi_state)->pendr;
+
+	for_each_set_bit(i, vcpu->kvm->arch.vgic.gicv5_vm.vgic_ppi_mask,
+			 VGIC_V5_NR_PRIVATE_IRQS) {
+		u32 intid = vgic_v5_make_ppi(i);
+		struct vgic_irq *irq;
+
+		irq = vgic_get_vcpu_irq(vcpu, intid);
+
+		scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
+			irq->active = test_bit(i, activer);
+
+			/* This is an OR to avoid losing incoming edges! */
+			if (irq->config == VGIC_CONFIG_EDGE)
+				irq->pending_latch |= test_bit(i, pendr);
+		}
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	/*
+	 * Re-inject the exit state as entry state next time!
+	 *
+	 * Note that the write of the Enable state is trapped, and hence there
+	 * is nothing to explicitly sync back here as we already have the latest
+	 * copy by definition.
+	 */
+	bitmap_copy(cpu_if->vgic_ppi_activer, activer, VGIC_V5_NR_PRIVATE_IRQS);
+}
+
+void vgic_v5_flush_ppi_state(struct kvm_vcpu *vcpu)
+{
+	DECLARE_BITMAP(pendr, VGIC_V5_NR_PRIVATE_IRQS);
+	int i;
+
+	/*
+	 * Time to enter the guest - we first need to build the guest's
+	 * ICC_PPI_PENDRx_EL1, however.
+	 */
+	bitmap_zero(pendr, VGIC_V5_NR_PRIVATE_IRQS);
+	for_each_set_bit(i, vcpu->kvm->arch.vgic.gicv5_vm.vgic_ppi_mask,
+			 VGIC_V5_NR_PRIVATE_IRQS) {
+		u32 intid = vgic_v5_make_ppi(i);
+		struct vgic_irq *irq;
+
+		irq = vgic_get_vcpu_irq(vcpu, intid);
+
+		scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
+			__assign_bit(i, pendr, irq_is_pending(irq));
+			if (irq->config == VGIC_CONFIG_EDGE)
+				irq->pending_latch = false;
+		}
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	/*
+	 * Copy the shadow state to the pending reg that will be written to the
+	 * ICH_PPI_PENDRx_EL2 regs. While the guest is running we track any
+	 * incoming changes to the pending state in the vgic_irq structures. The
+	 * incoming changes are merged with the outgoing changes on the return
+	 * path.
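+	 *
+	 * Concretely: an edge PPI that fires while the guest runs sets its
+	 * hardware PENDR bit; on exit, vgic_v5_fold_ppi_state() ORs that bit
+	 * into pending_latch, so the edge survives even though this function
+	 * rewrites PENDR (and clears pending_latch) on the next entry.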
+	 */
+	bitmap_copy(host_data_ptr(vgic_v5_ppi_state)->pendr, pendr,
+		    VGIC_V5_NR_PRIVATE_IRQS);
+}
+
+void vgic_v5_load(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+
+	/*
+	 * On the WFI path, vgic_load is called a second time. The first is when
+	 * scheduling in the vcpu thread again, and the second is when leaving
+	 * WFI. Skip the second instance as it serves no purpose and just
+	 * restores the same state again.
+	 */
+	if (cpu_if->gicv5_vpe.resident)
+		return;
+
+	kvm_call_hyp(__vgic_v5_restore_vmcr_apr, cpu_if);
+
+	cpu_if->gicv5_vpe.resident = true;
+}
+
+void vgic_v5_put(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+
+	/*
+	 * Do nothing if we're not resident. This can happen on the WFI path,
+	 * where we do a vgic_put when entering WFI and again later when
+	 * descheduling the thread. We risk losing VMCR state if we sync it
+	 * twice, so instead return early in this case.
+	 */
+	if (!cpu_if->gicv5_vpe.resident)
+		return;
+
+	kvm_call_hyp(__vgic_v5_save_apr, cpu_if);
+
+	cpu_if->gicv5_vpe.resident = false;
+
+	/* The shadow priority is only updated on entering WFI */
+	if (vcpu_get_flag(vcpu, IN_WFI))
+		vgic_v5_sync_ppi_priorities(vcpu);
+}
+
+void vgic_v5_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+	struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+	u64 vmcr = cpu_if->vgic_vmcr;
+
+	vmcrp->en = FIELD_GET(FEAT_GCIE_ICH_VMCR_EL2_EN, vmcr);
+	vmcrp->pmr = FIELD_GET(FEAT_GCIE_ICH_VMCR_EL2_VPMR, vmcr);
+}
+
+void vgic_v5_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+	struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+	u64 vmcr;
+
+	vmcr = FIELD_PREP(FEAT_GCIE_ICH_VMCR_EL2_VPMR, vmcrp->pmr) |
+	       FIELD_PREP(FEAT_GCIE_ICH_VMCR_EL2_EN, vmcrp->en);
+
+	cpu_if->vgic_vmcr = vmcr;
+}
+
+void vgic_v5_restore_state(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+
+	__vgic_v5_restore_state(cpu_if);
+	__vgic_v5_restore_ppi_state(cpu_if);
+	dsb(sy);
+}
+
+void vgic_v5_save_state(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+
+	__vgic_v5_save_state(cpu_if);
+	__vgic_v5_save_ppi_state(cpu_if);
+	dsb(sy);
+}
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index e22b79cfff96..1e9fe8764584 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -86,6 +86,10 @@ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
  */
 struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid)
 {
+	/* Non-private IRQs are not yet implemented for GICv5 */
+	if (vgic_is_v5(kvm))
+		return NULL;
+
 	/* SPIs */
 	if (intid >= VGIC_NR_PRIVATE_IRQS &&
 	    intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) {
@@ -94,7 +98,7 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid)
 	}
 
 	/* LPIs */
-	if (intid >= VGIC_MIN_LPI)
+	if (irq_is_lpi(kvm, intid))
 		return vgic_get_lpi(kvm, intid);
 
 	return NULL;
@@ -105,6 +109,18 @@ struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid)
 	if (WARN_ON(!vcpu))
 		return NULL;
 
+	if (vgic_is_v5(vcpu->kvm)) {
+		u32 int_num, hwirq_id;
+
+		if (!__irq_is_ppi(KVM_DEV_TYPE_ARM_VGIC_V5, intid))
+			return NULL;
+
+		hwirq_id = FIELD_GET(GICV5_HWIRQ_ID, intid);
+		int_num = array_index_nospec(hwirq_id, VGIC_V5_NR_PRIVATE_IRQS);
+
+		return &vcpu->arch.vgic_cpu.private_irqs[int_num];
+	}
+
 	/* SGIs and PPIs */
 	if (intid < VGIC_NR_PRIVATE_IRQS) {
 		intid = array_index_nospec(intid, VGIC_NR_PRIVATE_IRQS);
@@ -123,7 +139,7 @@ static void
vgic_release_lpi_locked(struct vgic_dist *dist, struct vgic_irq *irq static __must_check bool __vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) { - if (irq->intid < VGIC_MIN_LPI) + if (!irq_is_lpi(kvm, irq->intid)) return false; return refcount_dec_and_test(&irq->refcount); @@ -148,7 +164,7 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) * Acquire/release it early on lockdep kernels to make locking issues * in rare release paths a bit more obvious. */ - if (IS_ENABLED(CONFIG_LOCKDEP) && irq->intid >= VGIC_MIN_LPI) { + if (IS_ENABLED(CONFIG_LOCKDEP) && irq_is_lpi(kvm, irq->intid)) { guard(spinlock_irqsave)(&dist->lpi_xa.xa_lock); } @@ -186,7 +202,7 @@ void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) { - if (irq->intid >= VGIC_MIN_LPI) { + if (irq_is_lpi(vcpu->kvm, irq->intid)) { raw_spin_lock(&irq->irq_lock); list_del(&irq->ap_list); irq->vcpu = NULL; @@ -404,6 +420,9 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, lockdep_assert_held(&irq->irq_lock); + if (irq->ops && irq->ops->queue_irq_unlock) + return irq->ops->queue_irq_unlock(kvm, irq, flags); + retry: vcpu = vgic_target_oracle(irq); if (irq->vcpu || !vcpu) { @@ -521,12 +540,12 @@ int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, if (ret) return ret; - if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS) + if (!vcpu && irq_is_private(kvm, intid)) return -EINVAL; trace_vgic_update_irq_pending(vcpu ? vcpu->vcpu_idx : 0, intid, level); - if (intid < VGIC_NR_PRIVATE_IRQS) + if (irq_is_private(kvm, intid)) irq = vgic_get_vcpu_irq(vcpu, intid); else irq = vgic_get_irq(kvm, intid); @@ -553,10 +572,27 @@ int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, return 0; } +void kvm_vgic_set_irq_ops(struct kvm_vcpu *vcpu, u32 vintid, + struct irq_ops *ops) +{ + struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid); + + BUG_ON(!irq); + + scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) + irq->ops = ops; + + vgic_put_irq(vcpu->kvm, irq); +} + +void kvm_vgic_clear_irq_ops(struct kvm_vcpu *vcpu, u32 vintid) +{ + kvm_vgic_set_irq_ops(vcpu, vintid, NULL); +} + /* @irq->irq_lock must be held */ static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq, - unsigned int host_irq, - struct irq_ops *ops) + unsigned int host_irq) { struct irq_desc *desc; struct irq_data *data; @@ -576,20 +612,25 @@ static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq, irq->hw = true; irq->host_irq = host_irq; irq->hwintid = data->hwirq; - irq->ops = ops; + + if (irq->ops && irq->ops->set_direct_injection) + irq->ops->set_direct_injection(vcpu, irq, true); + return 0; } /* @irq->irq_lock must be held */ static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq) { + if (irq->ops && irq->ops->set_direct_injection) + irq->ops->set_direct_injection(irq->target_vcpu, irq, false); + irq->hw = false; irq->hwintid = 0; - irq->ops = NULL; } int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, - u32 vintid, struct irq_ops *ops) + u32 vintid) { struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid); unsigned long flags; @@ -598,7 +639,7 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, BUG_ON(!irq); raw_spin_lock_irqsave(&irq->irq_lock, flags); - ret = kvm_vgic_map_irq(vcpu, irq, host_irq, ops); + ret = kvm_vgic_map_irq(vcpu, irq, host_irq); raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); @@ -685,7 
+726,7 @@ int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner) return -EAGAIN; /* SGIs and LPIs cannot be wired up to any device */ - if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid)) + if (!irq_is_ppi(vcpu->kvm, intid) && !vgic_valid_spi(vcpu->kvm, intid)) return -EINVAL; irq = vgic_get_vcpu_irq(vcpu, intid); @@ -812,8 +853,13 @@ retry: vgic_release_deleted_lpis(vcpu->kvm); } -static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) +static void vgic_fold_state(struct kvm_vcpu *vcpu) { + if (vgic_is_v5(vcpu->kvm)) { + vgic_v5_fold_ppi_state(vcpu); + return; + } + if (!*host_data_ptr(last_lr_irq)) return; @@ -1002,7 +1048,10 @@ static inline bool can_access_vgic_from_kernel(void) static inline void vgic_save_state(struct kvm_vcpu *vcpu) { - if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + /* No switch statement here. See comment in vgic_restore_state() */ + if (vgic_is_v5(vcpu->kvm)) + vgic_v5_save_state(vcpu); + else if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_save_state(vcpu); else __vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3); @@ -1011,20 +1060,24 @@ static inline void vgic_save_state(struct kvm_vcpu *vcpu) /* Sync back the hardware VGIC state into our emulation after a guest's run. */ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) { - /* If nesting, emulate the HW effect from L0 to L1 */ - if (vgic_state_is_nested(vcpu)) { - vgic_v3_sync_nested(vcpu); - return; - } + if (vgic_is_v3(vcpu->kvm)) { + /* If nesting, emulate the HW effect from L0 to L1 */ + if (vgic_state_is_nested(vcpu)) { + vgic_v3_sync_nested(vcpu); + return; + } - if (vcpu_has_nv(vcpu)) - vgic_v3_nested_update_mi(vcpu); + if (vcpu_has_nv(vcpu)) + vgic_v3_nested_update_mi(vcpu); + } if (can_access_vgic_from_kernel()) vgic_save_state(vcpu); - vgic_fold_lr_state(vcpu); - vgic_prune_ap_list(vcpu); + vgic_fold_state(vcpu); + + if (!vgic_is_v5(vcpu->kvm)) + vgic_prune_ap_list(vcpu); } /* Sync interrupts that were deactivated through a DIR trap */ @@ -1040,12 +1093,34 @@ void kvm_vgic_process_async_update(struct kvm_vcpu *vcpu) static inline void vgic_restore_state(struct kvm_vcpu *vcpu) { - if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + /* + * As nice as it would be to restructure this code into a switch + * statement as can be found elsewhere, the logic quickly gets ugly. + * + * __vgic_v3_restore_state() is doing a lot of heavy lifting here. It is + * required for GICv3-on-GICv3, GICv2-on-GICv3, GICv3-on-GICv5, and the + * no-in-kernel-irqchip case on GICv3 hardware. Hence, adding a switch + * here results in much more complex code. + */ + if (vgic_is_v5(vcpu->kvm)) + vgic_v5_restore_state(vcpu); + else if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_restore_state(vcpu); else __vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3); } +static void vgic_flush_state(struct kvm_vcpu *vcpu) +{ + if (vgic_is_v5(vcpu->kvm)) { + vgic_v5_flush_ppi_state(vcpu); + return; + } + + scoped_guard(raw_spinlock, &vcpu->arch.vgic_cpu.ap_list_lock) + vgic_flush_lr_state(vcpu); +} + /* Flush our emulation state into the GIC hardware before entering the guest. 
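 * For a vGICv5 vcpu the flush/sync pair reduces to mirroring the PPI
 * bitmaps, with no LR or ap_list management involved:
 *
 *	flush:	vgic_irq pending state       -> ICH_PPI_PENDRx_EL2 shadow
 *	sync:	ICH_PPI_{PENDR,ACTIVER}x_EL2 -> vgic_irq state
 *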
*/ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) { @@ -1082,42 +1157,69 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - scoped_guard(raw_spinlock, &vcpu->arch.vgic_cpu.ap_list_lock) - vgic_flush_lr_state(vcpu); + vgic_flush_state(vcpu); if (can_access_vgic_from_kernel()) vgic_restore_state(vcpu); - if (vgic_supports_direct_irqs(vcpu->kvm)) + if (vgic_supports_direct_irqs(vcpu->kvm) && kvm_vgic_global_state.has_gicv4) vgic_v4_commit(vcpu); } void kvm_vgic_load(struct kvm_vcpu *vcpu) { + const struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; } - if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) - vgic_v2_load(vcpu); - else + switch (dist->vgic_model) { + case KVM_DEV_TYPE_ARM_VGIC_V5: + vgic_v5_load(vcpu); + break; + case KVM_DEV_TYPE_ARM_VGIC_V3: vgic_v3_load(vcpu); + break; + case KVM_DEV_TYPE_ARM_VGIC_V2: + if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + vgic_v3_load(vcpu); + else + vgic_v2_load(vcpu); + break; + default: + BUG(); + } } void kvm_vgic_put(struct kvm_vcpu *vcpu) { + const struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; } - if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) - vgic_v2_put(vcpu); - else + switch (dist->vgic_model) { + case KVM_DEV_TYPE_ARM_VGIC_V5: + vgic_v5_put(vcpu); + break; + case KVM_DEV_TYPE_ARM_VGIC_V3: vgic_v3_put(vcpu); + break; + case KVM_DEV_TYPE_ARM_VGIC_V2: + if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + vgic_v3_put(vcpu); + else + vgic_v2_put(vcpu); + break; + default: + BUG(); + } } int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) @@ -1128,6 +1230,9 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) unsigned long flags; struct vgic_vmcr vmcr; + if (vgic_is_v5(vcpu->kvm)) + return vgic_v5_has_pending_ppi(vcpu); + if (!vcpu->kvm->arch.vgic.enabled) return false; diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index c9b3bb07e483..9d941241c8a2 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -187,6 +187,7 @@ static inline u64 vgic_ich_hcr_trap_bits(void) * registers regardless of the hardware backed GIC used. 
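 * For vGICv5 only 'en' and 'pmr' carry state, mapped straight onto
 * ICH_VMCR_EL2:
 *
 *	vmcr->en  = FIELD_GET(FEAT_GCIE_ICH_VMCR_EL2_EN,   vgic_vmcr);
 *	vmcr->pmr = FIELD_GET(FEAT_GCIE_ICH_VMCR_EL2_VPMR, vgic_vmcr);
 *
 * as implemented by vgic_v5_get_vmcr() earlier in this series.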
*/ struct vgic_vmcr { + u32 en; /* GICv5-specific */ u32 grpen0; u32 grpen1; @@ -363,6 +364,19 @@ void vgic_debug_init(struct kvm *kvm); void vgic_debug_destroy(struct kvm *kvm); int vgic_v5_probe(const struct gic_kvm_info *info); +void vgic_v5_reset(struct kvm_vcpu *vcpu); +int vgic_v5_init(struct kvm *kvm); +int vgic_v5_map_resources(struct kvm *kvm); +void vgic_v5_set_ppi_ops(struct kvm_vcpu *vcpu, u32 vintid); +bool vgic_v5_has_pending_ppi(struct kvm_vcpu *vcpu); +void vgic_v5_flush_ppi_state(struct kvm_vcpu *vcpu); +void vgic_v5_fold_ppi_state(struct kvm_vcpu *vcpu); +void vgic_v5_load(struct kvm_vcpu *vcpu); +void vgic_v5_put(struct kvm_vcpu *vcpu); +void vgic_v5_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +void vgic_v5_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +void vgic_v5_restore_state(struct kvm_vcpu *vcpu); +void vgic_v5_save_state(struct kvm_vcpu *vcpu); static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu) { @@ -425,15 +439,6 @@ void vgic_its_invalidate_all_caches(struct kvm *kvm); int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq); int vgic_its_invall(struct kvm_vcpu *vcpu); -bool system_supports_direct_sgis(void); -bool vgic_supports_direct_msis(struct kvm *kvm); -bool vgic_supports_direct_sgis(struct kvm *kvm); - -static inline bool vgic_supports_direct_irqs(struct kvm *kvm) -{ - return vgic_supports_direct_msis(kvm) || vgic_supports_direct_sgis(kvm); -} - int vgic_v4_init(struct kvm *kvm); void vgic_v4_teardown(struct kvm *kvm); void vgic_v4_configure_vsgis(struct kvm *kvm); @@ -447,6 +452,11 @@ static inline bool kvm_has_gicv3(struct kvm *kvm) return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP); } +static inline bool kvm_has_gicv5(struct kvm *kvm) +{ + return kvm_has_feat(kvm, ID_AA64PFR2_EL1, GCIE, IMP); +} + void vgic_v3_flush_nested(struct kvm_vcpu *vcpu); void vgic_v3_sync_nested(struct kvm_vcpu *vcpu); void vgic_v3_load_nested(struct kvm_vcpu *vcpu); @@ -454,15 +464,32 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu); void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu); void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu); -static inline bool vgic_is_v3_compat(struct kvm *kvm) +static inline bool vgic_host_has_gicv3(void) { - return cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF) && + /* + * Either the host is a native GICv3, or it is GICv5 with + * FEAT_GCIE_LEGACY. 
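+	 * On such a host both helpers return true:
+	 *
+	 *	vgic_host_has_gicv3()	-> true (via the compat mode)
+	 *	vgic_host_has_gicv5()	-> true (native)
+	 *
+	 * and userspace may create either flavour of vGIC.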
+ */ + return kvm_vgic_global_state.type == VGIC_V3 || kvm_vgic_global_state.has_gcie_v3_compat; } -static inline bool vgic_is_v3(struct kvm *kvm) +static inline bool vgic_host_has_gicv5(void) { - return kvm_vgic_global_state.type == VGIC_V3 || vgic_is_v3_compat(kvm); + return kvm_vgic_global_state.type == VGIC_V5; +} + +bool system_supports_direct_sgis(void); +bool vgic_supports_direct_msis(struct kvm *kvm); +bool vgic_supports_direct_sgis(struct kvm *kvm); + +static inline bool vgic_supports_direct_irqs(struct kvm *kvm) +{ + /* GICv5 always supports direct IRQs */ + if (vgic_is_v5(kvm)) + return true; + + return vgic_supports_direct_msis(kvm) || vgic_supports_direct_sgis(kvm); } int vgic_its_debug_init(struct kvm_device *dev); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index be9dab2c7d6a..7eacc7b45c1f 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -43,6 +43,7 @@ #include #include #include +#include struct fault_info { int (*fn)(unsigned long far, unsigned long esr, @@ -269,6 +270,15 @@ static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr return false; } +static bool is_pkvm_stage2_abort(unsigned int esr) +{ + /* + * S1PTW should only ever be set in ESR_EL1 if the pkvm hypervisor + * injected a stage-2 abort -- see host_inject_mem_abort(). + */ + return is_pkvm_initialized() && (esr & ESR_ELx_S1PTW); +} + static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) @@ -289,8 +299,14 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, * If we now have a valid translation, treat the translation fault as * spurious. */ - if (!(par & SYS_PAR_EL1_F)) + if (!(par & SYS_PAR_EL1_F)) { + if (is_pkvm_stage2_abort(esr)) { + par &= SYS_PAR_EL1_PA; + return pkvm_force_reclaim_guest_page(par); + } + return true; + } /* * If we got a different type of fault from the AT instruction, @@ -376,9 +392,11 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr, if (!is_el1_instruction_abort(esr) && fixup_exception(regs, esr)) return; - if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs), - "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr)) + if (is_spurious_el1_translation_fault(addr, esr, regs)) { + WARN_RATELIMIT(!is_pkvm_stage2_abort(esr), + "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr); return; + } if (is_el1_mte_sync_tag_check_fault(esr)) { do_tag_recovery(addr, esr, regs); @@ -395,6 +413,8 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr, msg = "read from unreadable memory"; } else if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; + } else if (is_pkvm_stage2_abort(esr)) { + msg = "access to hypervisor-protected memory"; } else { if (esr_fsc_is_translation_fault(esr) && kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) @@ -621,6 +641,13 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, addr, esr, regs); } + if (is_pkvm_stage2_abort(esr)) { + if (!user_mode(regs)) + goto no_context; + arm64_force_sig_fault(SIGSEGV, SEGV_ACCERR, far, "stage-2 fault"); + return 0; + } + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); if (!(mm_flags & FAULT_FLAG_USER)) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 9d1c21108057..3b57cb692c5b 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -3243,6 +3243,14 @@ UnsignedEnum 3:0 ID_BITS EndEnum EndSysreg +Sysreg ICC_HPPIR_EL1 
3 0 12 10 3 +Res0 63:33 +Field 32 HPPIV +Field 31:29 TYPE +Res0 28:24 +Field 23:0 ID +EndSysreg + Sysreg ICC_ICSR_EL1 3 0 12 10 4 Res0 63:48 Field 47:32 IAFFID @@ -3257,6 +3265,11 @@ Field 1 Enabled Field 0 F EndSysreg +Sysreg ICC_IAFFIDR_EL1 3 0 12 10 5 +Res0 63:16 +Field 15:0 IAFFID +EndSysreg + SysregFields ICC_PPI_ENABLERx_EL1 Field 63 EN63 Field 62 EN62 @@ -3663,6 +3676,42 @@ Res0 14:12 Field 11:0 AFFINITY EndSysreg +Sysreg ICC_APR_EL1 3 1 12 0 0 +Res0 63:32 +Field 31 P31 +Field 30 P30 +Field 29 P29 +Field 28 P28 +Field 27 P27 +Field 26 P26 +Field 25 P25 +Field 24 P24 +Field 23 P23 +Field 22 P22 +Field 21 P21 +Field 20 P20 +Field 19 P19 +Field 18 P18 +Field 17 P17 +Field 16 P16 +Field 15 P15 +Field 14 P14 +Field 13 P13 +Field 12 P12 +Field 11 P11 +Field 10 P10 +Field 9 P9 +Field 8 P8 +Field 7 P7 +Field 6 P6 +Field 5 P5 +Field 4 P4 +Field 3 P3 +Field 2 P2 +Field 1 P1 +Field 0 P0 +EndSysreg + Sysreg ICC_CR0_EL1 3 1 12 0 1 Res0 63:39 Field 38 PID @@ -4687,6 +4736,42 @@ Field 31:16 PhyPARTID29 Field 15:0 PhyPARTID28 EndSysreg +Sysreg ICH_APR_EL2 3 4 12 8 4 +Res0 63:32 +Field 31 P31 +Field 30 P30 +Field 29 P29 +Field 28 P28 +Field 27 P27 +Field 26 P26 +Field 25 P25 +Field 24 P24 +Field 23 P23 +Field 22 P22 +Field 21 P21 +Field 20 P20 +Field 19 P19 +Field 18 P18 +Field 17 P17 +Field 16 P16 +Field 15 P15 +Field 14 P14 +Field 13 P13 +Field 12 P12 +Field 11 P11 +Field 10 P10 +Field 9 P9 +Field 8 P8 +Field 7 P7 +Field 6 P6 +Field 5 P5 +Field 4 P4 +Field 3 P3 +Field 2 P2 +Field 1 P1 +Field 0 P0 +EndSysreg + Sysreg ICH_HFGRTR_EL2 3 4 12 9 4 Res0 63:21 Field 20 ICC_PPI_ACTIVERn_EL1 @@ -4735,6 +4820,306 @@ Field 1 GICCDDIS Field 0 GICCDEN EndSysreg +SysregFields ICH_PPI_DVIRx_EL2 +Field 63 DVI63 +Field 62 DVI62 +Field 61 DVI61 +Field 60 DVI60 +Field 59 DVI59 +Field 58 DVI58 +Field 57 DVI57 +Field 56 DVI56 +Field 55 DVI55 +Field 54 DVI54 +Field 53 DVI53 +Field 52 DVI52 +Field 51 DVI51 +Field 50 DVI50 +Field 49 DVI49 +Field 48 DVI48 +Field 47 DVI47 +Field 46 DVI46 +Field 45 DVI45 +Field 44 DVI44 +Field 43 DVI43 +Field 42 DVI42 +Field 41 DVI41 +Field 40 DVI40 +Field 39 DVI39 +Field 38 DVI38 +Field 37 DVI37 +Field 36 DVI36 +Field 35 DVI35 +Field 34 DVI34 +Field 33 DVI33 +Field 32 DVI32 +Field 31 DVI31 +Field 30 DVI30 +Field 29 DVI29 +Field 28 DVI28 +Field 27 DVI27 +Field 26 DVI26 +Field 25 DVI25 +Field 24 DVI24 +Field 23 DVI23 +Field 22 DVI22 +Field 21 DVI21 +Field 20 DVI20 +Field 19 DVI19 +Field 18 DVI18 +Field 17 DVI17 +Field 16 DVI16 +Field 15 DVI15 +Field 14 DVI14 +Field 13 DVI13 +Field 12 DVI12 +Field 11 DVI11 +Field 10 DVI10 +Field 9 DVI9 +Field 8 DVI8 +Field 7 DVI7 +Field 6 DVI6 +Field 5 DVI5 +Field 4 DVI4 +Field 3 DVI3 +Field 2 DVI2 +Field 1 DVI1 +Field 0 DVI0 +EndSysregFields + +Sysreg ICH_PPI_DVIR0_EL2 3 4 12 10 0 +Fields ICH_PPI_DVIRx_EL2 +EndSysreg + +Sysreg ICH_PPI_DVIR1_EL2 3 4 12 10 1 +Fields ICH_PPI_DVIRx_EL2 +EndSysreg + +SysregFields ICH_PPI_ENABLERx_EL2 +Field 63 EN63 +Field 62 EN62 +Field 61 EN61 +Field 60 EN60 +Field 59 EN59 +Field 58 EN58 +Field 57 EN57 +Field 56 EN56 +Field 55 EN55 +Field 54 EN54 +Field 53 EN53 +Field 52 EN52 +Field 51 EN51 +Field 50 EN50 +Field 49 EN49 +Field 48 EN48 +Field 47 EN47 +Field 46 EN46 +Field 45 EN45 +Field 44 EN44 +Field 43 EN43 +Field 42 EN42 +Field 41 EN41 +Field 40 EN40 +Field 39 EN39 +Field 38 EN38 +Field 37 EN37 +Field 36 EN36 +Field 35 EN35 +Field 34 EN34 +Field 33 EN33 +Field 32 EN32 +Field 31 EN31 +Field 30 EN30 +Field 29 EN29 +Field 28 EN28 +Field 27 EN27 +Field 26 EN26 +Field 25 EN25 +Field 24 EN24 +Field 23 EN23 +Field 22 EN22 +Field 21 
EN21 +Field 20 EN20 +Field 19 EN19 +Field 18 EN18 +Field 17 EN17 +Field 16 EN16 +Field 15 EN15 +Field 14 EN14 +Field 13 EN13 +Field 12 EN12 +Field 11 EN11 +Field 10 EN10 +Field 9 EN9 +Field 8 EN8 +Field 7 EN7 +Field 6 EN6 +Field 5 EN5 +Field 4 EN4 +Field 3 EN3 +Field 2 EN2 +Field 1 EN1 +Field 0 EN0 +EndSysregFields + +Sysreg ICH_PPI_ENABLER0_EL2 3 4 12 10 2 +Fields ICH_PPI_ENABLERx_EL2 +EndSysreg + +Sysreg ICH_PPI_ENABLER1_EL2 3 4 12 10 3 +Fields ICH_PPI_ENABLERx_EL2 +EndSysreg + +SysregFields ICH_PPI_PENDRx_EL2 +Field 63 PEND63 +Field 62 PEND62 +Field 61 PEND61 +Field 60 PEND60 +Field 59 PEND59 +Field 58 PEND58 +Field 57 PEND57 +Field 56 PEND56 +Field 55 PEND55 +Field 54 PEND54 +Field 53 PEND53 +Field 52 PEND52 +Field 51 PEND51 +Field 50 PEND50 +Field 49 PEND49 +Field 48 PEND48 +Field 47 PEND47 +Field 46 PEND46 +Field 45 PEND45 +Field 44 PEND44 +Field 43 PEND43 +Field 42 PEND42 +Field 41 PEND41 +Field 40 PEND40 +Field 39 PEND39 +Field 38 PEND38 +Field 37 PEND37 +Field 36 PEND36 +Field 35 PEND35 +Field 34 PEND34 +Field 33 PEND33 +Field 32 PEND32 +Field 31 PEND31 +Field 30 PEND30 +Field 29 PEND29 +Field 28 PEND28 +Field 27 PEND27 +Field 26 PEND26 +Field 25 PEND25 +Field 24 PEND24 +Field 23 PEND23 +Field 22 PEND22 +Field 21 PEND21 +Field 20 PEND20 +Field 19 PEND19 +Field 18 PEND18 +Field 17 PEND17 +Field 16 PEND16 +Field 15 PEND15 +Field 14 PEND14 +Field 13 PEND13 +Field 12 PEND12 +Field 11 PEND11 +Field 10 PEND10 +Field 9 PEND9 +Field 8 PEND8 +Field 7 PEND7 +Field 6 PEND6 +Field 5 PEND5 +Field 4 PEND4 +Field 3 PEND3 +Field 2 PEND2 +Field 1 PEND1 +Field 0 PEND0 +EndSysregFields + +Sysreg ICH_PPI_PENDR0_EL2 3 4 12 10 4 +Fields ICH_PPI_PENDRx_EL2 +EndSysreg + +Sysreg ICH_PPI_PENDR1_EL2 3 4 12 10 5 +Fields ICH_PPI_PENDRx_EL2 +EndSysreg + +SysregFields ICH_PPI_ACTIVERx_EL2 +Field 63 ACTIVE63 +Field 62 ACTIVE62 +Field 61 ACTIVE61 +Field 60 ACTIVE60 +Field 59 ACTIVE59 +Field 58 ACTIVE58 +Field 57 ACTIVE57 +Field 56 ACTIVE56 +Field 55 ACTIVE55 +Field 54 ACTIVE54 +Field 53 ACTIVE53 +Field 52 ACTIVE52 +Field 51 ACTIVE51 +Field 50 ACTIVE50 +Field 49 ACTIVE49 +Field 48 ACTIVE48 +Field 47 ACTIVE47 +Field 46 ACTIVE46 +Field 45 ACTIVE45 +Field 44 ACTIVE44 +Field 43 ACTIVE43 +Field 42 ACTIVE42 +Field 41 ACTIVE41 +Field 40 ACTIVE40 +Field 39 ACTIVE39 +Field 38 ACTIVE38 +Field 37 ACTIVE37 +Field 36 ACTIVE36 +Field 35 ACTIVE35 +Field 34 ACTIVE34 +Field 33 ACTIVE33 +Field 32 ACTIVE32 +Field 31 ACTIVE31 +Field 30 ACTIVE30 +Field 29 ACTIVE29 +Field 28 ACTIVE28 +Field 27 ACTIVE27 +Field 26 ACTIVE26 +Field 25 ACTIVE25 +Field 24 ACTIVE24 +Field 23 ACTIVE23 +Field 22 ACTIVE22 +Field 21 ACTIVE21 +Field 20 ACTIVE20 +Field 19 ACTIVE19 +Field 18 ACTIVE18 +Field 17 ACTIVE17 +Field 16 ACTIVE16 +Field 15 ACTIVE15 +Field 14 ACTIVE14 +Field 13 ACTIVE13 +Field 12 ACTIVE12 +Field 11 ACTIVE11 +Field 10 ACTIVE10 +Field 9 ACTIVE9 +Field 8 ACTIVE8 +Field 7 ACTIVE7 +Field 6 ACTIVE6 +Field 5 ACTIVE5 +Field 4 ACTIVE4 +Field 3 ACTIVE3 +Field 2 ACTIVE2 +Field 1 ACTIVE1 +Field 0 ACTIVE0 +EndSysregFields + +Sysreg ICH_PPI_ACTIVER0_EL2 3 4 12 10 6 +Fields ICH_PPI_ACTIVERx_EL2 +EndSysreg + +Sysreg ICH_PPI_ACTIVER1_EL2 3 4 12 10 7 +Fields ICH_PPI_ACTIVERx_EL2 +EndSysreg + Sysreg ICH_HCR_EL2 3 4 12 11 0 Res0 63:32 Field 31:27 EOIcount @@ -4789,6 +5174,18 @@ Field 1 V3 Field 0 En EndSysreg +Sysreg ICH_CONTEXTR_EL2 3 4 12 11 6 +Field 63 V +Field 62 F +Field 61 IRICHPPIDIS +Field 60 DB +Field 59:55 DBPM +Res0 54:48 +Field 47:32 VPE +Res0 31:16 +Field 15:0 VM +EndSysreg + Sysreg ICH_VMCR_EL2 3 4 12 11 7 Prefix FEAT_GCIE Res0 63:32 @@ -4810,6 
+5207,89 @@ Field 1 VENG1 Field 0 VENG0 EndSysreg +SysregFields ICH_PPI_PRIORITYRx_EL2 +Res0 63:61 +Field 60:56 Priority7 +Res0 55:53 +Field 52:48 Priority6 +Res0 47:45 +Field 44:40 Priority5 +Res0 39:37 +Field 36:32 Priority4 +Res0 31:29 +Field 28:24 Priority3 +Res0 23:21 +Field 20:16 Priority2 +Res0 15:13 +Field 12:8 Priority1 +Res0 7:5 +Field 4:0 Priority0 +EndSysregFields + +Sysreg ICH_PPI_PRIORITYR0_EL2 3 4 12 14 0 +Fields ICH_PPI_PRIORITYRx_EL2 +EndSysreg + +Sysreg ICH_PPI_PRIORITYR1_EL2 3 4 12 14 1 +Fields ICH_PPI_PRIORITYRx_EL2 +EndSysreg + +Sysreg ICH_PPI_PRIORITYR2_EL2 3 4 12 14 2 +Fields ICH_PPI_PRIORITYRx_EL2 +EndSysreg + +Sysreg ICH_PPI_PRIORITYR3_EL2 3 4 12 14 3 +Fields ICH_PPI_PRIORITYRx_EL2 +EndSysreg + +Sysreg ICH_PPI_PRIORITYR4_EL2 3 4 12 14 4 +Fields ICH_PPI_PRIORITYRx_EL2 +EndSysreg + +Sysreg ICH_PPI_PRIORITYR5_EL2 3 4 12 14 5 +Fields ICH_PPI_PRIORITYRx_EL2 +EndSysreg + +Sysreg ICH_PPI_PRIORITYR6_EL2 3 4 12 14 6 +Fields ICH_PPI_PRIORITYRx_EL2 +EndSysreg + +Sysreg ICH_PPI_PRIORITYR7_EL2 3 4 12 14 7 +Fields ICH_PPI_PRIORITYRx_EL2 +EndSysreg + +Sysreg ICH_PPI_PRIORITYR8_EL2 3 4 12 15 0 +Fields ICH_PPI_PRIORITYRx_EL2 +EndSysreg + +Sysreg ICH_PPI_PRIORITYR9_EL2 3 4 12 15 1 +Fields ICH_PPI_PRIORITYRx_EL2 +EndSysreg + +Sysreg ICH_PPI_PRIORITYR10_EL2 3 4 12 15 2 +Fields ICH_PPI_PRIORITYRx_EL2 +EndSysreg + +Sysreg ICH_PPI_PRIORITYR11_EL2 3 4 12 15 3 +Fields ICH_PPI_PRIORITYRx_EL2 +EndSysreg + +Sysreg ICH_PPI_PRIORITYR12_EL2 3 4 12 15 4 +Fields ICH_PPI_PRIORITYRx_EL2 +EndSysreg + +Sysreg ICH_PPI_PRIORITYR13_EL2 3 4 12 15 5 +Fields ICH_PPI_PRIORITYRx_EL2 +EndSysreg + +Sysreg ICH_PPI_PRIORITYR14_EL2 3 4 12 15 6 +Fields ICH_PPI_PRIORITYRx_EL2 +EndSysreg + +Sysreg ICH_PPI_PRIORITYR15_EL2 3 4 12 15 7 +Fields ICH_PPI_PRIORITYRx_EL2 +EndSysreg + Sysreg CONTEXTIDR_EL2 3 4 13 0 1 Fields CONTEXTIDR_ELx EndSysreg diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c index 405a5eee847b..6b0903be8ebf 100644 --- a/drivers/irqchip/irq-gic-v5.c +++ b/drivers/irqchip/irq-gic-v5.c @@ -511,6 +511,23 @@ static bool gicv5_ppi_irq_is_level(irq_hw_number_t hwirq) return !!(read_ppi_sysreg_s(hwirq, PPI_HM) & bit); } +static int gicv5_ppi_irq_set_type(struct irq_data *d, unsigned int type) +{ + /* + * GICv5's PPIs do not have a configurable trigger or handling + * mode. Check that the attempt to set a type matches what the + * hardware reports in the HMR, and error on a mismatch. 
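+	 *
+	 * e.g. a request_irq(irq, handler, IRQF_TRIGGER_RISING, ...) on a
+	 * PPI whose HM bit reports level-sensitive handling fails here with
+	 * -EINVAL instead of being silently misconfigured.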
+ */ + + if (type & IRQ_TYPE_EDGE_BOTH && gicv5_ppi_irq_is_level(d->hwirq)) + return -EINVAL; + + if (type & IRQ_TYPE_LEVEL_MASK && !gicv5_ppi_irq_is_level(d->hwirq)) + return -EINVAL; + + return 0; +} + static int gicv5_ppi_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu) { if (vcpu) @@ -526,6 +543,7 @@ static const struct irq_chip gicv5_ppi_irq_chip = { .irq_mask = gicv5_ppi_irq_mask, .irq_unmask = gicv5_ppi_irq_unmask, .irq_eoi = gicv5_ppi_irq_eoi, + .irq_set_type = gicv5_ppi_irq_set_type, .irq_get_irqchip_state = gicv5_ppi_irq_get_irqchip_state, .irq_set_irqchip_state = gicv5_ppi_irq_set_irqchip_state, .irq_set_vcpu_affinity = gicv5_ppi_irq_set_vcpu_affinity, diff --git a/drivers/virt/coco/pkvm-guest/Kconfig b/drivers/virt/coco/pkvm-guest/Kconfig index d2f344f1f98f..928b8e1668cc 100644 --- a/drivers/virt/coco/pkvm-guest/Kconfig +++ b/drivers/virt/coco/pkvm-guest/Kconfig @@ -1,6 +1,6 @@ config ARM_PKVM_GUEST bool "Arm pKVM protected guest driver" - depends on ARM64 + depends on ARM64 && DMA_RESTRICTED_POOL help Protected guests running under the pKVM hypervisor on arm64 are isolated from the host and must issue hypercalls to enable diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 51c00c8fa175..edfdf139cb02 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -664,6 +664,7 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, fsnotify_create(d_inode(dentry->d_parent), dentry); return tracefs_end_creating(dentry); } +EXPORT_SYMBOL_GPL(tracefs_create_file); static struct dentry *__create_dir(const char *name, struct dentry *parent, const struct inode_operations *ops) diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 7310841f4512..bf8cc9589bd0 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -10,6 +10,8 @@ #include #include +#include + enum kvm_arch_timers { TIMER_PTIMER, TIMER_VTIMER, @@ -47,7 +49,7 @@ struct arch_timer_vm_data { u64 poffset; /* The PPI for each timer, global to the VM */ - u8 ppi[NR_KVM_TIMERS]; + u32 ppi[NR_KVM_TIMERS]; }; struct arch_timer_context { @@ -130,6 +132,10 @@ void kvm_timer_init_vhe(void); #define timer_vm_data(ctx) (&(timer_context_to_vcpu(ctx)->kvm->arch.timer_data)) #define timer_irq(ctx) (timer_vm_data(ctx)->ppi[arch_timer_ctx_index(ctx)]) +#define get_vgic_ppi(k, i) (((k)->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V5) ? 
\ + (i) : (FIELD_PREP(GICV5_HWIRQ_ID, i) | \ + FIELD_PREP(GICV5_HWIRQ_TYPE, GICV5_HWIRQ_TYPE_PPI))) + u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, enum kvm_arch_timers tmr, enum kvm_arch_timer_regs treg); diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 96754b51b411..0a36a3d5c894 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -12,6 +12,9 @@ #define KVM_ARMV8_PMU_MAX_COUNTERS 32 +/* PPI #23 - architecturally specified for GICv5 */ +#define KVM_ARMV8_PMU_GICV5_IRQ 0x20000017 + #if IS_ENABLED(CONFIG_HW_PERF_EVENTS) && IS_ENABLED(CONFIG_KVM) struct kvm_pmc { u8 idx; /* index into the pmu->pmc array */ @@ -38,7 +41,7 @@ struct arm_pmu_entry { }; bool kvm_supports_guest_pmuv3(void); -#define kvm_arm_pmu_irq_initialized(v) ((v)->arch.pmu.irq_num >= VGIC_NR_SGIS) +#define kvm_arm_pmu_irq_initialized(v) ((v)->arch.pmu.irq_num != 0) u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx); void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index f2eafc65bbf4..1388dc6028a9 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -19,7 +19,9 @@ #include #include +#include +#define VGIC_V5_MAX_CPUS 512 #define VGIC_V3_MAX_CPUS 512 #define VGIC_V2_MAX_CPUS 8 #define VGIC_NR_IRQS_LEGACY 256 @@ -31,9 +33,96 @@ #define VGIC_MIN_LPI 8192 #define KVM_IRQCHIP_NUM_PINS (1020 - 32) -#define irq_is_ppi(irq) ((irq) >= VGIC_NR_SGIS && (irq) < VGIC_NR_PRIVATE_IRQS) -#define irq_is_spi(irq) ((irq) >= VGIC_NR_PRIVATE_IRQS && \ - (irq) <= VGIC_MAX_SPI) +/* + * GICv5 supports 128 PPIs, but only the first 64 are architected. We only + * support the timers and PMU in KVM, both of which are architected. Rather than + * handling twice the state, we instead opt to only support the architected set + * in KVM for now. At a future stage, this can be bumped up to 128, if required. 
+ */
+#define VGIC_V5_NR_PRIVATE_IRQS	64
+
+#define is_v5_type(t, i)	(FIELD_GET(GICV5_HWIRQ_TYPE, (i)) == (t))
+
+#define __irq_is_sgi(t, i)						\
+	({								\
+		bool __ret;						\
+									\
+		switch (t) {						\
+		case KVM_DEV_TYPE_ARM_VGIC_V5:				\
+			__ret = false;					\
+			break;						\
+		default:						\
+			__ret = (i) < VGIC_NR_SGIS;			\
+		}							\
+									\
+		__ret;							\
+	})
+
+#define __irq_is_ppi(t, i)						\
+	({								\
+		bool __ret;						\
+									\
+		switch (t) {						\
+		case KVM_DEV_TYPE_ARM_VGIC_V5:				\
+			__ret = is_v5_type(GICV5_HWIRQ_TYPE_PPI, (i));	\
+			break;						\
+		default:						\
+			__ret = (i) >= VGIC_NR_SGIS;			\
+			__ret &= (i) < VGIC_NR_PRIVATE_IRQS;		\
+		}							\
+									\
+		__ret;							\
+	})
+
+#define __irq_is_spi(t, i)						\
+	({								\
+		bool __ret;						\
+									\
+		switch (t) {						\
+		case KVM_DEV_TYPE_ARM_VGIC_V5:				\
+			__ret = is_v5_type(GICV5_HWIRQ_TYPE_SPI, (i));	\
+			break;						\
+		default:						\
+			__ret = (i) <= VGIC_MAX_SPI;			\
+			__ret &= (i) >= VGIC_NR_PRIVATE_IRQS;		\
+		}							\
+									\
+		__ret;							\
+	})
+
+#define __irq_is_lpi(t, i)						\
+	({								\
+		bool __ret;						\
+									\
+		switch (t) {						\
+		case KVM_DEV_TYPE_ARM_VGIC_V5:				\
+			__ret = is_v5_type(GICV5_HWIRQ_TYPE_LPI, (i));	\
+			break;						\
+		default:						\
+			__ret = (i) >= 8192;				\
+		}							\
+									\
+		__ret;							\
+	})
+
+#define irq_is_sgi(k, i)	__irq_is_sgi((k)->arch.vgic.vgic_model, i)
+#define irq_is_ppi(k, i)	__irq_is_ppi((k)->arch.vgic.vgic_model, i)
+#define irq_is_spi(k, i)	__irq_is_spi((k)->arch.vgic.vgic_model, i)
+#define irq_is_lpi(k, i)	__irq_is_lpi((k)->arch.vgic.vgic_model, i)
+
+#define irq_is_private(k, i)	(irq_is_ppi(k, i) || irq_is_sgi(k, i))
+
+#define vgic_v5_get_hwirq_id(x)	FIELD_GET(GICV5_HWIRQ_ID, (x))
+#define vgic_v5_set_hwirq_id(x)	FIELD_PREP(GICV5_HWIRQ_ID, (x))
+
+#define __vgic_v5_set_type(t)	(FIELD_PREP(GICV5_HWIRQ_TYPE, GICV5_HWIRQ_TYPE_##t))
+#define vgic_v5_make_ppi(x)	(__vgic_v5_set_type(PPI) | vgic_v5_set_hwirq_id(x))
+#define vgic_v5_make_spi(x)	(__vgic_v5_set_type(SPI) | vgic_v5_set_hwirq_id(x))
+#define vgic_v5_make_lpi(x)	(__vgic_v5_set_type(LPI) | vgic_v5_set_hwirq_id(x))
+
+#define __vgic_is_v(k, v)	((k)->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V##v)
+#define vgic_is_v3(k)		(__vgic_is_v(k, 3))
+#define vgic_is_v5(k)		(__vgic_is_v(k, 5))
 
 enum vgic_type {
 	VGIC_V2,		/* Good ol' GICv2 */
@@ -101,6 +190,8 @@ enum vgic_irq_config {
 	VGIC_CONFIG_LEVEL
 };
 
+struct vgic_irq;
+
 /*
  * Per-irq ops overriding some common behavious.
  *
@@ -119,6 +210,19 @@ struct irq_ops {
 	 * peaking into the physical GIC.
 	 */
 	bool (*get_input_level)(int vintid);
+
+	/*
+	 * Function pointer to override the queuing of an IRQ.
+	 */
+	bool (*queue_irq_unlock)(struct kvm *kvm, struct vgic_irq *irq,
+				 unsigned long flags) __releases(&irq->irq_lock);
+
+	/*
+	 * Callback function pointer to either enable or disable direct
+	 * injection for a mapped interrupt.
+	 */
+	void (*set_direct_injection)(struct kvm_vcpu *vcpu,
+				     struct vgic_irq *irq, bool direct);
 };
 
 struct vgic_irq {
@@ -238,6 +342,26 @@ struct vgic_redist_region {
 	struct list_head list;
 };
 
+struct vgic_v5_vm {
+	/*
+	 * We only expose a subset of PPIs to the guest. This subset is a
+	 * combination of the PPIs that are actually implemented and what we
+	 * actually choose to expose.
+	 */
+	DECLARE_BITMAP(vgic_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS);
+
+	/* A mask of the PPIs that are exposed for userspace to drive. */
+	DECLARE_BITMAP(userspace_ppis, VGIC_V5_NR_PRIVATE_IRQS);
+
+	/*
+	 * The HMR itself is handled by the hardware, but we still need to have
+	 * a mask that we can use when merging in pending state (only the state
+	 * of Edge PPIs is merged back in from the guest and the HMR provides a
+	 * convenient way to do that).
+	 */
+	DECLARE_BITMAP(vgic_ppi_hmr, VGIC_V5_NR_PRIVATE_IRQS);
+};
+
 struct vgic_dist {
 	bool			in_kernel;
 	bool			ready;
@@ -310,6 +434,11 @@ struct vgic_dist {
 	 * else.
 	 */
 	struct its_vm		its_vm;
+
+	/*
+	 * GICv5 per-VM data.
+	 */
+	struct vgic_v5_vm	gicv5_vm;
 };
 
 struct vgic_v2_cpu_if {
@@ -340,11 +469,40 @@ struct vgic_v3_cpu_if {
 	unsigned int used_lrs;
 };
 
+struct vgic_v5_cpu_if {
+	u64	vgic_apr;
+	u64	vgic_vmcr;
+
+	/* PPI register state */
+	DECLARE_BITMAP(vgic_ppi_dvir, VGIC_V5_NR_PRIVATE_IRQS);
+	DECLARE_BITMAP(vgic_ppi_activer, VGIC_V5_NR_PRIVATE_IRQS);
+	DECLARE_BITMAP(vgic_ppi_enabler, VGIC_V5_NR_PRIVATE_IRQS);
+	/* We have one byte (of which 5 bits are used) per PPI for priority */
+	u64	vgic_ppi_priorityr[VGIC_V5_NR_PRIVATE_IRQS / 8];
+
+	/*
+	 * The ICSR is re-used across host and guest, and hence it needs to be
+	 * saved/restored. Only one copy is required as the host should block
+	 * preemption between executing GIC CDRCFG and accessing the
+	 * ICC_ICSR_EL1. A guest, of course, can never guarantee this, and hence
+	 * it is the hyp's responsibility to keep the state consistent.
+	 */
+	u64	vgic_icsr;
+
+	struct gicv5_vpe	gicv5_vpe;
+};
+
+/* What PPI capabilities does a GICv5 host have */
+struct vgic_v5_ppi_caps {
+	DECLARE_BITMAP(impl_ppi_mask, VGIC_V5_NR_PRIVATE_IRQS);
+};
+
 struct vgic_cpu {
 	/* CPU vif control registers for world switch */
 	union {
 		struct vgic_v2_cpu_if	vgic_v2;
 		struct vgic_v3_cpu_if	vgic_v3;
+		struct vgic_v5_cpu_if	vgic_v5;
 	};
 
 	struct vgic_irq *private_irqs;
@@ -392,13 +550,17 @@ int kvm_vgic_create(struct kvm *kvm, u32 type);
 void kvm_vgic_destroy(struct kvm *kvm);
 void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
 int kvm_vgic_map_resources(struct kvm *kvm);
+void kvm_vgic_finalize_idregs(struct kvm *kvm);
 int kvm_vgic_hyp_init(void);
 void kvm_vgic_init_cpu_hardware(void);
 
 int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
 			unsigned int intid, bool level, void *owner);
+void kvm_vgic_set_irq_ops(struct kvm_vcpu *vcpu, u32 vintid,
+			  struct irq_ops *ops);
+void kvm_vgic_clear_irq_ops(struct kvm_vcpu *vcpu, u32 vintid);
 int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
-			  u32 vintid, struct irq_ops *ops);
+			  u32 vintid);
 int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid);
 int kvm_vgic_get_map(struct kvm_vcpu *vcpu, unsigned int vintid);
 bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid);
@@ -414,8 +576,20 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu);
 #define irqchip_in_kernel(k)	(!!((k)->arch.vgic.in_kernel))
 #define vgic_initialized(k)	((k)->arch.vgic.initialized)
 
-#define vgic_valid_spi(k, i)	(((i) >= VGIC_NR_PRIVATE_IRQS) && \
-				 ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS))
+#define vgic_valid_spi(k, i)						\
+	({								\
+		bool __ret = irq_is_spi(k, i);				\
+									\
+		switch ((k)->arch.vgic.vgic_model) {			\
+		case KVM_DEV_TYPE_ARM_VGIC_V5:				\
+			__ret &= FIELD_GET(GICV5_HWIRQ_ID, i) < (k)->arch.vgic.nr_spis; \
+			break;						\
+		default:						\
+			__ret &= (i) < ((k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS); \
+		}							\
+									\
+		__ret;							\
+	})
 
 bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu);
 void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
@@ -455,6 +629,11 @@ int vgic_v4_load(struct kvm_vcpu *vcpu);
 void vgic_v4_commit(struct kvm_vcpu *vcpu);
 int vgic_v4_put(struct kvm_vcpu *vcpu);
 
+int vgic_v5_finalize_ppi_state(struct kvm *kvm);
+bool vgic_v5_ppi_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
+				  unsigned long flags);
+void vgic_v5_set_ppi_dvi(struct kvm_vcpu *vcpu, struct vgic_irq *irq, bool dvi);
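+
+/*
+ * Illustrative sketch only (not a fixed API contract): a GICv5 PPI that
+ * supports direct injection can have its default queueing overridden with
+ * kvm_vgic_set_irq_ops(), e.g.:
+ *
+ *	static struct irq_ops vgic_v5_ppi_ops = {
+ *		.queue_irq_unlock	= vgic_v5_ppi_queue_irq_unlock,
+ *		.set_direct_injection	= vgic_v5_set_ppi_dvi,
+ *	};
+ *
+ *	kvm_vgic_set_irq_ops(vcpu, vintid, &vgic_v5_ppi_ops);
+ *	...
+ *	kvm_vgic_clear_irq_ops(vcpu, vintid);
+ */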
+ bool vgic_state_is_nested(struct kvm_vcpu *vcpu); /* CPU HP callbacks */ diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h index b78488df6c98..40d2fce68294 100644 --- a/include/linux/irqchip/arm-gic-v5.h +++ b/include/linux/irqchip/arm-gic-v5.h @@ -24,6 +24,28 @@ #define GICV5_HWIRQ_TYPE_LPI UL(0x2) #define GICV5_HWIRQ_TYPE_SPI UL(0x3) +/* + * Architected PPIs + */ +#define GICV5_ARCH_PPI_S_DB_PPI 0x0 +#define GICV5_ARCH_PPI_RL_DB_PPI 0x1 +#define GICV5_ARCH_PPI_NS_DB_PPI 0x2 +#define GICV5_ARCH_PPI_SW_PPI 0x3 +#define GICV5_ARCH_PPI_HACDBSIRQ 0xf +#define GICV5_ARCH_PPI_CNTHVS 0x13 +#define GICV5_ARCH_PPI_CNTHPS 0x14 +#define GICV5_ARCH_PPI_PMBIRQ 0x15 +#define GICV5_ARCH_PPI_COMMIRQ 0x16 +#define GICV5_ARCH_PPI_PMUIRQ 0x17 +#define GICV5_ARCH_PPI_CTIIRQ 0x18 +#define GICV5_ARCH_PPI_GICMNT 0x19 +#define GICV5_ARCH_PPI_CNTHP 0x1a +#define GICV5_ARCH_PPI_CNTV 0x1b +#define GICV5_ARCH_PPI_CNTHV 0x1c +#define GICV5_ARCH_PPI_CNTPS 0x1d +#define GICV5_ARCH_PPI_CNTP 0x1e +#define GICV5_ARCH_PPI_TRBIRQ 0x1f + /* * Tables attributes */ @@ -365,6 +387,11 @@ int gicv5_spi_irq_set_type(struct irq_data *d, unsigned int type); int gicv5_irs_iste_alloc(u32 lpi); void gicv5_irs_syncr(void); +/* Embedded in kvm.arch */ +struct gicv5_vpe { + bool resident; +}; + struct gicv5_its_devtab_cfg { union { struct { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6b76e7a6f4c2..779d9ed85cbf 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2366,6 +2366,7 @@ void kvm_unregister_device_ops(u32 type); extern struct kvm_device_ops kvm_mpic_ops; extern struct kvm_device_ops kvm_arm_vgic_v2_ops; extern struct kvm_device_ops kvm_arm_vgic_v3_ops; +extern struct kvm_device_ops kvm_arm_vgic_v5_ops; #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index d862fa610270..994f52b34344 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -251,4 +251,62 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu); int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); + +struct ring_buffer_desc { + int cpu; + unsigned int nr_page_va; /* excludes the meta page */ + unsigned long meta_va; + unsigned long page_va[] __counted_by(nr_page_va); +}; + +struct trace_buffer_desc { + int nr_cpus; + size_t struct_len; + char __data[]; /* list of ring_buffer_desc */ +}; + +static inline struct ring_buffer_desc *__next_ring_buffer_desc(struct ring_buffer_desc *desc) +{ + size_t len = struct_size(desc, page_va, desc->nr_page_va); + + return (struct ring_buffer_desc *)((void *)desc + len); +} + +static inline struct ring_buffer_desc *__first_ring_buffer_desc(struct trace_buffer_desc *desc) +{ + return (struct ring_buffer_desc *)(&desc->__data[0]); +} + +static inline size_t trace_buffer_desc_size(size_t buffer_size, unsigned int nr_cpus) +{ + unsigned int nr_pages = max(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 2UL) + 1; + struct ring_buffer_desc *rbdesc; + + return size_add(offsetof(struct trace_buffer_desc, __data), + size_mul(nr_cpus, struct_size(rbdesc, page_va, nr_pages))); +} + +#define for_each_ring_buffer_desc(__pdesc, __cpu, __trace_pdesc) \ + for (__pdesc = __first_ring_buffer_desc(__trace_pdesc), __cpu = 0; \ + (__cpu) < (__trace_pdesc)->nr_cpus; \ + (__cpu)++, __pdesc = __next_ring_buffer_desc(__pdesc)) + +struct ring_buffer_remote { 
+	struct trace_buffer_desc	*desc;
+	int (*swap_reader_page)(unsigned int cpu, void *priv);
+	int (*reset)(unsigned int cpu, void *priv);
+	void *priv;
+};
+
+int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu);
+
+struct trace_buffer *
+__ring_buffer_alloc_remote(struct ring_buffer_remote *remote,
+			   struct lock_class_key *key);
+
+#define ring_buffer_alloc_remote(remote)			\
+({								\
+	static struct lock_class_key __key;			\
+	__ring_buffer_alloc_remote(remote, &__key);		\
+})
 #endif /* _LINUX_RING_BUFFER_H */
diff --git a/include/linux/ring_buffer_types.h b/include/linux/ring_buffer_types.h
new file mode 100644
index 000000000000..54577021a49d
--- /dev/null
+++ b/include/linux/ring_buffer_types.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RING_BUFFER_TYPES_H
+#define _LINUX_RING_BUFFER_TYPES_H
+
+#include
+
+#define TS_SHIFT	27
+#define TS_MASK		((1ULL << TS_SHIFT) - 1)
+#define TS_DELTA_TEST	(~TS_MASK)
+
+/*
+ * We need to fit the time_stamp delta into 27 bits.
+ */
+static inline bool test_time_stamp(u64 delta)
+{
+	return !!(delta & TS_DELTA_TEST);
+}
+
+#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
+
+#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
+#define RB_ALIGNMENT		4U
+#define RB_MAX_SMALL_DATA	(RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
+#define RB_EVNT_MIN_SIZE	8U	/* two 32bit words */
+
+#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
+# define RB_FORCE_8BYTE_ALIGNMENT	0
+# define RB_ARCH_ALIGNMENT		RB_ALIGNMENT
+#else
+# define RB_FORCE_8BYTE_ALIGNMENT	1
+# define RB_ARCH_ALIGNMENT		8U
+#endif
+
+#define RB_ALIGN_DATA	__aligned(RB_ARCH_ALIGNMENT)
+
+struct buffer_data_page {
+	u64		time_stamp;	/* page time stamp */
+	local_t		commit;		/* write committed index */
+	unsigned char	data[] RB_ALIGN_DATA;	/* data of buffer page */
+};
+#endif
diff --git a/include/linux/simple_ring_buffer.h b/include/linux/simple_ring_buffer.h
new file mode 100644
index 000000000000..21aec556293e
--- /dev/null
+++ b/include/linux/simple_ring_buffer.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SIMPLE_RING_BUFFER_H
+#define _LINUX_SIMPLE_RING_BUFFER_H
+
+#include
+#include
+#include
+#include
+
+/*
+ * Ideally these structs would stay private but the caller needs to know
+ * the allocation size for simple_ring_buffer_init().
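+ *
+ * A caller is hence expected to do something along these lines (a sketch,
+ * error handling omitted; see kernel/trace/remote_test.c for a complete
+ * example):
+ *
+ *	cpu_buffer = kmalloc(sizeof(*cpu_buffer), GFP_KERNEL);
+ *	bpages = kmalloc_array(desc->nr_page_va, sizeof(*bpages), GFP_KERNEL);
+ *	err = simple_ring_buffer_init(cpu_buffer, bpages, desc);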
+ */
+struct simple_buffer_page {
+	struct list_head	link;
+	struct buffer_data_page	*page;
+	u64			entries;
+	u32			write;
+	u32			id;
+};
+
+struct simple_rb_per_cpu {
+	struct simple_buffer_page	*tail_page;
+	struct simple_buffer_page	*reader_page;
+	struct simple_buffer_page	*head_page;
+	struct simple_buffer_page	*bpages;
+	struct trace_buffer_meta	*meta;
+	u32				nr_pages;
+
+#define SIMPLE_RB_UNAVAILABLE	0
+#define SIMPLE_RB_READY		1
+#define SIMPLE_RB_WRITING	2
+	u32				status;
+
+	u64				last_overrun;
+	u64				write_stamp;
+
+	struct simple_rb_cbs		*cbs;
+};
+
+int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages,
+			    const struct ring_buffer_desc *desc);
+
+void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer);
+
+void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length,
+				 u64 timestamp);
+
+void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer);
+
+int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable);
+
+int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer);
+
+int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer);
+
+int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer,
+			       struct simple_buffer_page *bpages,
+			       const struct ring_buffer_desc *desc,
+			       void *(*load_page)(unsigned long va),
+			       void (*unload_page)(void *va));
+
+void simple_ring_buffer_unload_mm(struct simple_rb_per_cpu *cpu_buffer,
+				  void (*unload_page)(void *));
+#endif
diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h
new file mode 100644
index 000000000000..fcd1d46ea466
--- /dev/null
+++ b/include/linux/trace_remote.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_TRACE_REMOTE_H
+#define _LINUX_TRACE_REMOTE_H
+
+#include
+#include
+#include
+
+/**
+ * struct trace_remote_callbacks - Callbacks used by Tracefs to control the remote
+ * @init:		Called once the remote has been registered. Allows the
+ *			caller to extend the Tracefs remote directory
+ * @load_trace_buffer:	Called before Tracefs accesses the trace buffer for the first
+ *			time. Must return a &trace_buffer_desc
+ *			(most likely filled with trace_remote_alloc_buffer())
+ * @unload_trace_buffer:
+ *			Called once Tracefs has no use for the trace buffer
+ *			(most likely call trace_remote_free_buffer())
+ * @enable_tracing:	Called on Tracefs tracing_on. It is expected from the
+ *			remote to allow writing.
+ * @swap_reader_page:	Called when Tracefs consumes a new page from a
+ *			ring-buffer. It is expected from the remote to isolate a
+ *			new reader-page from the @cpu ring-buffer.
+ * @reset:		Called on `echo 0 > trace`. It is expected from the
+ *			remote to reset all ring-buffer pages.
+ * @enable_event:	Called on events/event_name/enable. It is expected from
+ *			the remote to allow writing the event @id.
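+ *
+ * A minimal remote fills this structure once and hands it to
+ * trace_remote_register(). Sketch mirroring kernel/trace/remote_test.c, the
+ * my_* callbacks and event array being hypothetical:
+ *
+ *	static struct trace_remote_callbacks my_cbs = {
+ *		.load_trace_buffer	= my_load,
+ *		.unload_trace_buffer	= my_unload,
+ *		.enable_tracing		= my_enable_tracing,
+ *		.swap_reader_page	= my_swap_reader_page,
+ *		.reset			= my_reset,
+ *		.enable_event		= my_enable_event,
+ *	};
+ *
+ *	err = trace_remote_register("my_remote", &my_cbs, NULL, my_events,
+ *				    nr_my_events);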
+ */ +struct trace_remote_callbacks { + int (*init)(struct dentry *d, void *priv); + struct trace_buffer_desc *(*load_trace_buffer)(unsigned long size, void *priv); + void (*unload_trace_buffer)(struct trace_buffer_desc *desc, void *priv); + int (*enable_tracing)(bool enable, void *priv); + int (*swap_reader_page)(unsigned int cpu, void *priv); + int (*reset)(unsigned int cpu, void *priv); + int (*enable_event)(unsigned short id, bool enable, void *priv); +}; + +int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv, + struct remote_event *events, size_t nr_events); + +int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size, + const struct cpumask *cpumask); + +void trace_remote_free_buffer(struct trace_buffer_desc *desc); + +#endif diff --git a/include/linux/trace_remote_event.h b/include/linux/trace_remote_event.h new file mode 100644 index 000000000000..c8ae1e1f5e72 --- /dev/null +++ b/include/linux/trace_remote_event.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_TRACE_REMOTE_EVENTS_H +#define _LINUX_TRACE_REMOTE_EVENTS_H + +struct trace_remote; +struct trace_event_fields; +struct trace_seq; + +struct remote_event_hdr { + unsigned short id; +}; + +#define REMOTE_EVENT_NAME_MAX 30 +struct remote_event { + char name[REMOTE_EVENT_NAME_MAX]; + unsigned short id; + bool enabled; + struct trace_remote *remote; + struct trace_event_fields *fields; + char *print_fmt; + void (*print)(void *evt, struct trace_seq *seq); +}; + +#define RE_STRUCT(__args...) __args +#define re_field(__type, __field) __type __field; + +#define REMOTE_EVENT_FORMAT(__name, __struct) \ + struct remote_event_format_##__name { \ + struct remote_event_hdr hdr; \ + __struct \ + } +#endif diff --git a/include/trace/define_remote_events.h b/include/trace/define_remote_events.h new file mode 100644 index 000000000000..676e803dc144 --- /dev/null +++ b/include/trace/define_remote_events.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include +#include + +#define REMOTE_EVENT_INCLUDE(__file) __stringify(../../__file) + +#ifdef REMOTE_EVENT_SECTION +# define __REMOTE_EVENT_SECTION(__name) __used __section(REMOTE_EVENT_SECTION"."#__name) +#else +# define __REMOTE_EVENT_SECTION(__name) +#endif + +#define REMOTE_PRINTK_COUNT_ARGS(__args...) \ + __COUNT_ARGS(, ##__args, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0) + +#define __remote_printk0() \ + trace_seq_putc(seq, '\n') + +#define __remote_printk1(__fmt) \ + trace_seq_puts(seq, " " __fmt "\n") \ + +#define __remote_printk2(__fmt, __args...) \ +do { \ + trace_seq_putc(seq, ' '); \ + trace_seq_printf(seq, __fmt, __args); \ + trace_seq_putc(seq, '\n'); \ +} while (0) + +/* Apply the appropriate trace_seq sequence according to the number of arguments */ +#define remote_printk(__args...) \ + CONCATENATE(__remote_printk, REMOTE_PRINTK_COUNT_ARGS(__args))(__args) + +#define RE_PRINTK(__args...) 
__args
+
+#define REMOTE_EVENT(__name, __id, __struct, __printk)					\
+	REMOTE_EVENT_FORMAT(__name, __struct);						\
+	static void remote_event_print_##__name(void *evt, struct trace_seq *seq)	\
+	{										\
+		struct remote_event_format_##__name __maybe_unused *__entry = evt;	\
+		trace_seq_puts(seq, #__name);						\
+		remote_printk(__printk);						\
+	}
+#include REMOTE_EVENT_INCLUDE(REMOTE_EVENT_INCLUDE_FILE)
+
+#undef REMOTE_EVENT
+#undef RE_PRINTK
+#undef re_field
+#define re_field(__type, __field)					\
+	{								\
+		.type = #__type, .name = #__field,			\
+		.size = sizeof(__type), .align = __alignof__(__type),	\
+		.is_signed = is_signed_type(__type),			\
+	},
+#define __entry REC
+#define RE_PRINTK(__fmt, __args...) "\"" __fmt "\", " __stringify(__args)
+#define REMOTE_EVENT(__name, __id, __struct, __printk)				\
+	static struct trace_event_fields remote_event_fields_##__name[] = {	\
+		__struct							\
+		{}								\
+	};									\
+	static char remote_event_print_fmt_##__name[] = __printk;		\
+	static struct remote_event __REMOTE_EVENT_SECTION(__name)		\
+	remote_event_##__name = {						\
+		.name		= #__name,					\
+		.id		= __id,						\
+		.fields		= remote_event_fields_##__name,			\
+		.print_fmt	= remote_event_print_fmt_##__name,		\
+		.print		= remote_event_print_##__name,			\
+	}
+#include REMOTE_EVENT_INCLUDE(REMOTE_EVENT_INCLUDE_FILE)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 7fb4fea9d38e..1cc0c9463f71 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -704,6 +704,11 @@ struct kvm_enable_cap {
 #define KVM_VM_TYPE_ARM_IPA_SIZE_MASK	0xffULL
 #define KVM_VM_TYPE_ARM_IPA_SIZE(x)		\
 	((x) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
+
+#define KVM_VM_TYPE_ARM_PROTECTED	(1UL << 31)
+#define KVM_VM_TYPE_ARM_MASK		(KVM_VM_TYPE_ARM_IPA_SIZE_MASK | \
+					 KVM_VM_TYPE_ARM_PROTECTED)
+
 /*
  * ioctls for /dev/kvm fds:
  */
@@ -1227,6 +1232,8 @@ enum kvm_device_type {
 #define KVM_DEV_TYPE_LOONGARCH_PCHPIC	KVM_DEV_TYPE_LOONGARCH_PCHPIC
 	KVM_DEV_TYPE_LOONGARCH_DMSINTC,
 #define KVM_DEV_TYPE_LOONGARCH_DMSINTC	KVM_DEV_TYPE_LOONGARCH_DMSINTC
+	KVM_DEV_TYPE_ARM_VGIC_V5,
+#define KVM_DEV_TYPE_ARM_VGIC_V5	KVM_DEV_TYPE_ARM_VGIC_V5
 
 	KVM_DEV_TYPE_MAX,
 
diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
index c102ef35d11e..e8185889a1c8 100644
--- a/include/uapi/linux/trace_mmap.h
+++ b/include/uapi/linux/trace_mmap.h
@@ -17,8 +17,8 @@
 * @entries:		Number of entries in the ring-buffer.
 * @overrun:		Number of entries lost in the ring-buffer.
 * @read:		Number of entries that have been read.
- * @Reserved1:		Internal use only.
- * @Reserved2:		Internal use only.
+ * @pages_lost:		Number of pages overwritten by the writer.
+ * @pages_touched:	Number of pages written by the writer.
 */
 struct trace_buffer_meta {
 	__u32		meta_page_size;
@@ -39,8 +39,8 @@ struct trace_buffer_meta {
 	__u64	overrun;
 	__u64	read;
 
-	__u64	Reserved1;
-	__u64	Reserved2;
+	__u64	pages_lost;
+	__u64	pages_touched;
 };
 
 #define TRACE_MMAP_IOCTL_GET_READER		_IO('R', 0x20)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 49de13cae428..e130da35808f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1281,4 +1281,18 @@ config HIST_TRIGGERS_DEBUG
 
 source "kernel/trace/rv/Kconfig"
 
+config TRACE_REMOTE
+	bool
+
+config SIMPLE_RING_BUFFER
+	bool
+
+config TRACE_REMOTE_TEST
+	tristate "Test module for remote tracing"
+	select TRACE_REMOTE
+	select SIMPLE_RING_BUFFER
+	help
+	  This trace remote includes a ring-buffer writer implementation using
+	  "simple_ring_buffer". This is solely intended for testing.
+ endif # FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 04096c21d06b..d662c1a64cd5 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -128,4 +128,63 @@ obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o obj-$(CONFIG_RV) += rv/ +obj-$(CONFIG_TRACE_REMOTE) += trace_remote.o +obj-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o +obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o + +# +# simple_ring_buffer is used by the pKVM hypervisor which does not have access +# to all kernel symbols. Fail the build if forbidden symbols are found. +# +# undefsyms_base generates a set of compiler and tooling-generated symbols that can +# safely be ignored for simple_ring_buffer. +# +filechk_undefsyms_base = \ + echo '$(pound)include '; \ + echo '$(pound)include '; \ + echo '$(pound)include '; \ + echo 'static char page[PAGE_SIZE] __aligned(PAGE_SIZE);'; \ + echo 'void undefsyms_base(void *p, int n);'; \ + echo 'void undefsyms_base(void *p, int n) {'; \ + echo ' char buffer[256] = { 0 };'; \ + echo ' u32 u = 0;'; \ + echo ' memset((char * volatile)page, 8, PAGE_SIZE);'; \ + echo ' memset((char * volatile)buffer, 8, sizeof(buffer));'; \ + echo ' memcpy((void * volatile)p, buffer, sizeof(buffer));'; \ + echo ' cmpxchg((u32 * volatile)&u, 0, 8);'; \ + echo ' WARN_ON(n == 0xdeadbeef);'; \ + echo '}' + +$(obj)/undefsyms_base.c: FORCE + $(call filechk,undefsyms_base) + +clean-files += undefsyms_base.c + +$(obj)/undefsyms_base.o: $(obj)/undefsyms_base.c + +targets += undefsyms_base.o + +# Ensure KASAN is enabled to avoid logic that may disable FORTIFY_SOURCE when +# KASAN is not enabled. undefsyms_base.o does not automatically get KASAN flags +# because it is not linked into vmlinux. +KASAN_SANITIZE_undefsyms_base.o := y + +UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __x86_indirect_thunk \ + __msan simple_ring_buffer \ + $(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}') + +quiet_cmd_check_undefined = NM $< + cmd_check_undefined = \ + undefsyms=$$($(NM) -u $< | grep -v $(addprefix -e , $(UNDEFINED_ALLOWLIST)) || true); \ + if [ -n "$$undefsyms" ]; then \ + echo "Unexpected symbols in $<:" >&2; \ + echo "$$undefsyms" >&2; \ + false; \ + fi + +$(obj)/%.o.checked: $(obj)/%.o $(obj)/undefsyms_base.o FORCE + $(call if_changed,check_undefined) + +always-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o.checked + libftrace-y := ftrace.o diff --git a/kernel/trace/remote_test.c b/kernel/trace/remote_test.c new file mode 100644 index 000000000000..6c1b7701ddae --- /dev/null +++ b/kernel/trace/remote_test.c @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 - Google LLC + * Author: Vincent Donnefort + */ + +#include +#include +#include +#include +#include + +#define REMOTE_EVENT_INCLUDE_FILE kernel/trace/remote_test_events.h +#include + +static DEFINE_PER_CPU(struct simple_rb_per_cpu *, simple_rbs); +static struct trace_buffer_desc *remote_test_buffer_desc; + +/* + * The trace_remote lock already serializes accesses from the trace_remote_callbacks. + * However write_event can still race with load/unload. 
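+ * The simple_rbs_lock mutex below covers that window: load/unload take it
+ * while installing or clearing a per-CPU buffer, and write_event takes it
+ * before reserving an entry.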
+ */ +static DEFINE_MUTEX(simple_rbs_lock); + +static int remote_test_load_simple_rb(int cpu, struct ring_buffer_desc *rb_desc) +{ + struct simple_rb_per_cpu *cpu_buffer; + struct simple_buffer_page *bpages; + int ret = -ENOMEM; + + cpu_buffer = kmalloc_obj(*cpu_buffer); + if (!cpu_buffer) + return ret; + + bpages = kmalloc_objs(*bpages, rb_desc->nr_page_va); + if (!bpages) + goto err_free_cpu_buffer; + + ret = simple_ring_buffer_init(cpu_buffer, bpages, rb_desc); + if (ret) + goto err_free_bpages; + + scoped_guard(mutex, &simple_rbs_lock) { + WARN_ON(*per_cpu_ptr(&simple_rbs, cpu)); + *per_cpu_ptr(&simple_rbs, cpu) = cpu_buffer; + } + + return 0; + +err_free_bpages: + kfree(bpages); + +err_free_cpu_buffer: + kfree(cpu_buffer); + + return ret; +} + +static void remote_test_unload_simple_rb(int cpu) +{ + struct simple_rb_per_cpu *cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu); + struct simple_buffer_page *bpages; + + if (!cpu_buffer) + return; + + guard(mutex)(&simple_rbs_lock); + + bpages = cpu_buffer->bpages; + simple_ring_buffer_unload(cpu_buffer); + kfree(bpages); + kfree(cpu_buffer); + *per_cpu_ptr(&simple_rbs, cpu) = NULL; +} + +static struct trace_buffer_desc *remote_test_load(unsigned long size, void *unused) +{ + struct ring_buffer_desc *rb_desc; + struct trace_buffer_desc *desc; + size_t desc_size; + int cpu, ret; + + if (WARN_ON(remote_test_buffer_desc)) + return ERR_PTR(-EINVAL); + + desc_size = trace_buffer_desc_size(size, num_possible_cpus()); + if (desc_size == SIZE_MAX) { + ret = -E2BIG; + goto err; + } + + desc = kmalloc(desc_size, GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto err; + } + + ret = trace_remote_alloc_buffer(desc, desc_size, size, cpu_possible_mask); + if (ret) + goto err_free_desc; + + for_each_ring_buffer_desc(rb_desc, cpu, desc) { + ret = remote_test_load_simple_rb(rb_desc->cpu, rb_desc); + if (ret) + goto err_unload; + } + + remote_test_buffer_desc = desc; + + return remote_test_buffer_desc; + +err_unload: + for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc) + remote_test_unload_simple_rb(rb_desc->cpu); + trace_remote_free_buffer(remote_test_buffer_desc); + +err_free_desc: + kfree(desc); + +err: + return ERR_PTR(ret); +} + +static void remote_test_unload(struct trace_buffer_desc *desc, void *unused) +{ + struct ring_buffer_desc *rb_desc; + int cpu; + + if (WARN_ON(desc != remote_test_buffer_desc)) + return; + + for_each_ring_buffer_desc(rb_desc, cpu, desc) + remote_test_unload_simple_rb(rb_desc->cpu); + + remote_test_buffer_desc = NULL; + trace_remote_free_buffer(desc); + kfree(desc); +} + +static int remote_test_enable_tracing(bool enable, void *unused) +{ + struct ring_buffer_desc *rb_desc; + int cpu; + + if (!remote_test_buffer_desc) + return -ENODEV; + + for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc) + WARN_ON(simple_ring_buffer_enable_tracing(*per_cpu_ptr(&simple_rbs, rb_desc->cpu), + enable)); + return 0; +} + +static int remote_test_swap_reader_page(unsigned int cpu, void *unused) +{ + struct simple_rb_per_cpu *cpu_buffer; + + if (cpu >= NR_CPUS) + return -EINVAL; + + cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu); + if (!cpu_buffer) + return -EINVAL; + + return simple_ring_buffer_swap_reader_page(cpu_buffer); +} + +static int remote_test_reset(unsigned int cpu, void *unused) +{ + struct simple_rb_per_cpu *cpu_buffer; + + if (cpu >= NR_CPUS) + return -EINVAL; + + cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu); + if (!cpu_buffer) + return -EINVAL; + + return simple_ring_buffer_reset(cpu_buffer); +} + +static int 
remote_test_enable_event(unsigned short id, bool enable, void *unused) +{ + if (id != REMOTE_TEST_EVENT_ID) + return -EINVAL; + + /* + * Let's just use the struct remote_event enabled field that is turned on and off by + * trace_remote. This is a bit racy but good enough for a simple test module. + */ + return 0; +} + +static ssize_t +write_event_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *pos) +{ + struct remote_event_format_selftest *evt_test; + struct simple_rb_per_cpu *cpu_buffer; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + guard(mutex)(&simple_rbs_lock); + + if (!remote_event_selftest.enabled) + return -ENODEV; + + guard(preempt)(); + + cpu_buffer = *this_cpu_ptr(&simple_rbs); + if (!cpu_buffer) + return -ENODEV; + + evt_test = simple_ring_buffer_reserve(cpu_buffer, + sizeof(struct remote_event_format_selftest), + trace_clock_global()); + if (!evt_test) + return -ENODEV; + + evt_test->hdr.id = REMOTE_TEST_EVENT_ID; + evt_test->id = val; + + simple_ring_buffer_commit(cpu_buffer); + + return cnt; +} + +static const struct file_operations write_event_fops = { + .write = write_event_write, +}; + +static int remote_test_init_tracefs(struct dentry *d, void *unused) +{ + return tracefs_create_file("write_event", 0200, d, NULL, &write_event_fops) ? + 0 : -ENOMEM; +} + +static struct trace_remote_callbacks trace_remote_callbacks = { + .init = remote_test_init_tracefs, + .load_trace_buffer = remote_test_load, + .unload_trace_buffer = remote_test_unload, + .enable_tracing = remote_test_enable_tracing, + .swap_reader_page = remote_test_swap_reader_page, + .reset = remote_test_reset, + .enable_event = remote_test_enable_event, +}; + +static int __init remote_test_init(void) +{ + return trace_remote_register("test", &trace_remote_callbacks, NULL, + &remote_event_selftest, 1); +} + +module_init(remote_test_init); + +MODULE_DESCRIPTION("Test module for the trace remote interface"); +MODULE_AUTHOR("Vincent Donnefort"); +MODULE_LICENSE("GPL"); diff --git a/kernel/trace/remote_test_events.h b/kernel/trace/remote_test_events.h new file mode 100644 index 000000000000..26b93b3406fc --- /dev/null +++ b/kernel/trace/remote_test_events.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define REMOTE_TEST_EVENT_ID 1 + +REMOTE_EVENT(selftest, REMOTE_TEST_EVENT_ID, + RE_STRUCT( + re_field(u64, id) + ), + RE_PRINTK("id=%llu", __entry->id) +); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 170170bd83bd..d6bebb782efc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * * Copyright (C) 2008 Steven Rostedt */ +#include #include #include #include @@ -157,23 +158,6 @@ int ring_buffer_print_entry_header(struct trace_seq *s) /* Used for individual buffers (after the counter) */ #define RB_BUFFER_OFF (1 << 20) -#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) - -#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) -#define RB_ALIGNMENT 4U -#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) -#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ - -#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS -# define RB_FORCE_8BYTE_ALIGNMENT 0 -# define RB_ARCH_ALIGNMENT RB_ALIGNMENT -#else -# define RB_FORCE_8BYTE_ALIGNMENT 1 -# define RB_ARCH_ALIGNMENT 8U -#endif - -#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) - /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... 
RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -316,10 +300,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_online_buffer_cpu(buffer, cpu) \ for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask) -#define TS_SHIFT 27 -#define TS_MASK ((1ULL << TS_SHIFT) - 1) -#define TS_DELTA_TEST (~TS_MASK) - static u64 rb_event_time_stamp(struct ring_buffer_event *event) { u64 ts; @@ -338,12 +318,6 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event) #define RB_MISSED_MASK (3 << 30) -struct buffer_data_page { - u64 time_stamp; /* page time stamp */ - local_t commit; /* write committed index */ - unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ -}; - struct buffer_data_read_page { unsigned order; /* order of the page */ struct buffer_data_page *data; /* actual data, stored in this page */ @@ -437,14 +411,6 @@ static struct buffer_data_page *alloc_cpu_data(int cpu, int order) return dpage; } -/* - * We need to fit the time_stamp delta into 27 bits. - */ -static inline bool test_time_stamp(u64 delta) -{ - return !!(delta & TS_DELTA_TEST); -} - struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; @@ -555,10 +521,12 @@ struct ring_buffer_per_cpu { unsigned int mapped; unsigned int user_mapped; /* user space mapping */ struct mutex mapping_lock; - unsigned long *subbuf_ids; /* ID to subbuf VA */ + struct buffer_page **subbuf_ids; /* ID to subbuf VA */ struct trace_buffer_meta *meta_page; struct ring_buffer_cpu_meta *ring_meta; + struct ring_buffer_remote *remote; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ @@ -581,6 +549,8 @@ struct trace_buffer { struct ring_buffer_per_cpu **buffers; + struct ring_buffer_remote *remote; + struct hlist_node node; u64 (*clock)(void); @@ -636,7 +606,8 @@ int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq trace_seq_printf(s, "\tfield: char data;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), - (unsigned int)buffer->subbuf_size, + (unsigned int)(buffer ? buffer->subbuf_size : + PAGE_SIZE - BUF_PAGE_HDR_SIZE), (unsigned int)is_signed_type(char)); return !trace_seq_has_overflowed(s); @@ -2238,6 +2209,40 @@ static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer, } } +static struct ring_buffer_desc *ring_buffer_desc(struct trace_buffer_desc *trace_desc, int cpu) +{ + struct ring_buffer_desc *desc, *end; + size_t len; + int i; + + if (!trace_desc) + return NULL; + + if (cpu >= trace_desc->nr_cpus) + return NULL; + + end = (struct ring_buffer_desc *)((void *)trace_desc + trace_desc->struct_len); + desc = __first_ring_buffer_desc(trace_desc); + len = struct_size(desc, page_va, desc->nr_page_va); + desc = (struct ring_buffer_desc *)((void *)desc + (len * cpu)); + + if (desc < end && desc->cpu == cpu) + return desc; + + /* Missing CPUs, need to linear search */ + for_each_ring_buffer_desc(desc, i, trace_desc) { + if (desc->cpu == cpu) + return desc; + } + + return NULL; +} + +static void *ring_buffer_desc_page(struct ring_buffer_desc *desc, int page_id) +{ + return page_id > desc->nr_page_va ? 
NULL : (void *)desc->page_va[page_id]; +} + static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { @@ -2245,6 +2250,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_cpu_meta *meta = NULL; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; + struct ring_buffer_desc *desc = NULL; long i; /* @@ -2273,6 +2279,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, if (buffer->range_addr_start) meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu); + if (buffer->remote) { + desc = ring_buffer_desc(buffer->remote->desc, cpu_buffer->cpu); + if (!desc || WARN_ON(desc->nr_page_va != (nr_pages + 1))) + return -EINVAL; + } + for (i = 0; i < nr_pages; i++) { bpage = alloc_cpu_page(cpu_buffer->cpu); @@ -2297,6 +2309,16 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; bpage->id = i + 1; + } else if (desc) { + void *p = ring_buffer_desc_page(desc, i + 1); + + if (WARN_ON(!p)) + goto free_pages; + + bpage->page = p; + bpage->range = 1; /* bpage->page can't be freed */ + bpage->id = i + 1; + cpu_buffer->subbuf_ids[i + 1] = bpage; } else { int order = cpu_buffer->buffer->subbuf_order; bpage->page = alloc_cpu_data(cpu_buffer->cpu, order); @@ -2394,6 +2416,30 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) if (cpu_buffer->ring_meta->head_buffer) rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; + } else if (buffer->remote) { + struct ring_buffer_desc *desc = ring_buffer_desc(buffer->remote->desc, cpu); + + if (!desc) + goto fail_free_reader; + + cpu_buffer->remote = buffer->remote; + cpu_buffer->meta_page = (struct trace_buffer_meta *)(void *)desc->meta_va; + cpu_buffer->nr_pages = nr_pages; + cpu_buffer->subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, + sizeof(*cpu_buffer->subbuf_ids), GFP_KERNEL); + if (!cpu_buffer->subbuf_ids) + goto fail_free_reader; + + /* Remote buffers are read-only and immutable */ + atomic_inc(&cpu_buffer->record_disabled); + atomic_inc(&cpu_buffer->resize_disabled); + + bpage->page = ring_buffer_desc_page(desc, cpu_buffer->meta_page->reader.id); + if (!bpage->page) + goto fail_free_reader; + + bpage->range = 1; + cpu_buffer->subbuf_ids[0] = bpage; } else { int order = cpu_buffer->buffer->subbuf_order; bpage->page = alloc_cpu_data(cpu, order); @@ -2453,6 +2499,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) irq_work_sync(&cpu_buffer->irq_work.work); + if (cpu_buffer->remote) + kfree(cpu_buffer->subbuf_ids); + free_buffer_page(cpu_buffer->reader_page); if (head) { @@ -2475,7 +2524,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long end, unsigned long scratch_size, - struct lock_class_key *key) + struct lock_class_key *key, + struct ring_buffer_remote *remote) { struct trace_buffer *buffer __free(kfree) = NULL; long nr_pages; @@ -2515,6 +2565,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, if (!buffer->buffers) goto fail_free_cpumask; + cpu = raw_smp_processor_id(); + /* If start/end are specified, then that overrides size */ if (start && end) { unsigned long buffers_start; @@ -2570,6 +2622,15 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, buffer->range_addr_end = end; rb_range_meta_init(buffer, nr_pages, scratch_size); + } else if (remote) { + 
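+		/*
+		 * Size the ring-buffer from the remote's descriptor: the
+		 * pages are pre-allocated by the remote and one of them is
+		 * reserved for the reader.
+		 */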
struct ring_buffer_desc *desc = ring_buffer_desc(remote->desc, cpu); + + buffer->remote = remote; + /* The writer is remote. This ring-buffer is read-only */ + atomic_inc(&buffer->record_disabled); + nr_pages = desc->nr_page_va - 1; + if (nr_pages < 2) + goto fail_free_buffers; } else { /* need at least two pages */ @@ -2578,7 +2639,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, nr_pages = 2; } - cpu = raw_smp_processor_id(); cpumask_set_cpu(cpu, buffer->cpumask); buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) @@ -2620,7 +2680,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { /* Default buffer page size - one system page */ - return alloc_buffer(size, flags, 0, 0, 0, 0, key); + return alloc_buffer(size, flags, 0, 0, 0, 0, key, NULL); } EXPORT_SYMBOL_GPL(__ring_buffer_alloc); @@ -2647,7 +2707,18 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag struct lock_class_key *key) { return alloc_buffer(size, flags, order, start, start + range_size, - scratch_size, key); + scratch_size, key, NULL); +} + +/** + * __ring_buffer_alloc_remote - allocate a new ring_buffer from a remote + * @remote: Contains a description of the ring-buffer pages and remote callbacks. + * @key: ring buffer reader_lock_key. + */ +struct trace_buffer *__ring_buffer_alloc_remote(struct ring_buffer_remote *remote, + struct lock_class_key *key) +{ + return alloc_buffer(0, 0, 0, 0, 0, 0, key, remote); } void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size) @@ -5274,10 +5345,61 @@ unsigned long ring_buffer_overruns(struct trace_buffer *buffer) } EXPORT_SYMBOL_GPL(ring_buffer_overruns); +static bool rb_read_remote_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + local_set(&cpu_buffer->entries, READ_ONCE(cpu_buffer->meta_page->entries)); + local_set(&cpu_buffer->overrun, READ_ONCE(cpu_buffer->meta_page->overrun)); + local_set(&cpu_buffer->pages_touched, READ_ONCE(cpu_buffer->meta_page->pages_touched)); + local_set(&cpu_buffer->pages_lost, READ_ONCE(cpu_buffer->meta_page->pages_lost)); + + return rb_num_of_entries(cpu_buffer); +} + +static void rb_update_remote_head(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *next, *orig; + int retry = 3; + + orig = next = cpu_buffer->head_page; + rb_inc_page(&next); + + /* Run after the writer */ + while (cpu_buffer->head_page->page->time_stamp > next->page->time_stamp) { + rb_inc_page(&next); + + rb_list_head_clear(cpu_buffer->head_page->list.prev); + rb_inc_page(&cpu_buffer->head_page); + rb_set_list_to_head(cpu_buffer->head_page->list.prev); + + if (cpu_buffer->head_page == orig) { + if (WARN_ON_ONCE(!(--retry))) + return; + } + } + + orig = cpu_buffer->commit_page = cpu_buffer->head_page; + retry = 3; + + while (cpu_buffer->commit_page->page->time_stamp < next->page->time_stamp) { + rb_inc_page(&next); + rb_inc_page(&cpu_buffer->commit_page); + + if (cpu_buffer->commit_page == orig) { + if (WARN_ON_ONCE(!(--retry))) + return; + } + } +} + static void rb_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + if (cpu_buffer->remote) { + rb_read_remote_meta_page(cpu_buffer); + rb_update_remote_head(cpu_buffer); + } + /* Iterator usage is expected to have record disabled */ iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; @@ -5428,7 +5550,65 @@ rb_update_iter_read_stamp(struct 
ring_buffer_iter *iter, } static struct buffer_page * -rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +__rb_get_reader_page_from_remote(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *new_reader, *prev_reader, *prev_head, *new_head, *last; + + if (!rb_read_remote_meta_page(cpu_buffer)) + return NULL; + + /* More to read on the reader page */ + if (cpu_buffer->reader_page->read < rb_page_size(cpu_buffer->reader_page)) { + if (!cpu_buffer->reader_page->read) + cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; + return cpu_buffer->reader_page; + } + + prev_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id]; + + WARN_ON_ONCE(cpu_buffer->remote->swap_reader_page(cpu_buffer->cpu, + cpu_buffer->remote->priv)); + /* nr_pages doesn't include the reader page */ + if (WARN_ON_ONCE(cpu_buffer->meta_page->reader.id > cpu_buffer->nr_pages)) + return NULL; + + new_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id]; + + WARN_ON_ONCE(prev_reader == new_reader); + + prev_head = new_reader; /* New reader was also the previous head */ + new_head = prev_head; + rb_inc_page(&new_head); + last = prev_head; + rb_dec_page(&last); + + /* Clear the old HEAD flag */ + rb_list_head_clear(cpu_buffer->head_page->list.prev); + + prev_reader->list.next = prev_head->list.next; + prev_reader->list.prev = prev_head->list.prev; + + /* Swap prev_reader with new_reader */ + last->list.next = &prev_reader->list; + new_head->list.prev = &prev_reader->list; + + new_reader->list.prev = &new_reader->list; + new_reader->list.next = &new_head->list; + + /* Reactivate the HEAD flag */ + rb_set_list_to_head(&last->list); + + cpu_buffer->head_page = new_head; + cpu_buffer->reader_page = new_reader; + cpu_buffer->pages = &new_head->list; + cpu_buffer->read_stamp = new_reader->page->time_stamp; + cpu_buffer->lost_events = cpu_buffer->meta_page->reader.lost_events; + + return rb_page_size(cpu_buffer->reader_page) ? cpu_buffer->reader_page : NULL; +} + +static struct buffer_page * +__rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); @@ -5598,6 +5778,13 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) return reader; } +static struct buffer_page * +rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + return cpu_buffer->remote ? 
__rb_get_reader_page_from_remote(cpu_buffer) : + __rb_get_reader_page(cpu_buffer); +} + static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_event *event; @@ -6154,6 +6341,8 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) meta->entries = local_read(&cpu_buffer->entries); meta->overrun = local_read(&cpu_buffer->overrun); meta->read = cpu_buffer->read; + meta->pages_lost = local_read(&cpu_buffer->pages_lost); + meta->pages_touched = local_read(&cpu_buffer->pages_touched); /* Some archs do not have data cache coherency between kernel and user-space */ flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE); @@ -6164,6 +6353,23 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *page; + if (cpu_buffer->remote) { + if (!cpu_buffer->remote->reset) + return; + + cpu_buffer->remote->reset(cpu_buffer->cpu, cpu_buffer->remote->priv); + rb_read_remote_meta_page(cpu_buffer); + + /* Read related values, not covered by the meta-page */ + local_set(&cpu_buffer->pages_read, 0); + cpu_buffer->read = 0; + cpu_buffer->read_bytes = 0; + cpu_buffer->last_overrun = 0; + cpu_buffer->reader_page->read = 0; + + return; + } + rb_head_page_deactivate(cpu_buffer); cpu_buffer->head_page @@ -6394,6 +6600,46 @@ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); +int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (cpu != RING_BUFFER_ALL_CPUS) { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + guard(raw_spinlock)(&cpu_buffer->reader_lock); + if (rb_read_remote_meta_page(cpu_buffer)) + rb_wakeups(buffer, cpu_buffer); + + return 0; + } + + guard(cpus_read_lock)(); + + /* + * Make sure all the ring buffers are up to date before we start reading + * them. 
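+	 *
+	 * The meta-pages are refreshed in a first pass, each under its
+	 * reader_lock, and readers are only woken up in a second pass, once
+	 * all the per-CPU state is in sync.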
+ */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + guard(raw_spinlock)(&cpu_buffer->reader_lock); + rb_read_remote_meta_page(cpu_buffer); + } + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + if (rb_num_of_entries(cpu_buffer)) + rb_wakeups(buffer, cpu_buffer); + } + + return 0; +} + #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /** * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers @@ -6632,6 +6878,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, unsigned int commit; unsigned int read; u64 save_timestamp; + bool force_memcpy; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -1; @@ -6669,6 +6916,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; + force_memcpy = cpu_buffer->mapped || cpu_buffer->remote; + /* * If this page has been partially read or * if len is not big enough to read the rest of the page or @@ -6678,7 +6927,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, */ if (read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page || - cpu_buffer->mapped) { + force_memcpy) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; unsigned int rpos = read; unsigned int pos = 0; @@ -7034,7 +7283,7 @@ static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) } static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long *subbuf_ids) + struct buffer_page **subbuf_ids) { struct trace_buffer_meta *meta = cpu_buffer->meta_page; unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; @@ -7043,7 +7292,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, int id = 0; id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id); - subbuf_ids[id++] = (unsigned long)cpu_buffer->reader_page->page; + subbuf_ids[id++] = cpu_buffer->reader_page; cnt++; first_subbuf = subbuf = rb_set_head_page(cpu_buffer); @@ -7053,7 +7302,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, if (WARN_ON(id >= nr_subbufs)) break; - subbuf_ids[id] = (unsigned long)subbuf->page; + subbuf_ids[id] = subbuf; rb_inc_page(&subbuf); id++; @@ -7062,7 +7311,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, WARN_ON(cnt != nr_subbufs); - /* install subbuf ID to kern VA translation */ + /* install subbuf ID to bpage translation */ cpu_buffer->subbuf_ids = subbuf_ids; meta->meta_struct_len = sizeof(*meta); @@ -7218,13 +7467,15 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, } while (p < nr_pages) { + struct buffer_page *subbuf; struct page *page; int off = 0; if (WARN_ON_ONCE(s >= nr_subbufs)) return -EINVAL; - page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); + subbuf = cpu_buffer->subbuf_ids[s]; + page = virt_to_page((void *)subbuf->page); for (; off < (1 << (subbuf_order)); off++, page++) { if (p >= nr_pages) @@ -7251,10 +7502,11 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, struct vm_area_struct *vma) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long flags, *subbuf_ids; + struct buffer_page **subbuf_ids; + unsigned long flags; int err; - if (!cpumask_test_cpu(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask) || buffer->remote) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; @@ -7275,7 +7527,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, if (err) return err; - /* subbuf_ids include the reader while nr_pages does not 
*/ + /* subbuf_ids includes the reader while nr_pages does not */ subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); if (!subbuf_ids) { rb_free_meta_page(cpu_buffer); diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c new file mode 100644 index 000000000000..02af2297ae5a --- /dev/null +++ b/kernel/trace/simple_ring_buffer.c @@ -0,0 +1,517 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 - Google LLC + * Author: Vincent Donnefort + */ + +#include +#include + +#include +#include + +enum simple_rb_link_type { + SIMPLE_RB_LINK_NORMAL = 0, + SIMPLE_RB_LINK_HEAD = 1, + SIMPLE_RB_LINK_HEAD_MOVING +}; + +#define SIMPLE_RB_LINK_MASK ~(SIMPLE_RB_LINK_HEAD | SIMPLE_RB_LINK_HEAD_MOVING) + +static void simple_bpage_set_head_link(struct simple_buffer_page *bpage) +{ + unsigned long link = (unsigned long)bpage->link.next; + + link &= SIMPLE_RB_LINK_MASK; + link |= SIMPLE_RB_LINK_HEAD; + + /* + * Paired with simple_rb_find_head() to order access between the head + * link and overrun. It ensures we always report an up-to-date value + * after swapping the reader page. + */ + smp_store_release(&bpage->link.next, (struct list_head *)link); +} + +static bool simple_bpage_unset_head_link(struct simple_buffer_page *bpage, + struct simple_buffer_page *dst, + enum simple_rb_link_type new_type) +{ + unsigned long *link = (unsigned long *)(&bpage->link.next); + unsigned long old = (*link & SIMPLE_RB_LINK_MASK) | SIMPLE_RB_LINK_HEAD; + unsigned long new = (unsigned long)(&dst->link) | new_type; + + return try_cmpxchg(link, &old, new); +} + +static void simple_bpage_set_normal_link(struct simple_buffer_page *bpage) +{ + unsigned long link = (unsigned long)bpage->link.next; + + WRITE_ONCE(bpage->link.next, (struct list_head *)(link & SIMPLE_RB_LINK_MASK)); +} + +static struct simple_buffer_page *simple_bpage_from_link(struct list_head *link) +{ + unsigned long ptr = (unsigned long)link & SIMPLE_RB_LINK_MASK; + + return container_of((struct list_head *)ptr, struct simple_buffer_page, link); +} + +static struct simple_buffer_page *simple_bpage_next_page(struct simple_buffer_page *bpage) +{ + return simple_bpage_from_link(bpage->link.next); +} + +static void simple_bpage_reset(struct simple_buffer_page *bpage) +{ + bpage->write = 0; + bpage->entries = 0; + + local_set(&bpage->page->commit, 0); +} + +static void simple_bpage_init(struct simple_buffer_page *bpage, void *page) +{ + INIT_LIST_HEAD(&bpage->link); + bpage->page = (struct buffer_data_page *)page; + + simple_bpage_reset(bpage); +} + +#define simple_rb_meta_inc(__meta, __inc) \ + WRITE_ONCE((__meta), (__meta + __inc)) + +static bool simple_rb_loaded(struct simple_rb_per_cpu *cpu_buffer) +{ + return !!cpu_buffer->bpages; +} + +static int simple_rb_find_head(struct simple_rb_per_cpu *cpu_buffer) +{ + int retry = cpu_buffer->nr_pages * 2; + struct simple_buffer_page *head; + + head = cpu_buffer->head_page; + + while (retry--) { + unsigned long link; + +spin: + /* See smp_store_release in simple_bpage_set_head_link() */ + link = (unsigned long)smp_load_acquire(&head->link.prev->next); + + switch (link & ~SIMPLE_RB_LINK_MASK) { + /* Found the head */ + case SIMPLE_RB_LINK_HEAD: + cpu_buffer->head_page = head; + return 0; + /* The writer caught the head, we can spin, that won't be long */ + case SIMPLE_RB_LINK_HEAD_MOVING: + goto spin; + } + + head = simple_bpage_next_page(head); + } + + return -EBUSY; +} + +/** + * simple_ring_buffer_swap_reader_page - Swap ring-buffer head with the reader 
+ * @cpu_buffer:	A simple_rb_per_cpu
+ *
+ * This function enables consuming reads: it ensures the current head page will
+ * not be overwritten by the writer and can safely be read.
+ *
+ * Returns 0 on success, -ENODEV if @cpu_buffer was unloaded, -EBUSY if the
+ * head page could not be caught, or -EINVAL if the reader page could not be
+ * installed in its place.
+ */
+int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer)
+{
+	struct simple_buffer_page *last, *head, *reader;
+	unsigned long overrun;
+	int retry = 8;
+	int ret;
+
+	if (!simple_rb_loaded(cpu_buffer))
+		return -ENODEV;
+
+	reader = cpu_buffer->reader_page;
+
+	do {
+		/* Run after the writer to find the head */
+		ret = simple_rb_find_head(cpu_buffer);
+		if (ret)
+			return ret;
+
+		head = cpu_buffer->head_page;
+
+		/* Connect the reader page around the head page */
+		reader->link.next = head->link.next;
+		reader->link.prev = head->link.prev;
+
+		/* The last page before the head */
+		last = simple_bpage_from_link(head->link.prev);
+
+		/* The reader page points to the new head page */
+		simple_bpage_set_head_link(reader);
+
+		overrun = cpu_buffer->meta->overrun;
+	} while (!simple_bpage_unset_head_link(last, reader, SIMPLE_RB_LINK_NORMAL) && retry--);
+
+	if (retry < 0)
+		return -EINVAL;
+
+	cpu_buffer->head_page = simple_bpage_from_link(reader->link.next);
+	cpu_buffer->head_page->link.prev = &reader->link;
+	cpu_buffer->reader_page = head;
+	cpu_buffer->meta->reader.lost_events = overrun - cpu_buffer->last_overrun;
+	cpu_buffer->meta->reader.id = cpu_buffer->reader_page->id;
+	cpu_buffer->last_overrun = overrun;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(simple_ring_buffer_swap_reader_page);
+
+static struct simple_buffer_page *simple_rb_move_tail(struct simple_rb_per_cpu *cpu_buffer)
+{
+	struct simple_buffer_page *tail, *new_tail;
+
+	tail = cpu_buffer->tail_page;
+	new_tail = simple_bpage_next_page(tail);
+
+	if (simple_bpage_unset_head_link(tail, new_tail, SIMPLE_RB_LINK_HEAD_MOVING)) {
+		/*
+		 * Oh no! We've caught the head. There is none anymore and
+		 * swap_reader will spin until we set the new one. Overrun must
+		 * be written first, to make sure we report the correct number
+		 * of lost events.
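+		 * (The reader computes lost_events as "overrun - last_overrun"
+		 * in simple_ring_buffer_swap_reader_page(), so the overrun
+		 * update must be visible no later than the new head link.)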
+ */ + simple_rb_meta_inc(cpu_buffer->meta->overrun, new_tail->entries); + simple_rb_meta_inc(cpu_buffer->meta->pages_lost, 1); + + simple_bpage_set_head_link(new_tail); + simple_bpage_set_normal_link(tail); + } + + simple_bpage_reset(new_tail); + cpu_buffer->tail_page = new_tail; + + simple_rb_meta_inc(cpu_buffer->meta->pages_touched, 1); + + return new_tail; +} + +static unsigned long rb_event_size(unsigned long length) +{ + struct ring_buffer_event *event; + + return length + RB_EVNT_HDR_SIZE + sizeof(event->array[0]); +} + +static struct ring_buffer_event * +rb_event_add_ts_extend(struct ring_buffer_event *event, u64 delta) +{ + event->type_len = RINGBUF_TYPE_TIME_EXTEND; + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + + return (struct ring_buffer_event *)((unsigned long)event + 8); +} + +static struct ring_buffer_event * +simple_rb_reserve_next(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, u64 timestamp) +{ + unsigned long ts_ext_size = 0, event_size = rb_event_size(length); + struct simple_buffer_page *tail = cpu_buffer->tail_page; + struct ring_buffer_event *event; + u32 write, prev_write; + u64 time_delta; + + time_delta = timestamp - cpu_buffer->write_stamp; + + if (test_time_stamp(time_delta)) + ts_ext_size = 8; + + prev_write = tail->write; + write = prev_write + event_size + ts_ext_size; + + if (unlikely(write > (PAGE_SIZE - BUF_PAGE_HDR_SIZE))) + tail = simple_rb_move_tail(cpu_buffer); + + if (!tail->entries) { + tail->page->time_stamp = timestamp; + time_delta = 0; + ts_ext_size = 0; + write = event_size; + prev_write = 0; + } + + tail->write = write; + tail->entries++; + + cpu_buffer->write_stamp = timestamp; + + event = (struct ring_buffer_event *)(tail->page->data + prev_write); + if (ts_ext_size) { + event = rb_event_add_ts_extend(event, time_delta); + time_delta = 0; + } + + event->type_len = 0; + event->time_delta = time_delta; + event->array[0] = event_size - RB_EVNT_HDR_SIZE; + + return event; +} + +/** + * simple_ring_buffer_reserve - Reserve an entry in @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * @length: Size of the entry in bytes + * @timestamp: Timestamp of the entry + * + * Returns the address of the entry where to write data or NULL + */ +void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, + u64 timestamp) +{ + struct ring_buffer_event *rb_event; + + if (cmpxchg(&cpu_buffer->status, SIMPLE_RB_READY, SIMPLE_RB_WRITING) != SIMPLE_RB_READY) + return NULL; + + rb_event = simple_rb_reserve_next(cpu_buffer, length, timestamp); + + return &rb_event->array[1]; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_reserve); + +/** + * simple_ring_buffer_commit - Commit the entry reserved with simple_ring_buffer_reserve() + * @cpu_buffer: The simple_rb_per_cpu where the entry has been reserved + */ +void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer) +{ + local_set(&cpu_buffer->tail_page->page->commit, + cpu_buffer->tail_page->write); + simple_rb_meta_inc(cpu_buffer->meta->entries, 1); + + /* + * Paired with simple_rb_enable_tracing() to ensure data is + * written to the ring-buffer before teardown. 
+ */ + smp_store_release(&cpu_buffer->status, SIMPLE_RB_READY); +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_commit); + +static u32 simple_rb_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable) +{ + u32 prev_status; + + if (enable) + return cmpxchg(&cpu_buffer->status, SIMPLE_RB_UNAVAILABLE, SIMPLE_RB_READY); + + /* Wait for the buffer to be released */ + do { + prev_status = cmpxchg_acquire(&cpu_buffer->status, + SIMPLE_RB_READY, + SIMPLE_RB_UNAVAILABLE); + } while (prev_status == SIMPLE_RB_WRITING); + + return prev_status; +} + +/** + * simple_ring_buffer_reset - Reset @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * + * This will not clear the content of the data, only reset counters and pointers + * + * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded. + */ +int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer) +{ + struct simple_buffer_page *bpage; + u32 prev_status; + int ret; + + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + prev_status = simple_rb_enable_tracing(cpu_buffer, false); + + ret = simple_rb_find_head(cpu_buffer); + if (ret) + return ret; + + bpage = cpu_buffer->tail_page = cpu_buffer->head_page; + do { + simple_bpage_reset(bpage); + bpage = simple_bpage_next_page(bpage); + } while (bpage != cpu_buffer->head_page); + + simple_bpage_reset(cpu_buffer->reader_page); + + cpu_buffer->last_overrun = 0; + cpu_buffer->write_stamp = 0; + + cpu_buffer->meta->reader.read = 0; + cpu_buffer->meta->reader.lost_events = 0; + cpu_buffer->meta->entries = 0; + cpu_buffer->meta->overrun = 0; + cpu_buffer->meta->read = 0; + cpu_buffer->meta->pages_lost = 0; + cpu_buffer->meta->pages_touched = 0; + + if (prev_status == SIMPLE_RB_READY) + simple_rb_enable_tracing(cpu_buffer, true); + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_reset); + +int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer, + struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc, + void *(*load_page)(unsigned long va), + void (*unload_page)(void *va)) +{ + struct simple_buffer_page *bpage = bpages; + int ret = 0; + void *page; + int i; + + /* At least 1 reader page and two pages in the ring-buffer */ + if (desc->nr_page_va < 3) + return -EINVAL; + + memset(cpu_buffer, 0, sizeof(*cpu_buffer)); + + cpu_buffer->meta = load_page(desc->meta_va); + if (!cpu_buffer->meta) + return -EINVAL; + + memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta)); + cpu_buffer->meta->meta_page_size = PAGE_SIZE; + cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages; + + /* The reader page is not part of the ring initially */ + page = load_page(desc->page_va[0]); + if (!page) { + unload_page(cpu_buffer->meta); + return -EINVAL; + } + + simple_bpage_init(bpage, page); + bpage->id = 0; + + cpu_buffer->nr_pages = 1; + + cpu_buffer->reader_page = bpage; + cpu_buffer->tail_page = bpage + 1; + cpu_buffer->head_page = bpage + 1; + + for (i = 1; i < desc->nr_page_va; i++) { + page = load_page(desc->page_va[i]); + if (!page) { + ret = -EINVAL; + break; + } + + simple_bpage_init(++bpage, page); + + bpage->link.next = &(bpage + 1)->link; + bpage->link.prev = &(bpage - 1)->link; + bpage->id = i; + + cpu_buffer->nr_pages = i + 1; + } + + if (ret) { + for (i--; i >= 0; i--) + unload_page((void *)desc->page_va[i]); + unload_page(cpu_buffer->meta); + + return ret; + } + + /* Close the ring */ + bpage->link.next = &cpu_buffer->tail_page->link; + cpu_buffer->tail_page->link.prev = &bpage->link; + + /* The last init'ed page points to the head page */ + 
simple_bpage_set_head_link(bpage); + + cpu_buffer->bpages = bpages; + + return 0; +} + +static void *__load_page(unsigned long page) +{ + return (void *)page; +} + +static void __unload_page(void *page) { } + +/** + * simple_ring_buffer_init - Init @cpu_buffer based on @desc + * @cpu_buffer: A simple_rb_per_cpu buffer to init, allocated by the caller. + * @bpages: Array of simple_buffer_pages, with as many elements as @desc->nr_page_va + * @desc: A ring_buffer_desc + * + * Returns 0 on success or -EINVAL if the content of @desc is invalid + */ +int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc) +{ + return simple_ring_buffer_init_mm(cpu_buffer, bpages, desc, __load_page, __unload_page); +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_init); + +void simple_ring_buffer_unload_mm(struct simple_rb_per_cpu *cpu_buffer, + void (*unload_page)(void *)) +{ + int p; + + if (!simple_rb_loaded(cpu_buffer)) + return; + + simple_rb_enable_tracing(cpu_buffer, false); + + unload_page(cpu_buffer->meta); + for (p = 0; p < cpu_buffer->nr_pages; p++) + unload_page(cpu_buffer->bpages[p].page); + + cpu_buffer->bpages = NULL; +} + +/** + * simple_ring_buffer_unload - Prepare @cpu_buffer for deletion + * @cpu_buffer: A simple_rb_per_cpu that will be deleted. + */ +void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer) +{ + return simple_ring_buffer_unload_mm(cpu_buffer, __unload_page); +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_unload); + +/** + * simple_ring_buffer_enable_tracing - Enable or disable writing to @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * @enable: True to enable tracing, False to disable it + * + * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded + */ +int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable) +{ + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + simple_rb_enable_tracing(cpu_buffer, enable); + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_enable_tracing); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a626211ceb9a..4189ec9df6a5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3856,7 +3856,7 @@ static int s_show(struct seq_file *m, void *v) * Should be used after trace_array_get(), trace_types_lock * ensures that i_cdev was already initialized. 
*/ -static inline int tracing_get_cpu(struct inode *inode) +int tracing_get_cpu(struct inode *inode) { if (inode->i_cdev) /* See trace_create_cpu_file() */ return (long)inode->i_cdev - 1; @@ -8606,7 +8606,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) return tr->percpu_dir; } -static struct dentry * +struct dentry * trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, void *data, long cpu, const struct file_operations *fops) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b8f3804586a0..fbbd0b2ee76f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -689,6 +689,13 @@ struct dentry *trace_create_file(const char *name, struct dentry *parent, void *data, const struct file_operations *fops); +struct dentry *trace_create_cpu_file(const char *name, + umode_t mode, + struct dentry *parent, + void *data, + long cpu, + const struct file_operations *fops); +int tracing_get_cpu(struct inode *inode); /** diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c new file mode 100644 index 000000000000..d6c3f94d67cd --- /dev/null +++ b/kernel/trace/trace_remote.c @@ -0,0 +1,1384 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 - Google LLC + * Author: Vincent Donnefort + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +#define TRACEFS_DIR "remotes" +#define TRACEFS_MODE_WRITE 0640 +#define TRACEFS_MODE_READ 0440 + +enum tri_type { + TRI_CONSUMING, + TRI_NONCONSUMING, +}; + +struct trace_remote_iterator { + struct trace_remote *remote; + struct trace_seq seq; + struct delayed_work poll_work; + unsigned long lost_events; + u64 ts; + struct ring_buffer_iter *rb_iter; + struct ring_buffer_iter **rb_iters; + struct remote_event_hdr *evt; + int cpu; + int evt_cpu; + loff_t pos; + enum tri_type type; +}; + +struct trace_remote { + struct trace_remote_callbacks *cbs; + void *priv; + struct trace_buffer *trace_buffer; + struct trace_buffer_desc *trace_buffer_desc; + struct dentry *dentry; + struct eventfs_inode *eventfs; + struct remote_event *events; + unsigned long nr_events; + unsigned long trace_buffer_size; + struct ring_buffer_remote rb_remote; + struct mutex lock; + struct rw_semaphore reader_lock; + struct rw_semaphore *pcpu_reader_locks; + unsigned int nr_readers; + unsigned int poll_ms; + bool tracing_on; +}; + +static bool trace_remote_loaded(struct trace_remote *remote) +{ + return !!remote->trace_buffer; +} + +static int trace_remote_load(struct trace_remote *remote) +{ + struct ring_buffer_remote *rb_remote = &remote->rb_remote; + struct trace_buffer_desc *desc; + + lockdep_assert_held(&remote->lock); + + if (trace_remote_loaded(remote)) + return 0; + + desc = remote->cbs->load_trace_buffer(remote->trace_buffer_size, remote->priv); + if (IS_ERR(desc)) + return PTR_ERR(desc); + + rb_remote->desc = desc; + rb_remote->swap_reader_page = remote->cbs->swap_reader_page; + rb_remote->priv = remote->priv; + rb_remote->reset = remote->cbs->reset; + remote->trace_buffer = ring_buffer_alloc_remote(rb_remote); + if (!remote->trace_buffer) { + remote->cbs->unload_trace_buffer(desc, remote->priv); + return -ENOMEM; + } + + remote->trace_buffer_desc = desc; + + return 0; +} + +static void trace_remote_try_unload(struct trace_remote *remote) +{ + lockdep_assert_held(&remote->lock); + + if (!trace_remote_loaded(remote)) + return; + + /* The buffer is being read or writable */ + if (remote->nr_readers || remote->tracing_on) + return; + + /* The 
buffer has readable data */ + if (!ring_buffer_empty(remote->trace_buffer)) + return; + + ring_buffer_free(remote->trace_buffer); + remote->trace_buffer = NULL; + remote->cbs->unload_trace_buffer(remote->trace_buffer_desc, remote->priv); +} + +static int trace_remote_enable_tracing(struct trace_remote *remote) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (remote->tracing_on) + return 0; + + ret = trace_remote_load(remote); + if (ret) + return ret; + + ret = remote->cbs->enable_tracing(true, remote->priv); + if (ret) { + trace_remote_try_unload(remote); + return ret; + } + + remote->tracing_on = true; + + return 0; +} + +static int trace_remote_disable_tracing(struct trace_remote *remote) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (!remote->tracing_on) + return 0; + + ret = remote->cbs->enable_tracing(false, remote->priv); + if (ret) + return ret; + + ring_buffer_poll_remote(remote->trace_buffer, RING_BUFFER_ALL_CPUS); + remote->tracing_on = false; + trace_remote_try_unload(remote); + + return 0; +} + +static void trace_remote_reset(struct trace_remote *remote, int cpu) +{ + lockdep_assert_held(&remote->lock); + + if (!trace_remote_loaded(remote)) + return; + + if (cpu == RING_BUFFER_ALL_CPUS) + ring_buffer_reset(remote->trace_buffer); + else + ring_buffer_reset_cpu(remote->trace_buffer, cpu); + + trace_remote_try_unload(remote); +} + +static ssize_t +tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct seq_file *seq = filp->private_data; + struct trace_remote *remote = seq->private; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + guard(mutex)(&remote->lock); + + ret = val ? trace_remote_enable_tracing(remote) : trace_remote_disable_tracing(remote); + if (ret) + return ret; + + return cnt; +} +static int tracing_on_show(struct seq_file *s, void *unused) +{ + struct trace_remote *remote = s->private; + + seq_printf(s, "%d\n", remote->tracing_on); + + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(tracing_on); + +static ssize_t buffer_size_kb_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct seq_file *seq = filp->private_data; + struct trace_remote *remote = seq->private; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + /* KiB to Bytes */ + if (!val || check_shl_overflow(val, 10, &val)) + return -EINVAL; + + guard(mutex)(&remote->lock); + + if (trace_remote_loaded(remote)) + return -EBUSY; + + remote->trace_buffer_size = val; + + return cnt; +} + +static int buffer_size_kb_show(struct seq_file *s, void *unused) +{ + struct trace_remote *remote = s->private; + + seq_printf(s, "%lu (%s)\n", remote->trace_buffer_size >> 10, + trace_remote_loaded(remote) ? 
"loaded" : "unloaded"); + + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(buffer_size_kb); + +static int trace_remote_get(struct trace_remote *remote, int cpu) +{ + int ret; + + if (remote->nr_readers == UINT_MAX) + return -EBUSY; + + ret = trace_remote_load(remote); + if (ret) + return ret; + + if (cpu != RING_BUFFER_ALL_CPUS && !remote->pcpu_reader_locks) { + int lock_cpu; + + remote->pcpu_reader_locks = kcalloc(nr_cpu_ids, sizeof(*remote->pcpu_reader_locks), + GFP_KERNEL); + if (!remote->pcpu_reader_locks) { + trace_remote_try_unload(remote); + return -ENOMEM; + } + + for_each_possible_cpu(lock_cpu) + init_rwsem(&remote->pcpu_reader_locks[lock_cpu]); + } + + remote->nr_readers++; + + return 0; +} + +static void trace_remote_put(struct trace_remote *remote) +{ + if (WARN_ON(!remote->nr_readers)) + return; + + remote->nr_readers--; + if (remote->nr_readers) + return; + + kfree(remote->pcpu_reader_locks); + remote->pcpu_reader_locks = NULL; + + trace_remote_try_unload(remote); +} + +static bool trace_remote_has_cpu(struct trace_remote *remote, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) + return true; + + return ring_buffer_poll_remote(remote->trace_buffer, cpu) == 0; +} + +static void __poll_remote(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct trace_remote_iterator *iter; + + iter = container_of(dwork, struct trace_remote_iterator, poll_work); + ring_buffer_poll_remote(iter->remote->trace_buffer, iter->cpu); + schedule_delayed_work((struct delayed_work *)work, + msecs_to_jiffies(iter->remote->poll_ms)); +} + +static void __free_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu) +{ + if (cpu != RING_BUFFER_ALL_CPUS) { + ring_buffer_read_finish(iter->rb_iter); + return; + } + + for_each_possible_cpu(cpu) { + if (iter->rb_iters[cpu]) + ring_buffer_read_finish(iter->rb_iters[cpu]); + } + + kfree(iter->rb_iters); +} + +static int __alloc_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu) +{ + if (cpu != RING_BUFFER_ALL_CPUS) { + iter->rb_iter = ring_buffer_read_start(iter->remote->trace_buffer, cpu, GFP_KERNEL); + + return iter->rb_iter ? 0 : -ENOMEM; + } + + iter->rb_iters = kcalloc(nr_cpu_ids, sizeof(*iter->rb_iters), GFP_KERNEL); + if (!iter->rb_iters) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + iter->rb_iters[cpu] = ring_buffer_read_start(iter->remote->trace_buffer, cpu, + GFP_KERNEL); + if (!iter->rb_iters[cpu]) { + /* This CPU isn't part of trace_buffer. 
Skip it */ + if (!trace_remote_has_cpu(iter->remote, cpu)) + continue; + + __free_ring_buffer_iter(iter, RING_BUFFER_ALL_CPUS); + return -ENOMEM; + } + } + + return 0; +} + +static struct trace_remote_iterator +*trace_remote_iter(struct trace_remote *remote, int cpu, enum tri_type type) +{ + struct trace_remote_iterator *iter = NULL; + int ret; + + lockdep_assert_held(&remote->lock); + + if (type == TRI_NONCONSUMING && !trace_remote_loaded(remote)) + return NULL; + + ret = trace_remote_get(remote, cpu); + if (ret) + return ERR_PTR(ret); + + if (!trace_remote_has_cpu(remote, cpu)) { + ret = -ENODEV; + goto err; + } + + iter = kzalloc_obj(*iter); + if (iter) { + iter->remote = remote; + iter->cpu = cpu; + iter->type = type; + trace_seq_init(&iter->seq); + + switch (type) { + case TRI_CONSUMING: + ring_buffer_poll_remote(remote->trace_buffer, cpu); + INIT_DELAYED_WORK(&iter->poll_work, __poll_remote); + schedule_delayed_work(&iter->poll_work, msecs_to_jiffies(remote->poll_ms)); + break; + case TRI_NONCONSUMING: + ret = __alloc_ring_buffer_iter(iter, cpu); + break; + } + + if (ret) + goto err; + + return iter; + } + ret = -ENOMEM; + +err: + kfree(iter); + trace_remote_put(remote); + + return ERR_PTR(ret); +} + +static void trace_remote_iter_free(struct trace_remote_iterator *iter) +{ + struct trace_remote *remote; + + if (!iter) + return; + + remote = iter->remote; + + lockdep_assert_held(&remote->lock); + + switch (iter->type) { + case TRI_CONSUMING: + cancel_delayed_work_sync(&iter->poll_work); + break; + case TRI_NONCONSUMING: + __free_ring_buffer_iter(iter, iter->cpu); + break; + } + + kfree(iter); + trace_remote_put(remote); +} + +static void trace_remote_iter_read_start(struct trace_remote_iterator *iter) +{ + struct trace_remote *remote = iter->remote; + int cpu = iter->cpu; + + /* Acquire global reader lock */ + if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING) + down_write(&remote->reader_lock); + else + down_read(&remote->reader_lock); + + if (cpu == RING_BUFFER_ALL_CPUS) + return; + + /* + * No need for the remote lock here, iter holds a reference on + * remote->nr_readers + */ + + /* Get the per-CPU one */ + if (WARN_ON_ONCE(!remote->pcpu_reader_locks)) + return; + + if (iter->type == TRI_CONSUMING) + down_write(&remote->pcpu_reader_locks[cpu]); + else + down_read(&remote->pcpu_reader_locks[cpu]); +} + +static void trace_remote_iter_read_finished(struct trace_remote_iterator *iter) +{ + struct trace_remote *remote = iter->remote; + int cpu = iter->cpu; + + /* Release per-CPU reader lock */ + if (cpu != RING_BUFFER_ALL_CPUS) { + /* + * No need for the remote lock here, iter holds a reference on + * remote->nr_readers + */ + if (iter->type == TRI_CONSUMING) + up_write(&remote->pcpu_reader_locks[cpu]); + else + up_read(&remote->pcpu_reader_locks[cpu]); + } + + /* Release global reader lock */ + if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING) + up_write(&remote->reader_lock); + else + up_read(&remote->reader_lock); +} + +static struct ring_buffer_iter *__get_rb_iter(struct trace_remote_iterator *iter, int cpu) +{ + return iter->cpu != RING_BUFFER_ALL_CPUS ? 
iter->rb_iter : iter->rb_iters[cpu]; +} + +static struct ring_buffer_event * +__peek_event(struct trace_remote_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events) +{ + struct ring_buffer_event *rb_evt; + struct ring_buffer_iter *rb_iter; + + switch (iter->type) { + case TRI_CONSUMING: + return ring_buffer_peek(iter->remote->trace_buffer, cpu, ts, lost_events); + case TRI_NONCONSUMING: + rb_iter = __get_rb_iter(iter, cpu); + if (!rb_iter) + return NULL; + + rb_evt = ring_buffer_iter_peek(rb_iter, ts); + if (!rb_evt) + return NULL; + + *lost_events = ring_buffer_iter_dropped(rb_iter); + + return rb_evt; + } + + return NULL; +} + +static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter) +{ + struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + struct ring_buffer_event *rb_evt; + int cpu = iter->cpu; + + if (cpu != RING_BUFFER_ALL_CPUS) { + if (ring_buffer_empty_cpu(trace_buffer, cpu)) + return false; + + rb_evt = __peek_event(iter, cpu, &iter->ts, &iter->lost_events); + if (!rb_evt) + return false; + + iter->evt_cpu = cpu; + iter->evt = ring_buffer_event_data(rb_evt); + return true; + } + + iter->ts = U64_MAX; + for_each_possible_cpu(cpu) { + unsigned long lost_events; + u64 ts; + + if (ring_buffer_empty_cpu(trace_buffer, cpu)) + continue; + + rb_evt = __peek_event(iter, cpu, &ts, &lost_events); + if (!rb_evt) + continue; + + if (ts >= iter->ts) + continue; + + iter->ts = ts; + iter->evt_cpu = cpu; + iter->evt = ring_buffer_event_data(rb_evt); + iter->lost_events = lost_events; + } + + return iter->ts != U64_MAX; +} + +static void trace_remote_iter_move(struct trace_remote_iterator *iter) +{ + struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + + switch (iter->type) { + case TRI_CONSUMING: + ring_buffer_consume(trace_buffer, iter->evt_cpu, NULL, NULL); + break; + case TRI_NONCONSUMING: + ring_buffer_iter_advance(__get_rb_iter(iter, iter->evt_cpu)); + break; + } +} + +static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id); + +static int trace_remote_iter_print_event(struct trace_remote_iterator *iter) +{ + struct remote_event *evt; + unsigned long usecs_rem; + u64 ts = iter->ts; + + if (iter->lost_events) + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->evt_cpu, iter->lost_events); + + do_div(ts, 1000); + usecs_rem = do_div(ts, USEC_PER_SEC); + + trace_seq_printf(&iter->seq, "[%03d]\t%5llu.%06lu: ", iter->evt_cpu, + ts, usecs_rem); + + evt = trace_remote_find_event(iter->remote, iter->evt->id); + if (!evt) + trace_seq_printf(&iter->seq, "UNKNOWN id=%d\n", iter->evt->id); + else + evt->print(iter->evt, &iter->seq); + + return trace_seq_has_overflowed(&iter->seq) ? -EOVERFLOW : 0; +} + +static int trace_pipe_open(struct inode *inode, struct file *filp) +{ + struct trace_remote *remote = inode->i_private; + struct trace_remote_iterator *iter; + int cpu = tracing_get_cpu(inode); + + guard(mutex)(&remote->lock); + + iter = trace_remote_iter(remote, cpu, TRI_CONSUMING); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + filp->private_data = iter; + + return IS_ERR(iter) ? 
PTR_ERR(iter) : 0; +} + +static int trace_pipe_release(struct inode *inode, struct file *filp) +{ + struct trace_remote_iterator *iter = filp->private_data; + struct trace_remote *remote = iter->remote; + + guard(mutex)(&remote->lock); + + trace_remote_iter_free(iter); + + return 0; +} + +static ssize_t trace_pipe_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_remote_iterator *iter = filp->private_data; + struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + int ret; + +copy_to_user: + ret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (ret != -EBUSY) + return ret; + + trace_seq_init(&iter->seq); + + ret = ring_buffer_wait(trace_buffer, iter->cpu, 0, NULL, NULL); + if (ret < 0) + return ret; + + trace_remote_iter_read_start(iter); + + while (trace_remote_iter_read_event(iter)) { + int prev_len = iter->seq.seq.len; + + if (trace_remote_iter_print_event(iter)) { + iter->seq.seq.len = prev_len; + break; + } + + trace_remote_iter_move(iter); + } + + trace_remote_iter_read_finished(iter); + + goto copy_to_user; +} + +static const struct file_operations trace_pipe_fops = { + .open = trace_pipe_open, + .read = trace_pipe_read, + .release = trace_pipe_release, +}; + +static void *trace_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_remote_iterator *iter = m->private; + + ++*pos; + + if (!iter || !trace_remote_iter_read_event(iter)) + return NULL; + + trace_remote_iter_move(iter); + iter->pos++; + + return iter; +} + +static void *trace_start(struct seq_file *m, loff_t *pos) +{ + struct trace_remote_iterator *iter = m->private; + loff_t i; + + if (!iter) + return NULL; + + trace_remote_iter_read_start(iter); + + if (!*pos) { + iter->pos = -1; + return trace_next(m, NULL, &i); + } + + i = iter->pos; + while (i < *pos) { + iter = trace_next(m, NULL, &i); + if (!iter) + return NULL; + } + + return iter; +} + +static int trace_show(struct seq_file *m, void *v) +{ + struct trace_remote_iterator *iter = v; + + trace_seq_init(&iter->seq); + + if (trace_remote_iter_print_event(iter)) { + seq_printf(m, "[EVENT %d PRINT TOO BIG]\n", iter->evt->id); + return 0; + } + + return trace_print_seq(m, &iter->seq); +} + +static void trace_stop(struct seq_file *m, void *v) +{ + struct trace_remote_iterator *iter = m->private; + + if (iter) + trace_remote_iter_read_finished(iter); +} + +static const struct seq_operations trace_sops = { + .start = trace_start, + .next = trace_next, + .show = trace_show, + .stop = trace_stop, +}; + +static int trace_open(struct inode *inode, struct file *filp) +{ + struct trace_remote *remote = inode->i_private; + struct trace_remote_iterator *iter = NULL; + int cpu = tracing_get_cpu(inode); + int ret; + + if (!(filp->f_mode & FMODE_READ)) + return 0; + + guard(mutex)(&remote->lock); + + iter = trace_remote_iter(remote, cpu, TRI_NONCONSUMING); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + ret = seq_open(filp, &trace_sops); + if (ret) { + trace_remote_iter_free(iter); + return ret; + } + + ((struct seq_file *)filp->private_data)->private = (void *)iter; + + return 0; +} + +static int trace_release(struct inode *inode, struct file *filp) +{ + struct trace_remote_iterator *iter; + + if (!(filp->f_mode & FMODE_READ)) + return 0; + + iter = ((struct seq_file *)filp->private_data)->private; + seq_release(inode, filp); + + if (!iter) + return 0; + + guard(mutex)(&iter->remote->lock); + + trace_remote_iter_free(iter); + + return 0; +} + +static ssize_t trace_write(struct file *filp, const char __user *ubuf, size_t cnt, 
loff_t *ppos)
+{
+	struct inode *inode = file_inode(filp);
+	struct trace_remote *remote = inode->i_private;
+	int cpu = tracing_get_cpu(inode);
+
+	guard(mutex)(&remote->lock);
+
+	trace_remote_reset(remote, cpu);
+
+	return cnt;
+}
+
+static const struct file_operations trace_fops = {
+	.open		= trace_open,
+	.write		= trace_write,
+	.read		= seq_read,
+	.read_iter	= seq_read_iter,
+	.release	= trace_release,
+};
+
+static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote)
+{
+	struct dentry *remote_d, *percpu_d, *d;
+	static struct dentry *root;
+	static DEFINE_MUTEX(lock);
+	bool root_inited = false;
+	int cpu;
+
+	guard(mutex)(&lock);
+
+	if (!root) {
+		root = tracefs_create_dir(TRACEFS_DIR, NULL);
+		if (!root) {
+			pr_err("Failed to create tracefs dir "TRACEFS_DIR"\n");
+			return -ENOMEM;
+		}
+		root_inited = true;
+	}
+
+	remote_d = tracefs_create_dir(name, root);
+	if (!remote_d) {
+		pr_err("Failed to create tracefs dir "TRACEFS_DIR"/%s/\n", name);
+		goto err;
+	}
+
+	d = trace_create_file("tracing_on", TRACEFS_MODE_WRITE, remote_d, remote, &tracing_on_fops);
+	if (!d)
+		goto err;
+
+	d = trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote,
+			      &buffer_size_kb_fops);
+	if (!d)
+		goto err;
+
+	d = trace_create_file("trace_pipe", TRACEFS_MODE_READ, remote_d, remote, &trace_pipe_fops);
+	if (!d)
+		goto err;
+
+	d = trace_create_file("trace", TRACEFS_MODE_WRITE, remote_d, remote, &trace_fops);
+	if (!d)
+		goto err;
+
+	percpu_d = tracefs_create_dir("per_cpu", remote_d);
+	if (!percpu_d) {
+		pr_err("Failed to create tracefs dir "TRACEFS_DIR"/%s/per_cpu/\n", name);
+		goto err;
+	}
+
+	for_each_possible_cpu(cpu) {
+		struct dentry *cpu_d;
+		char cpu_name[16];
+
+		snprintf(cpu_name, sizeof(cpu_name), "cpu%d", cpu);
+		cpu_d = tracefs_create_dir(cpu_name, percpu_d);
+		if (!cpu_d) {
+			pr_err("Failed to create tracefs dir "TRACEFS_DIR"/%s/per_cpu/cpu%d\n",
+			       name, cpu);
+			goto err;
+		}
+
+		d = trace_create_cpu_file("trace_pipe", TRACEFS_MODE_READ, cpu_d, remote, cpu,
+					  &trace_pipe_fops);
+		if (!d)
+			goto err;
+
+		d = trace_create_cpu_file("trace", TRACEFS_MODE_WRITE, cpu_d, remote, cpu,
+					  &trace_fops);
+		if (!d)
+			goto err;
+	}
+
+	remote->dentry = remote_d;
+
+	return 0;
+
+err:
+	if (root_inited) {
+		tracefs_remove(root);
+		root = NULL;
+	} else {
+		tracefs_remove(remote_d);
+	}
+
+	return -ENOMEM;
+}
+
+static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote,
+					struct remote_event *events, size_t nr_events);
+
+/**
+ * trace_remote_register() - Register a Tracefs remote
+ * @name:	Name of the remote, used for the Tracefs remotes/ directory.
+ * @cbs:	Set of callbacks used to control the remote.
+ * @priv:	Private data, passed to each callback from @cbs.
+ * @events:	Array of events. &remote_event.name and &remote_event.id must be
+ *		filled by the caller.
+ * @nr_events:	Number of events in the @events array.
+ *
+ * A trace remote is an entity outside of the kernel (most likely firmware or
+ * a hypervisor) capable of writing events into a Tracefs-compatible
+ * ring-buffer. The kernel then acts as a reader.
+ *
+ * The registered remote will be found under the Tracefs directory
+ * remotes/<name>/.
+ *
+ * Return: 0 on success, negative error code on failure.
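+ *
+ * A minimal registration sketch (illustrative only: the my_remote_* callbacks
+ * and the my_remote_events array are assumptions, not part of this file):
+ *
+ * .. code-block:: c
+ *
+ *	static struct trace_remote_callbacks my_remote_cbs = {
+ *		.load_trace_buffer	= my_remote_load_trace_buffer,
+ *		.unload_trace_buffer	= my_remote_unload_trace_buffer,
+ *		.enable_tracing		= my_remote_enable_tracing,
+ *		.swap_reader_page	= my_remote_swap_reader_page,
+ *		.enable_event		= my_remote_enable_event,
+ *	};
+ *
+ *	ret = trace_remote_register("my_remote", &my_remote_cbs, NULL,
+ *				    my_remote_events,
+ *				    ARRAY_SIZE(my_remote_events));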
+ */ +int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv, + struct remote_event *events, size_t nr_events) +{ + struct trace_remote *remote; + int ret; + + remote = kzalloc_obj(*remote); + if (!remote) + return -ENOMEM; + + remote->cbs = cbs; + remote->priv = priv; + remote->trace_buffer_size = 7 << 10; + remote->poll_ms = 100; + mutex_init(&remote->lock); + init_rwsem(&remote->reader_lock); + + if (trace_remote_init_tracefs(name, remote)) { + kfree(remote); + return -ENOMEM; + } + + ret = trace_remote_register_events(name, remote, events, nr_events); + if (ret) { + pr_err("Failed to register events for trace remote '%s' (%d)\n", + name, ret); + return ret; + } + + ret = cbs->init ? cbs->init(remote->dentry, priv) : 0; + if (ret) + pr_err("Init failed for trace remote '%s' (%d)\n", name, ret); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_remote_register); + +/** + * trace_remote_free_buffer() - Free trace buffer allocated with trace_remote_alloc_buffer() + * @desc: Descriptor of the per-CPU ring-buffers, originally filled by + * trace_remote_alloc_buffer() + * + * Most likely called from &trace_remote_callbacks.unload_trace_buffer. + */ +void trace_remote_free_buffer(struct trace_buffer_desc *desc) +{ + struct ring_buffer_desc *rb_desc; + int cpu; + + for_each_ring_buffer_desc(rb_desc, cpu, desc) { + unsigned int id; + + free_page(rb_desc->meta_va); + + for (id = 0; id < rb_desc->nr_page_va; id++) + free_page(rb_desc->page_va[id]); + } +} +EXPORT_SYMBOL_GPL(trace_remote_free_buffer); + +/** + * trace_remote_alloc_buffer() - Dynamically allocate a trace buffer + * @desc: Uninitialized trace_buffer_desc + * @desc_size: Size of the trace_buffer_desc. Must be at least equal to + * trace_buffer_desc_size() + * @buffer_size: Size in bytes of each per-CPU ring-buffer + * @cpumask: CPUs to allocate a ring-buffer for + * + * Helper to dynamically allocate a set of pages (enough to cover @buffer_size) + * for each CPU from @cpumask and fill @desc. Most likely called from + * &trace_remote_callbacks.load_trace_buffer. + * + * Return: 0 on success, negative error code on failure. 
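+ *
+ * An illustrative &trace_remote_callbacks.load_trace_buffer sketch (assumed:
+ * the caller computed desc_size, at least trace_buffer_desc_size(), and
+ * buffer_size is the requested per-CPU size):
+ *
+ * .. code-block:: c
+ *
+ *	struct trace_buffer_desc *desc = kmalloc(desc_size, GFP_KERNEL);
+ *	int ret;
+ *
+ *	if (!desc)
+ *		return ERR_PTR(-ENOMEM);
+ *
+ *	ret = trace_remote_alloc_buffer(desc, desc_size, buffer_size,
+ *					cpu_possible_mask);
+ *	if (ret) {
+ *		kfree(desc);
+ *		return ERR_PTR(ret);
+ *	}
+ *
+ *	return desc;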
+ */ +int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size, + const struct cpumask *cpumask) +{ + unsigned int nr_pages = max(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 2UL) + 1; + void *desc_end = desc + desc_size; + struct ring_buffer_desc *rb_desc; + int cpu, ret = -ENOMEM; + + if (desc_size < struct_size(desc, __data, 0)) + return -EINVAL; + + desc->nr_cpus = 0; + desc->struct_len = struct_size(desc, __data, 0); + + rb_desc = (struct ring_buffer_desc *)&desc->__data[0]; + + for_each_cpu(cpu, cpumask) { + unsigned int id; + + if ((void *)rb_desc + struct_size(rb_desc, page_va, nr_pages) > desc_end) { + ret = -EINVAL; + goto err; + } + + rb_desc->cpu = cpu; + rb_desc->nr_page_va = 0; + rb_desc->meta_va = (unsigned long)__get_free_page(GFP_KERNEL); + if (!rb_desc->meta_va) + goto err; + + for (id = 0; id < nr_pages; id++) { + rb_desc->page_va[id] = (unsigned long)__get_free_page(GFP_KERNEL); + if (!rb_desc->page_va[id]) + goto err; + + rb_desc->nr_page_va++; + } + desc->nr_cpus++; + desc->struct_len += offsetof(struct ring_buffer_desc, page_va); + desc->struct_len += struct_size(rb_desc, page_va, rb_desc->nr_page_va); + rb_desc = __next_ring_buffer_desc(rb_desc); + } + + return 0; + +err: + trace_remote_free_buffer(desc); + return ret; +} +EXPORT_SYMBOL_GPL(trace_remote_alloc_buffer); + +static int +trace_remote_enable_event(struct trace_remote *remote, struct remote_event *evt, bool enable) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (evt->enabled == enable) + return 0; + + ret = remote->cbs->enable_event(evt->id, enable, remote->priv); + if (ret) + return ret; + + evt->enabled = enable; + + return 0; +} + +static int remote_event_enable_show(struct seq_file *s, void *unused) +{ + struct remote_event *evt = s->private; + + seq_printf(s, "%d\n", evt->enabled); + + return 0; +} + +static ssize_t remote_event_enable_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct seq_file *seq = filp->private_data; + struct remote_event *evt = seq->private; + struct trace_remote *remote = evt->remote; + u8 enable; + int ret; + + ret = kstrtou8_from_user(ubuf, count, 10, &enable); + if (ret) + return ret; + + guard(mutex)(&remote->lock); + + ret = trace_remote_enable_event(remote, evt, enable); + if (ret) + return ret; + + return count; +} +DEFINE_SHOW_STORE_ATTRIBUTE(remote_event_enable); + +static int remote_event_id_show(struct seq_file *s, void *unused) +{ + struct remote_event *evt = s->private; + + seq_printf(s, "%d\n", evt->id); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(remote_event_id); + +static int remote_event_format_show(struct seq_file *s, void *unused) +{ + size_t offset = sizeof(struct remote_event_hdr); + struct remote_event *evt = s->private; + struct trace_event_fields *field; + + seq_printf(s, "name: %s\n", evt->name); + seq_printf(s, "ID: %d\n", evt->id); + seq_puts(s, + "format:\n\tfield:unsigned short common_type;\toffset:0;\tsize:2;\tsigned:0;\n\n"); + + field = &evt->fields[0]; + while (field->name) { + seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%u;\tsigned:%d;\n", + field->type, field->name, offset, field->size, + field->is_signed); + offset += field->size; + field++; + } + + if (field != &evt->fields[0]) + seq_puts(s, "\n"); + + seq_printf(s, "print fmt: %s\n", evt->print_fmt); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(remote_event_format); + +static int remote_event_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + if 
(!strcmp(name, "enable")) { + *mode = TRACEFS_MODE_WRITE; + *fops = &remote_event_enable_fops; + return 1; + } + + if (!strcmp(name, "id")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_event_id_fops; + return 1; + } + + if (!strcmp(name, "format")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_event_format_fops; + return 1; + } + + return 0; +} + +static ssize_t remote_events_dir_enable_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct trace_remote *remote = file_inode(filp)->i_private; + int i, ret; + u8 enable; + + ret = kstrtou8_from_user(ubuf, count, 10, &enable); + if (ret) + return ret; + + guard(mutex)(&remote->lock); + + for (i = 0; i < remote->nr_events; i++) { + struct remote_event *evt = &remote->events[i]; + + trace_remote_enable_event(remote, evt, enable); + } + + return count; +} + +static ssize_t remote_events_dir_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_remote *remote = file_inode(filp)->i_private; + const char enabled_char[] = {'0', '1', 'X'}; + char enabled_str[] = " \n"; + int i, enabled = -1; + + guard(mutex)(&remote->lock); + + for (i = 0; i < remote->nr_events; i++) { + struct remote_event *evt = &remote->events[i]; + + if (enabled == -1) { + enabled = evt->enabled; + } else if (enabled != evt->enabled) { + enabled = 2; + break; + } + } + + enabled_str[0] = enabled_char[enabled == -1 ? 0 : enabled]; + + return simple_read_from_buffer(ubuf, cnt, ppos, enabled_str, 2); +} + +static const struct file_operations remote_events_dir_enable_fops = { + .write = remote_events_dir_enable_write, + .read = remote_events_dir_enable_read, +}; + +static ssize_t +remote_events_dir_header_page_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_seq *s; + int ret; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + ring_buffer_print_page_header(NULL, s); + ret = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); + kfree(s); + + return ret; +} + +static const struct file_operations remote_events_dir_header_page_fops = { + .read = remote_events_dir_header_page_read, +}; + +static ssize_t +remote_events_dir_header_event_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_seq *s; + int ret; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + ring_buffer_print_entry_header(s); + ret = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); + kfree(s); + + return ret; +} + +static const struct file_operations remote_events_dir_header_event_fops = { + .read = remote_events_dir_header_event_read, +}; + +static int remote_events_dir_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + if (!strcmp(name, "enable")) { + *mode = TRACEFS_MODE_WRITE; + *fops = &remote_events_dir_enable_fops; + return 1; + } + + if (!strcmp(name, "header_page")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_events_dir_header_page_fops; + return 1; + } + + if (!strcmp(name, "header_event")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_events_dir_header_event_fops; + return 1; + } + + return 0; +} + +static int trace_remote_init_eventfs(const char *remote_name, struct trace_remote *remote, + struct remote_event *evt) +{ + struct eventfs_inode *eventfs = remote->eventfs; + static struct eventfs_entry dir_entries[] = { + { + .name = "enable", + .callback = 
remote_events_dir_callback, + }, { + .name = "header_page", + .callback = remote_events_dir_callback, + }, { + .name = "header_event", + .callback = remote_events_dir_callback, + } + }; + static struct eventfs_entry entries[] = { + { + .name = "enable", + .callback = remote_event_callback, + }, { + .name = "id", + .callback = remote_event_callback, + }, { + .name = "format", + .callback = remote_event_callback, + } + }; + bool eventfs_create = false; + + if (!eventfs) { + eventfs = eventfs_create_events_dir("events", remote->dentry, dir_entries, + ARRAY_SIZE(dir_entries), remote); + if (IS_ERR(eventfs)) + return PTR_ERR(eventfs); + + /* + * Create similar hierarchy as local events even if a single system is supported at + * the moment + */ + eventfs = eventfs_create_dir(remote_name, eventfs, NULL, 0, NULL); + if (IS_ERR(eventfs)) + return PTR_ERR(eventfs); + + remote->eventfs = eventfs; + eventfs_create = true; + } + + eventfs = eventfs_create_dir(evt->name, eventfs, entries, ARRAY_SIZE(entries), evt); + if (IS_ERR(eventfs)) { + if (eventfs_create) { + eventfs_remove_events_dir(remote->eventfs); + remote->eventfs = NULL; + } + return PTR_ERR(eventfs); + } + + return 0; +} + +static int trace_remote_attach_events(struct trace_remote *remote, struct remote_event *events, + size_t nr_events) +{ + int i; + + for (i = 0; i < nr_events; i++) { + struct remote_event *evt = &events[i]; + + if (evt->remote) + return -EEXIST; + + evt->remote = remote; + + /* We need events to be sorted for efficient lookup */ + if (i && evt->id <= events[i - 1].id) + return -EINVAL; + } + + remote->events = events; + remote->nr_events = nr_events; + + return 0; +} + +static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote, + struct remote_event *events, size_t nr_events) +{ + int i, ret; + + ret = trace_remote_attach_events(remote, events, nr_events); + if (ret) + return ret; + + for (i = 0; i < nr_events; i++) { + struct remote_event *evt = &events[i]; + + ret = trace_remote_init_eventfs(remote_name, remote, evt); + if (ret) + pr_warn("Failed to init eventfs for event '%s' (%d)", + evt->name, ret); + } + + return 0; +} + +static int __cmp_events(const void *key, const void *data) +{ + const struct remote_event *evt = data; + int id = (int)((long)key); + + return id - (int)evt->id; +} + +static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id) +{ + return bsearch((const void *)(unsigned long)id, remote->events, remote->nr_events, + sizeof(*remote->events), __cmp_events); +} diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index a792a599b9d6..1c13bfa2d38a 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -428,6 +428,7 @@ enum { #define KVM_DEV_ARM_ITS_RESTORE_TABLES 2 #define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES 3 #define KVM_DEV_ARM_ITS_CTRL_RESET 4 +#define KVM_DEV_ARM_VGIC_USERSPACE_PPIS 5 /* Device Control API on vcpu fd */ #define KVM_ARM_VCPU_PMU_V3_CTRL 0 diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 80364d4dbebb..d0c0c8605976 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1224,6 +1224,8 @@ enum kvm_device_type { #define KVM_DEV_TYPE_LOONGARCH_EIOINTC KVM_DEV_TYPE_LOONGARCH_EIOINTC KVM_DEV_TYPE_LOONGARCH_PCHPIC, #define KVM_DEV_TYPE_LOONGARCH_PCHPIC KVM_DEV_TYPE_LOONGARCH_PCHPIC + KVM_DEV_TYPE_ARM_VGIC_V5, +#define KVM_DEV_TYPE_ARM_VGIC_V5 
KVM_DEV_TYPE_ARM_VGIC_V5 KVM_DEV_TYPE_MAX, diff --git a/tools/testing/selftests/ftrace/test.d/remotes/buffer_size.tc b/tools/testing/selftests/ftrace/test.d/remotes/buffer_size.tc new file mode 100644 index 000000000000..1a43280ffa97 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/remotes/buffer_size.tc @@ -0,0 +1,25 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Test trace remote buffer size +# requires: remotes/test + +. $TEST_DIR/remotes/functions + +test_buffer_size() +{ + echo 0 > tracing_on + assert_unloaded + + echo 4096 > buffer_size_kb + echo 1 > tracing_on + assert_loaded + + echo 0 > tracing_on + echo 7 > buffer_size_kb +} + +if [ -z "$SOURCE_REMOTE_TEST" ]; then + set -e + setup_remote_test + test_buffer_size +fi diff --git a/tools/testing/selftests/ftrace/test.d/remotes/functions b/tools/testing/selftests/ftrace/test.d/remotes/functions new file mode 100644 index 000000000000..05224fac3653 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/remotes/functions @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: GPL-2.0 + +setup_remote() +{ + local name=$1 + + [ -e $TRACING_DIR/remotes/$name/write_event ] || exit_unresolved + + cd remotes/$name/ + echo 0 > tracing_on + clear_trace + echo 7 > buffer_size_kb + echo 0 > events/enable + echo 1 > events/$name/selftest/enable + echo 1 > tracing_on +} + +setup_remote_test() +{ + [ -d $TRACING_DIR/remotes/test/ ] || modprobe remote_test || exit_unresolved + + setup_remote "test" +} + +assert_loaded() +{ + grep -q "(loaded)" buffer_size_kb || return 1 +} + +assert_unloaded() +{ + grep -q "(unloaded)" buffer_size_kb || return 1 +} + +reload_remote() +{ + echo 0 > tracing_on + clear_trace + assert_unloaded + echo 1 > tracing_on + assert_loaded +} + +dump_trace_pipe() +{ + output=$(mktemp $TMPDIR/remote_test.XXXXXX) + cat trace_pipe > $output & + pid=$! + sleep 1 + kill -1 $pid + + echo $output +} + +check_trace() +{ + start_id="$1" + end_id="$2" + file="$3" + + # Ensure the file is not empty + test -n "$(head $file)" + + prev_ts=0 + id=0 + + # Only keep + tmp=$(mktemp $TMPDIR/remote_test.XXXXXX) + sed -e 's/\[[0-9]*\]\s*\([0-9]*.[0-9]*\): [a-z]* id=\([0-9]*\)/\1 \2/' $file > $tmp + + while IFS= read -r line; do + ts=$(echo $line | cut -d ' ' -f 1) + id=$(echo $line | cut -d ' ' -f 2) + + test $(echo "$ts>$prev_ts" | bc) -eq 1 + test $id -eq $start_id + + prev_ts=$ts + start_id=$((start_id + 1)) + done < $tmp + + test $id -eq $end_id + rm $tmp +} + +get_cpu_ids() +{ + sed -n 's/^processor\s*:\s*\([0-9]\+\).*/\1/p' /proc/cpuinfo +} + +get_page_size() +{ + sed -ne 's/^.*data.*size:\([0-9][0-9]*\).*/\1/p' events/header_page +} + +get_selftest_event_size() +{ + sed -ne 's/^.*field:.*;.*size:\([0-9][0-9]*\);.*/\1/p' events/*/selftest/format | awk '{s+=$1} END {print s}' +} diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hotplug.tc b/tools/testing/selftests/ftrace/test.d/remotes/hotplug.tc new file mode 100644 index 000000000000..145617eb8061 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/remotes/hotplug.tc @@ -0,0 +1,88 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Test trace remote read with an offline CPU +# requires: remotes/test + +. 
$TEST_DIR/remotes/functions + +hotunplug_one_cpu() +{ + [ "$(get_cpu_ids | wc -l)" -ge 2 ] || return 1 + + for cpu in $(get_cpu_ids); do + echo 0 > /sys/devices/system/cpu/cpu$cpu/online || return 1 + break + done + + echo $cpu +} + +# Check non-consuming and consuming read +check_read() +{ + for i in $(seq 1 8); do + echo $i > write_event + done + + check_trace 1 8 trace + + output=$(dump_trace_pipe) + check_trace 1 8 $output + rm $output +} + +test_hotplug() +{ + echo 0 > trace + assert_loaded + + # + # Test a trace buffer containing an offline CPU + # + + cpu=$(hotunplug_one_cpu) || exit_unsupported + trap "echo 1 > /sys/devices/system/cpu/cpu$cpu/online" EXIT + + check_read + + # + # Test a trace buffer with a missing CPU + # + + reload_remote + + check_read + + # + # Test a trace buffer with a CPU added later + # + + echo 1 > /sys/devices/system/cpu/cpu$cpu/online + trap "" EXIT + assert_loaded + + check_read + + # Test if the ring-buffer for the newly added CPU is both writable and + # readable + for i in $(seq 1 8); do + taskset -c $cpu echo $i > write_event + done + + cd per_cpu/cpu$cpu/ + + check_trace 1 8 trace + + output=$(dump_trace_pipe) + check_trace 1 8 $output + rm $output + + cd - +} + +if [ -z "$SOURCE_REMOTE_TEST" ]; then + set -e + + setup_remote_test + test_hotplug +fi diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/buffer_size.tc b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/buffer_size.tc new file mode 100644 index 000000000000..64bf859d6406 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/buffer_size.tc @@ -0,0 +1,11 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Test hypervisor trace buffer size +# requires: remotes/hypervisor/write_event + +SOURCE_REMOTE_TEST=1 +. $TEST_DIR/remotes/buffer_size.tc + +set -e +setup_remote "hypervisor" +test_buffer_size diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/hotplug.tc b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/hotplug.tc new file mode 100644 index 000000000000..580ec32c8f81 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/hotplug.tc @@ -0,0 +1,11 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Test hypervisor trace read with an offline CPU +# requires: remotes/hypervisor/write_event + +SOURCE_REMOTE_TEST=1 +. $TEST_DIR/remotes/hotplug.tc + +set -e +setup_remote "hypervisor" +test_hotplug diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/reset.tc b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/reset.tc new file mode 100644 index 000000000000..7fe3b09b34e3 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/reset.tc @@ -0,0 +1,11 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Test hypervisor trace buffer reset +# requires: remotes/hypervisor/write_event + +SOURCE_REMOTE_TEST=1 +. $TEST_DIR/remotes/reset.tc + +set -e +setup_remote "hypervisor" +test_reset diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace.tc b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace.tc new file mode 100644 index 000000000000..b937c19ca7f9 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace.tc @@ -0,0 +1,11 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Test hypervisor non-consuming trace read +# requires: remotes/hypervisor/write_event + +SOURCE_REMOTE_TEST=1 +. 
$TEST_DIR/remotes/trace.tc + +set -e +setup_remote "hypervisor" +test_trace diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace_pipe.tc b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace_pipe.tc new file mode 100644 index 000000000000..66aa1b76c147 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace_pipe.tc @@ -0,0 +1,11 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Test hypervisor consuming trace read +# requires: remotes/hypervisor/write_event + +SOURCE_REMOTE_TEST=1 +. $TEST_DIR/remotes/trace_pipe.tc + +set -e +setup_remote "hypervisor" +test_trace_pipe diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/unloading.tc b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/unloading.tc new file mode 100644 index 000000000000..1dafde3414ab --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/unloading.tc @@ -0,0 +1,11 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Test hypervisor trace buffer unloading +# requires: remotes/hypervisor/write_event + +SOURCE_REMOTE_TEST=1 +. $TEST_DIR/remotes/unloading.tc + +set -e +setup_remote "hypervisor" +test_unloading diff --git a/tools/testing/selftests/ftrace/test.d/remotes/reset.tc b/tools/testing/selftests/ftrace/test.d/remotes/reset.tc new file mode 100644 index 000000000000..4d176349b2bc --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/remotes/reset.tc @@ -0,0 +1,90 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Test trace remote reset +# requires: remotes/test + +. $TEST_DIR/remotes/functions + +check_reset() +{ + write_event_path="write_event" + taskset="" + + clear_trace + + # Is the buffer empty? + output=$(dump_trace_pipe) + test $(wc -l $output | cut -d ' ' -f1) -eq 0 + + if $(echo $(pwd) | grep -q "per_cpu/cpu"); then + write_event_path="../../write_event" + cpu_id=$(echo $(pwd) | sed -e 's/.*per_cpu\/cpu//') + taskset="taskset -c $cpu_id" + fi + rm $output + + # Can we properly write a new event? 
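+	# (The test remote's selftest event records the value written to
+	# write_event as its "id" field, hence the id=7890 match below.)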
+ $taskset echo 7890 > $write_event_path + output=$(dump_trace_pipe) + test $(wc -l $output | cut -d ' ' -f1) -eq 1 + grep -q "id=7890" $output + rm $output +} + +test_global_interface() +{ + output=$(mktemp $TMPDIR/remote_test.XXXXXX) + + # Confidence check + echo 123456 > write_event + output=$(dump_trace_pipe) + grep -q "id=123456" $output + rm $output + + # Reset single event + echo 1 > write_event + check_reset + + # Reset lost events + for i in $(seq 1 10000); do + echo 1 > write_event + done + check_reset +} + +test_percpu_interface() +{ + [ "$(get_cpu_ids | wc -l)" -ge 2 ] || return 0 + + for cpu in $(get_cpu_ids); do + taskset -c $cpu echo 1 > write_event + done + + check_non_empty=0 + for cpu in $(get_cpu_ids); do + cd per_cpu/cpu$cpu/ + + if [ $check_non_empty -eq 0 ]; then + check_reset + check_non_empty=1 + else + # Check we have only reset 1 CPU + output=$(dump_trace_pipe) + test $(wc -l $output | cut -d ' ' -f1) -eq 1 + rm $output + fi + cd - + done +} + +test_reset() +{ + test_global_interface + test_percpu_interface +} + +if [ -z "$SOURCE_REMOTE_TEST" ]; then + set -e + setup_remote_test + test_reset +fi diff --git a/tools/testing/selftests/ftrace/test.d/remotes/trace.tc b/tools/testing/selftests/ftrace/test.d/remotes/trace.tc new file mode 100644 index 000000000000..bc9377a70e8d --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/remotes/trace.tc @@ -0,0 +1,102 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Test trace remote non-consuming read +# requires: remotes/test + +. $TEST_DIR/remotes/functions + +test_trace() +{ + echo 0 > tracing_on + assert_unloaded + + echo 7 > buffer_size_kb + echo 1 > tracing_on + assert_loaded + + # Simple test: Emit few events and try to read them + for i in $(seq 1 8); do + echo $i > write_event + done + + check_trace 1 8 trace + + # + # Test interaction with consuming read + # + + cat trace_pipe > /dev/null & + pid=$! 
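+	# The consuming reader drains the eight events above, so once it is
+	# killed the non-consuming "trace" file is expected to be empty.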
+ + sleep 1 + kill $pid + + test $(wc -l < trace) -eq 0 + + for i in $(seq 16 32); do + echo $i > write_event + done + + check_trace 16 32 trace + + # + # Test interaction with reset + # + + echo 0 > trace + + test $(wc -l < trace) -eq 0 + + for i in $(seq 1 8); do + echo $i > write_event + done + + check_trace 1 8 trace + + # + # Test interaction with lost events + # + + # Ensure the writer is not on the reader page by reloading the buffer + reload_remote + + # Ensure ring-buffer overflow by emitting events from the same CPU + for cpu in $(get_cpu_ids); do + break + done + + events_per_page=$(($(get_page_size) / $(get_selftest_event_size))) # Approx: does not take TS into account + nr_events=$(($events_per_page * 2)) + for i in $(seq 1 $nr_events); do + taskset -c $cpu echo $i > write_event + done + + id=$(sed -n -e '1s/\[[0-9]*\]\s*[0-9]*.[0-9]*: [a-z]* id=\([0-9]*\)/\1/p' trace) + test $id -ne 1 + + check_trace $id $nr_events trace + + # + # Test per-CPU interface + # + echo 0 > trace + + for cpu in $(get_cpu_ids) ; do + taskset -c $cpu echo $cpu > write_event + done + + for cpu in $(get_cpu_ids); do + cd per_cpu/cpu$cpu/ + + check_trace $cpu $cpu trace + + cd - > /dev/null + done +} + +if [ -z "$SOURCE_REMOTE_TEST" ]; then + set -e + + setup_remote_test + test_trace +fi diff --git a/tools/testing/selftests/ftrace/test.d/remotes/trace_pipe.tc b/tools/testing/selftests/ftrace/test.d/remotes/trace_pipe.tc new file mode 100644 index 000000000000..7f7b7b79c490 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/remotes/trace_pipe.tc @@ -0,0 +1,102 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Test trace remote consuming read +# requires: remotes/test + +. $TEST_DIR/remotes/functions + +test_trace_pipe() +{ + echo 0 > tracing_on + assert_unloaded + + # Emit events from the same CPU + for cpu in $(get_cpu_ids); do + break + done + + # + # Simple test: Emit enough events to fill few pages + # + + echo 1024 > buffer_size_kb + echo 1 > tracing_on + assert_loaded + + events_per_page=$(($(get_page_size) / $(get_selftest_event_size))) + nr_events=$(($events_per_page * 4)) + + output=$(mktemp $TMPDIR/remote_test.XXXXXX) + + cat trace_pipe > $output & + pid=$! 
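+	# Write while the consuming reader drains the buffer: all $nr_events
+	# events are expected to end up in $output.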
+ + for i in $(seq 1 $nr_events); do + taskset -c $cpu echo $i > write_event + done + + echo 0 > tracing_on + sleep 1 + kill $pid + + check_trace 1 $nr_events $output + + rm $output + + # + # Test interaction with lost events + # + + assert_unloaded + echo 7 > buffer_size_kb + echo 1 > tracing_on + assert_loaded + + nr_events=$((events_per_page * 2)) + for i in $(seq 1 $nr_events); do + taskset -c $cpu echo $i > write_event + done + + output=$(dump_trace_pipe) + + lost_events=$(sed -n -e '1s/CPU:.*\[LOST \([0-9]*\) EVENTS\]/\1/p' $output) + test -n "$lost_events" + + id=$(sed -n -e '2s/\[[0-9]*\]\s*[0-9]*.[0-9]*: [a-z]* id=\([0-9]*\)/\1/p' $output) + test "$id" -eq $(($lost_events + 1)) + + # Drop [LOST EVENTS] line + sed -i '1d' $output + + check_trace $id $nr_events $output + + rm $output + + # + # Test per-CPU interface + # + + echo 0 > trace + echo 1 > tracing_on + + for cpu in $(get_cpu_ids); do + taskset -c $cpu echo $cpu > write_event + done + + for cpu in $(get_cpu_ids); do + cd per_cpu/cpu$cpu/ + output=$(dump_trace_pipe) + + check_trace $cpu $cpu $output + + rm $output + cd - > /dev/null + done +} + +if [ -z "$SOURCE_REMOTE_TEST" ]; then + set -e + + setup_remote_test + test_trace_pipe +fi diff --git a/tools/testing/selftests/ftrace/test.d/remotes/unloading.tc b/tools/testing/selftests/ftrace/test.d/remotes/unloading.tc new file mode 100644 index 000000000000..cac2190183f6 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/remotes/unloading.tc @@ -0,0 +1,41 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Test trace remote unloading +# requires: remotes/test + +. $TEST_DIR/remotes/functions + +test_unloading() +{ + # No reader, writing + assert_loaded + + # No reader, no writing + echo 0 > tracing_on + assert_unloaded + + # 1 reader, no writing + cat trace_pipe & + pid=$! 
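+	# An active consuming reader is expected to keep the buffer loaded.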
+ sleep 1 + assert_loaded + kill $pid + assert_unloaded + + # No reader, no writing, events + echo 1 > tracing_on + echo 1 > write_event + echo 0 > tracing_on + assert_loaded + + # Test reset + clear_trace + assert_unloaded +} + +if [ -z "$SOURCE_REMOTE_TEST" ]; then + set -e + + setup_remote_test + test_unloading +fi diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 502c99258bd1..8ad649a8e936 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -177,8 +177,9 @@ TEST_GEN_PROGS_arm64 += arm64/vcpu_width_config TEST_GEN_PROGS_arm64 += arm64/vgic_init TEST_GEN_PROGS_arm64 += arm64/vgic_irq TEST_GEN_PROGS_arm64 += arm64/vgic_lpi_stress +TEST_GEN_PROGS_arm64 += arm64/vgic_v5 TEST_GEN_PROGS_arm64 += arm64/vpmu_counter_access -TEST_GEN_PROGS_arm64 += arm64/no-vgic-v3 +TEST_GEN_PROGS_arm64 += arm64/no-vgic TEST_GEN_PROGS_arm64 += arm64/idreg-idst TEST_GEN_PROGS_arm64 += arm64/kvm-uuid TEST_GEN_PROGS_arm64 += access_tracking_perf_test diff --git a/tools/testing/selftests/kvm/arm64/at.c b/tools/testing/selftests/kvm/arm64/at.c index c8ee6f520734..ce5d312ef6ba 100644 --- a/tools/testing/selftests/kvm/arm64/at.c +++ b/tools/testing/selftests/kvm/arm64/at.c @@ -13,7 +13,6 @@ enum { CLEAR_ACCESS_FLAG, - TEST_ACCESS_FLAG, }; static u64 *ptep_hva; @@ -49,7 +48,6 @@ do { \ GUEST_ASSERT_EQ(FIELD_GET(SYS_PAR_EL1_ATTR, par), MAIR_ATTR_NORMAL); \ GUEST_ASSERT_EQ(FIELD_GET(SYS_PAR_EL1_SH, par), PTE_SHARED >> 8); \ GUEST_ASSERT_EQ(par & SYS_PAR_EL1_PA, TEST_ADDR); \ - GUEST_SYNC(TEST_ACCESS_FLAG); \ } \ } while (0) @@ -85,10 +83,6 @@ static void guest_code(void) if (!SYS_FIELD_GET(ID_AA64MMFR1_EL1, HAFDBS, read_sysreg(id_aa64mmfr1_el1))) GUEST_DONE(); - /* - * KVM's software PTW makes the implementation choice that the AT - * instruction sets the access flag. - */ sysreg_clear_set(tcr_el1, 0, TCR_HA); isb(); test_at(false); @@ -102,8 +96,8 @@ static void handle_sync(struct kvm_vcpu *vcpu, struct ucall *uc) case CLEAR_ACCESS_FLAG: /* * Delete + reinstall the memslot to invalidate stage-2 - * mappings of the stage-1 page tables, forcing KVM to - * use the 'slow' AT emulation path. + * mappings of the stage-1 page tables, allowing KVM to + * potentially use the 'slow' AT emulation path. * * This and clearing the access flag from host userspace * ensures that the access flag cannot be set speculatively @@ -112,10 +106,6 @@ static void handle_sync(struct kvm_vcpu *vcpu, struct ucall *uc) clear_bit(__ffs(PTE_AF), ptep_hva); vm_mem_region_reload(vcpu->vm, vcpu->vm->memslots[MEM_REGION_PT]); break; - case TEST_ACCESS_FLAG: - TEST_ASSERT(test_bit(__ffs(PTE_AF), ptep_hva), - "Expected access flag to be set (desc: %lu)", *ptep_hva); - break; default: TEST_FAIL("Unexpected SYNC arg: %lu", uc->args[1]); } diff --git a/tools/testing/selftests/kvm/arm64/no-vgic-v3.c b/tools/testing/selftests/kvm/arm64/no-vgic-v3.c deleted file mode 100644 index 152c34776981..000000000000 --- a/tools/testing/selftests/kvm/arm64/no-vgic-v3.c +++ /dev/null @@ -1,177 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -// Check that, on a GICv3 system, not configuring GICv3 correctly -// results in all of the sysregs generating an UNDEF exception. 
-
-#include <test_util.h>
-#include <kvm_util.h>
-#include <processor.h>
-
-static volatile bool handled;
-
-#define __check_sr_read(r)					\
-	({							\
-		uint64_t val;					\
-								\
-		handled = false;				\
-		dsb(sy);					\
-		val = read_sysreg_s(SYS_ ## r);			\
-		val;						\
-	})
-
-#define __check_sr_write(r)					\
-	do {							\
-		handled = false;				\
-		dsb(sy);					\
-		write_sysreg_s(0, SYS_ ## r);			\
-		isb();						\
-	} while(0)
-
-/* Fatal checks */
-#define check_sr_read(r)					\
-	do {							\
-		__check_sr_read(r);				\
-		__GUEST_ASSERT(handled, #r " no read trap");	\
-	} while(0)
-
-#define check_sr_write(r)					\
-	do {							\
-		__check_sr_write(r);				\
-		__GUEST_ASSERT(handled, #r " no write trap");	\
-	} while(0)
-
-#define check_sr_rw(r)						\
-	do {							\
-		check_sr_read(r);				\
-		check_sr_write(r);				\
-	} while(0)
-
-static void guest_code(void)
-{
-	uint64_t val;
-
-	/*
-	 * Check that we advertise that ID_AA64PFR0_EL1.GIC == 0, having
-	 * hidden the feature at runtime without any other userspace action.
-	 */
-	__GUEST_ASSERT(FIELD_GET(ID_AA64PFR0_EL1_GIC,
-				 read_sysreg(id_aa64pfr0_el1)) == 0,
-		       "GICv3 wrongly advertised");
-
-	/*
-	 * Access all GICv3 registers, and fail if we don't get an UNDEF.
-	 * Note that we happily access all the APxRn registers without
-	 * checking their existance, as all we want to see is a failure.
-	 */
-	check_sr_rw(ICC_PMR_EL1);
-	check_sr_read(ICC_IAR0_EL1);
-	check_sr_write(ICC_EOIR0_EL1);
-	check_sr_rw(ICC_HPPIR0_EL1);
-	check_sr_rw(ICC_BPR0_EL1);
-	check_sr_rw(ICC_AP0R0_EL1);
-	check_sr_rw(ICC_AP0R1_EL1);
-	check_sr_rw(ICC_AP0R2_EL1);
-	check_sr_rw(ICC_AP0R3_EL1);
-	check_sr_rw(ICC_AP1R0_EL1);
-	check_sr_rw(ICC_AP1R1_EL1);
-	check_sr_rw(ICC_AP1R2_EL1);
-	check_sr_rw(ICC_AP1R3_EL1);
-	check_sr_write(ICC_DIR_EL1);
-	check_sr_read(ICC_RPR_EL1);
-	check_sr_write(ICC_SGI1R_EL1);
-	check_sr_write(ICC_ASGI1R_EL1);
-	check_sr_write(ICC_SGI0R_EL1);
-	check_sr_read(ICC_IAR1_EL1);
-	check_sr_write(ICC_EOIR1_EL1);
-	check_sr_rw(ICC_HPPIR1_EL1);
-	check_sr_rw(ICC_BPR1_EL1);
-	check_sr_rw(ICC_CTLR_EL1);
-	check_sr_rw(ICC_IGRPEN0_EL1);
-	check_sr_rw(ICC_IGRPEN1_EL1);
-
-	/*
-	 * ICC_SRE_EL1 may not be trappable, as ICC_SRE_EL2.Enable can
-	 * be RAO/WI. Engage in non-fatal accesses, starting with a
-	 * write of 0 to try and disable SRE, and let's see if it
-	 * sticks.
-	 */
-	__check_sr_write(ICC_SRE_EL1);
-	if (!handled)
-		GUEST_PRINTF("ICC_SRE_EL1 write not trapping (OK)\n");
-
-	val = __check_sr_read(ICC_SRE_EL1);
-	if (!handled) {
-		__GUEST_ASSERT((val & BIT(0)),
-			       "ICC_SRE_EL1 not trapped but ICC_SRE_EL1.SRE not set\n");
-		GUEST_PRINTF("ICC_SRE_EL1 read not trapping (OK)\n");
-	}
-
-	GUEST_DONE();
-}
-
-static void guest_undef_handler(struct ex_regs *regs)
-{
-	/* Success, we've gracefully exploded! */
-	handled = true;
-	regs->pc += 4;
-}
-
-static void test_run_vcpu(struct kvm_vcpu *vcpu)
-{
-	struct ucall uc;
-
-	do {
-		vcpu_run(vcpu);
-
-		switch (get_ucall(vcpu, &uc)) {
-		case UCALL_ABORT:
-			REPORT_GUEST_ASSERT(uc);
-			break;
-		case UCALL_PRINTF:
-			printf("%s", uc.buffer);
-			break;
-		case UCALL_DONE:
-			break;
-		default:
-			TEST_FAIL("Unknown ucall %lu", uc.cmd);
-		}
-	} while (uc.cmd != UCALL_DONE);
-}
-
-static void test_guest_no_gicv3(void)
-{
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm;
-
-	/* Create a VM without a GICv3 */
-	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
-
-	vm_init_descriptor_tables(vm);
-	vcpu_init_descriptor_tables(vcpu);
-
-	vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
-				ESR_ELx_EC_UNKNOWN, guest_undef_handler);
-
-	test_run_vcpu(vcpu);
-
-	kvm_vm_free(vm);
-}
-
-int main(int argc, char *argv[])
-{
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm;
-	uint64_t pfr0;
-
-	test_disable_default_vgic();
-
-	vm = vm_create_with_one_vcpu(&vcpu, NULL);
-	pfr0 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1));
-	__TEST_REQUIRE(FIELD_GET(ID_AA64PFR0_EL1_GIC, pfr0),
-		       "GICv3 not supported.");
-	kvm_vm_free(vm);
-
-	test_guest_no_gicv3();
-
-	return 0;
-}
diff --git a/tools/testing/selftests/kvm/arm64/no-vgic.c b/tools/testing/selftests/kvm/arm64/no-vgic.c
new file mode 100644
index 000000000000..b14686ef17d1
--- /dev/null
+++ b/tools/testing/selftests/kvm/arm64/no-vgic.c
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// Check that, on a GICv3-capable system (GICv3 native, or GICv5 with
+// FEAT_GCIE_LEGACY), not configuring GICv3 correctly results in all
+// of the sysregs generating an UNDEF exception. Do the same for GICv5
+// on a GICv5 host.
+
+#include <test_util.h>
+#include <kvm_util.h>
+#include <processor.h>
+
+#include <gic_v5.h>
+
+static volatile bool handled;
+
+#define __check_sr_read(r)					\
+	({							\
+		uint64_t val;					\
+								\
+		handled = false;				\
+		dsb(sy);					\
+		val = read_sysreg_s(SYS_ ## r);			\
+		val;						\
+	})
+
+#define __check_sr_write(r)					\
+	do {							\
+		handled = false;				\
+		dsb(sy);					\
+		write_sysreg_s(0, SYS_ ## r);			\
+		isb();						\
+	} while (0)
+
+#define __check_gicv5_gicr_op(r)				\
+	({							\
+		uint64_t val;					\
+								\
+		handled = false;				\
+		dsb(sy);					\
+		val = read_sysreg_s(GICV5_OP_GICR_ ## r);	\
+		val;						\
+	})
+
+#define __check_gicv5_gic_op(r)					\
+	do {							\
+		handled = false;				\
+		dsb(sy);					\
+		write_sysreg_s(0, GICV5_OP_GIC_ ## r);		\
+		isb();						\
+	} while (0)
+
+/* Fatal checks */
+#define check_sr_read(r)					\
+	do {							\
+		__check_sr_read(r);				\
+		__GUEST_ASSERT(handled, #r " no read trap");	\
+	} while (0)
+
+#define check_sr_write(r)					\
+	do {							\
+		__check_sr_write(r);				\
+		__GUEST_ASSERT(handled, #r " no write trap");	\
+	} while (0)
+
+#define check_sr_rw(r)						\
+	do {							\
+		check_sr_read(r);				\
+		check_sr_write(r);				\
+	} while (0)
+
+#define check_gicv5_gicr_op(r)					\
+	do {							\
+		__check_gicv5_gicr_op(r);			\
+		__GUEST_ASSERT(handled, #r " no read trap");	\
+	} while (0)
+
+#define check_gicv5_gic_op(r)					\
+	do {							\
+		__check_gicv5_gic_op(r);			\
+		__GUEST_ASSERT(handled, #r " no write trap");	\
+	} while (0)
+
+static void guest_code_gicv3(void)
+{
+	uint64_t val;
+
+	/*
+	 * Check that we advertise that ID_AA64PFR0_EL1.GIC == 0, having
+	 * hidden the feature at runtime without any other userspace action.
+	 */
+	__GUEST_ASSERT(FIELD_GET(ID_AA64PFR0_EL1_GIC,
+				 read_sysreg(id_aa64pfr0_el1)) == 0,
+		       "GICv3 wrongly advertised");
+
+	/*
+	 * Access all GICv3 registers, and fail if we don't get an UNDEF.
+ * Note that we happily access all the APxRn registers without + * checking their existence, as all we want to see is a failure. + */ + check_sr_rw(ICC_PMR_EL1); + check_sr_read(ICC_IAR0_EL1); + check_sr_write(ICC_EOIR0_EL1); + check_sr_rw(ICC_HPPIR0_EL1); + check_sr_rw(ICC_BPR0_EL1); + check_sr_rw(ICC_AP0R0_EL1); + check_sr_rw(ICC_AP0R1_EL1); + check_sr_rw(ICC_AP0R2_EL1); + check_sr_rw(ICC_AP0R3_EL1); + check_sr_rw(ICC_AP1R0_EL1); + check_sr_rw(ICC_AP1R1_EL1); + check_sr_rw(ICC_AP1R2_EL1); + check_sr_rw(ICC_AP1R3_EL1); + check_sr_write(ICC_DIR_EL1); + check_sr_read(ICC_RPR_EL1); + check_sr_write(ICC_SGI1R_EL1); + check_sr_write(ICC_ASGI1R_EL1); + check_sr_write(ICC_SGI0R_EL1); + check_sr_read(ICC_IAR1_EL1); + check_sr_write(ICC_EOIR1_EL1); + check_sr_rw(ICC_HPPIR1_EL1); + check_sr_rw(ICC_BPR1_EL1); + check_sr_rw(ICC_CTLR_EL1); + check_sr_rw(ICC_IGRPEN0_EL1); + check_sr_rw(ICC_IGRPEN1_EL1); + + /* + * ICC_SRE_EL1 may not be trappable, as ICC_SRE_EL2.Enable can + * be RAO/WI. Engage in non-fatal accesses, starting with a + * write of 0 to try and disable SRE, and let's see if it + * sticks. + */ + __check_sr_write(ICC_SRE_EL1); + if (!handled) + GUEST_PRINTF("ICC_SRE_EL1 write not trapping (OK)\n"); + + val = __check_sr_read(ICC_SRE_EL1); + if (!handled) { + __GUEST_ASSERT((val & BIT(0)), + "ICC_SRE_EL1 not trapped but ICC_SRE_EL1.SRE not set\n"); + GUEST_PRINTF("ICC_SRE_EL1 read not trapping (OK)\n"); + } + + GUEST_DONE(); +} + +static void guest_code_gicv5(void) +{ + /* + * Check that we advertise that ID_AA64PFR2_EL1.GCIE == 0, having + * hidden the feature at runtime without any other userspace action. + */ + __GUEST_ASSERT(FIELD_GET(ID_AA64PFR2_EL1_GCIE, + read_sysreg_s(SYS_ID_AA64PFR2_EL1)) == 0, + "GICv5 wrongly advertised"); + + /* + * Try all GICv5 instructions, and fail if we don't get an UNDEF. 
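+	 * These are the current-domain (CD*) operations provided as GIC and
+	 * GICR system instructions; see gic_v5.h for their encodings.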
+	 */
+	check_gicv5_gic_op(CDAFF);
+	check_gicv5_gic_op(CDDI);
+	check_gicv5_gic_op(CDDIS);
+	check_gicv5_gic_op(CDEOI);
+	check_gicv5_gic_op(CDHM);
+	check_gicv5_gic_op(CDPEND);
+	check_gicv5_gic_op(CDPRI);
+	check_gicv5_gic_op(CDRCFG);
+	check_gicv5_gicr_op(CDIA);
+	check_gicv5_gicr_op(CDNMIA);
+
+	/* Check General System Register accesses */
+	check_sr_rw(ICC_APR_EL1);
+	check_sr_rw(ICC_CR0_EL1);
+	check_sr_read(ICC_HPPIR_EL1);
+	check_sr_read(ICC_IAFFIDR_EL1);
+	check_sr_rw(ICC_ICSR_EL1);
+	check_sr_read(ICC_IDR0_EL1);
+	check_sr_rw(ICC_PCR_EL1);
+
+	/* Check PPI System Register accesses */
+	check_sr_rw(ICC_PPI_CACTIVER0_EL1);
+	check_sr_rw(ICC_PPI_CACTIVER1_EL1);
+	check_sr_rw(ICC_PPI_SACTIVER0_EL1);
+	check_sr_rw(ICC_PPI_SACTIVER1_EL1);
+	check_sr_rw(ICC_PPI_CPENDR0_EL1);
+	check_sr_rw(ICC_PPI_CPENDR1_EL1);
+	check_sr_rw(ICC_PPI_SPENDR0_EL1);
+	check_sr_rw(ICC_PPI_SPENDR1_EL1);
+	check_sr_rw(ICC_PPI_ENABLER0_EL1);
+	check_sr_rw(ICC_PPI_ENABLER1_EL1);
+	check_sr_read(ICC_PPI_HMR0_EL1);
+	check_sr_read(ICC_PPI_HMR1_EL1);
+	check_sr_rw(ICC_PPI_PRIORITYR0_EL1);
+	check_sr_rw(ICC_PPI_PRIORITYR1_EL1);
+	check_sr_rw(ICC_PPI_PRIORITYR2_EL1);
+	check_sr_rw(ICC_PPI_PRIORITYR3_EL1);
+	check_sr_rw(ICC_PPI_PRIORITYR4_EL1);
+	check_sr_rw(ICC_PPI_PRIORITYR5_EL1);
+	check_sr_rw(ICC_PPI_PRIORITYR6_EL1);
+	check_sr_rw(ICC_PPI_PRIORITYR7_EL1);
+	check_sr_rw(ICC_PPI_PRIORITYR8_EL1);
+	check_sr_rw(ICC_PPI_PRIORITYR9_EL1);
+	check_sr_rw(ICC_PPI_PRIORITYR10_EL1);
+	check_sr_rw(ICC_PPI_PRIORITYR11_EL1);
+	check_sr_rw(ICC_PPI_PRIORITYR12_EL1);
+	check_sr_rw(ICC_PPI_PRIORITYR13_EL1);
+	check_sr_rw(ICC_PPI_PRIORITYR14_EL1);
+	check_sr_rw(ICC_PPI_PRIORITYR15_EL1);
+
+	GUEST_DONE();
+}
+
+static void guest_undef_handler(struct ex_regs *regs)
+{
+	/* Success, we've gracefully exploded!
*/ + handled = true; + regs->pc += 4; +} + +static void test_run_vcpu(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + do { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_PRINTF: + printf("%s", uc.buffer); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } while (uc.cmd != UCALL_DONE); +} + +static void test_guest_no_vgic(void *guest_code) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + /* Create a VM without a GIC */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpu); + + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_ELx_EC_UNKNOWN, guest_undef_handler); + + test_run_vcpu(vcpu); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + bool has_v3, has_v5; + uint64_t pfr; + + test_disable_default_vgic(); + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + + pfr = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); + has_v3 = !!FIELD_GET(ID_AA64PFR0_EL1_GIC, pfr); + + pfr = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR2_EL1)); + has_v5 = !!FIELD_GET(ID_AA64PFR2_EL1_GCIE, pfr); + + kvm_vm_free(vm); + + __TEST_REQUIRE(has_v3 || has_v5, + "Neither GICv3 nor GICv5 supported."); + + if (has_v3) { + pr_info("Testing no-vgic-v3\n"); + test_guest_no_vgic(guest_code_gicv3); + } else { + pr_info("No GICv3 support: skipping no-vgic-v3 test\n"); + } + + if (has_v5) { + pr_info("Testing no-vgic-v5\n"); + test_guest_no_vgic(guest_code_gicv5); + } else { + pr_info("No GICv5 support: skipping no-vgic-v5 test\n"); + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c index 73de5be58bab..7899d557c70b 100644 --- a/tools/testing/selftests/kvm/arm64/set_id_regs.c +++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c @@ -37,6 +37,9 @@ struct reg_ftr_bits { * For FTR_LOWER_SAFE, safe_val is used as the minimal safe value. 
*/ int64_t safe_val; + + /* Allowed to be changed by the host after run */ + bool mutable; }; struct test_feature_reg { @@ -44,7 +47,7 @@ struct test_feature_reg { const struct reg_ftr_bits *ftr_bits; }; -#define __REG_FTR_BITS(NAME, SIGNED, TYPE, SHIFT, MASK, SAFE_VAL) \ +#define __REG_FTR_BITS(NAME, SIGNED, TYPE, SHIFT, MASK, SAFE_VAL, MUT) \ { \ .name = #NAME, \ .sign = SIGNED, \ @@ -52,15 +55,20 @@ struct test_feature_reg { .shift = SHIFT, \ .mask = MASK, \ .safe_val = SAFE_VAL, \ + .mutable = MUT, \ } #define REG_FTR_BITS(type, reg, field, safe_val) \ __REG_FTR_BITS(reg##_##field, FTR_UNSIGNED, type, reg##_##field##_SHIFT, \ - reg##_##field##_MASK, safe_val) + reg##_##field##_MASK, safe_val, false) + +#define REG_FTR_BITS_MUTABLE(type, reg, field, safe_val) \ + __REG_FTR_BITS(reg##_##field, FTR_UNSIGNED, type, reg##_##field##_SHIFT, \ + reg##_##field##_MASK, safe_val, true) #define S_REG_FTR_BITS(type, reg, field, safe_val) \ __REG_FTR_BITS(reg##_##field, FTR_SIGNED, type, reg##_##field##_SHIFT, \ - reg##_##field##_MASK, safe_val) + reg##_##field##_MASK, safe_val, false) #define REG_FTR_END \ { \ @@ -134,7 +142,8 @@ static const struct reg_ftr_bits ftr_id_aa64pfr0_el1[] = { REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, CSV2, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, DIT, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, SEL2, 0), - REG_FTR_BITS(FTR_EXACT, ID_AA64PFR0_EL1, GIC, 0), + /* GICv3 support will be forced at run time if available */ + REG_FTR_BITS_MUTABLE(FTR_EXACT, ID_AA64PFR0_EL1, GIC, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL3, 1), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL2, 1), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL1, 1), @@ -634,12 +643,38 @@ static void test_user_set_mte_reg(struct kvm_vcpu *vcpu) ksft_test_result_pass("ID_AA64PFR1_EL1.MTE_frac no longer 0xF\n"); } +static uint64_t reset_mutable_bits(uint32_t id, uint64_t val) +{ + struct test_feature_reg *reg = NULL; + + for (int i = 0; i < ARRAY_SIZE(test_regs); i++) { + if (test_regs[i].reg == id) { + reg = &test_regs[i]; + break; + } + } + + if (!reg) + return val; + + for (const struct reg_ftr_bits *bits = reg->ftr_bits; bits->type != FTR_END; bits++) { + if (bits->mutable) { + val &= ~bits->mask; + val |= bits->safe_val << bits->shift; + } + } + + return val; +} + static void test_guest_reg_read(struct kvm_vcpu *vcpu) { bool done = false; struct ucall uc; while (!done) { + uint64_t val; + vcpu_run(vcpu); switch (get_ucall(vcpu, &uc)) { @@ -647,9 +682,11 @@ static void test_guest_reg_read(struct kvm_vcpu *vcpu) REPORT_GUEST_ASSERT(uc); break; case UCALL_SYNC: + val = test_reg_vals[encoding_to_range_idx(uc.args[2])]; + val = reset_mutable_bits(uc.args[2], val); + /* Make sure the written values are seen by guest */ - TEST_ASSERT_EQ(test_reg_vals[encoding_to_range_idx(uc.args[2])], - uc.args[3]); + TEST_ASSERT_EQ(val, reset_mutable_bits(uc.args[2], uc.args[3])); break; case UCALL_DONE: done = true; @@ -740,7 +777,8 @@ static void test_assert_id_reg_unchanged(struct kvm_vcpu *vcpu, uint32_t encodin uint64_t observed; observed = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(encoding)); - TEST_ASSERT_EQ(test_reg_vals[idx], observed); + TEST_ASSERT_EQ(reset_mutable_bits(encoding, test_reg_vals[idx]), + reset_mutable_bits(encoding, observed)); } static void test_reset_preserves_id_regs(struct kvm_vcpu *vcpu) diff --git a/tools/testing/selftests/kvm/arm64/vgic_v5.c b/tools/testing/selftests/kvm/arm64/vgic_v5.c new file mode 100644 index 000000000000..3ce6cf37a629 --- /dev/null +++ 
b/tools/testing/selftests/kvm/arm64/vgic_v5.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vgic.h" + +#define NR_VCPUS 1 + +struct vm_gic { + struct kvm_vm *vm; + int gic_fd; + uint32_t gic_dev_type; +}; + +static uint64_t max_phys_size; + +#define GUEST_CMD_IRQ_CDIA 10 +#define GUEST_CMD_IRQ_DIEOI 11 +#define GUEST_CMD_IS_AWAKE 12 +#define GUEST_CMD_IS_READY 13 + +static void guest_irq_handler(struct ex_regs *regs) +{ + bool valid; + u32 hwirq; + u64 ia; + static int count; + + /* + * We have pending interrupts. Should never actually enter WFI + * here! + */ + wfi(); + GUEST_SYNC(GUEST_CMD_IS_AWAKE); + + ia = gicr_insn(CDIA); + valid = GICV5_GICR_CDIA_VALID(ia); + + GUEST_SYNC(GUEST_CMD_IRQ_CDIA); + + if (!valid) + return; + + gsb_ack(); + isb(); + + hwirq = FIELD_GET(GICV5_GICR_CDIA_INTID, ia); + + gic_insn(hwirq, CDDI); + gic_insn(0, CDEOI); + + GUEST_SYNC(GUEST_CMD_IRQ_DIEOI); + + if (++count >= 2) + GUEST_DONE(); + + /* Ask for the next interrupt to be injected */ + GUEST_SYNC(GUEST_CMD_IS_READY); +} + +static void guest_code(void) +{ + local_irq_disable(); + + gicv5_cpu_enable_interrupts(); + local_irq_enable(); + + /* Enable the SW_PPI (3) */ + write_sysreg_s(BIT_ULL(3), SYS_ICC_PPI_ENABLER0_EL1); + + /* Ask for the first interrupt to be injected */ + GUEST_SYNC(GUEST_CMD_IS_READY); + + /* Loop forever waiting for interrupts */ + while (1); +} + + +/* we don't want to assert on run execution, hence that helper */ +static int run_vcpu(struct kvm_vcpu *vcpu) +{ + return __vcpu_run(vcpu) ? -errno : 0; +} + +static void vm_gic_destroy(struct vm_gic *v) +{ + close(v->gic_fd); + kvm_vm_free(v->vm); +} + +static void test_vgic_v5_ppis(uint32_t gic_dev_type) +{ + struct kvm_vcpu *vcpus[NR_VCPUS]; + struct ucall uc; + u64 user_ppis[2]; + struct vm_gic v; + int ret, i; + + v.gic_dev_type = gic_dev_type; + v.vm = __vm_create(VM_SHAPE_DEFAULT, NR_VCPUS, 0); + + v.gic_fd = kvm_create_device(v.vm, gic_dev_type); + + for (i = 0; i < NR_VCPUS; i++) + vcpus[i] = vm_vcpu_add(v.vm, i, guest_code); + + vm_init_descriptor_tables(v.vm); + vm_install_exception_handler(v.vm, VECTOR_IRQ_CURRENT, guest_irq_handler); + + for (i = 0; i < NR_VCPUS; i++) + vcpu_init_descriptor_tables(vcpus[i]); + + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + /* Read out the PPIs that user space is allowed to drive. */ + kvm_device_attr_get(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_USERSPACE_PPIS, &user_ppis); + + /* We should always be able to drive the SW_PPI. */ + TEST_ASSERT(user_ppis[0] & BIT(GICV5_ARCH_PPI_SW_PPI), + "SW_PPI is not drivable by userspace"); + + while (1) { + ret = run_vcpu(vcpus[0]); + + switch (get_ucall(vcpus[0], &uc)) { + case UCALL_SYNC: + /* + * The guest is ready for the next level change. Set + * high if ready, and lower if it has been consumed. + */ + if (uc.args[1] == GUEST_CMD_IS_READY || + uc.args[1] == GUEST_CMD_IRQ_DIEOI) { + u64 irq; + bool level = uc.args[1] == GUEST_CMD_IRQ_DIEOI ? 
0 : 1; + + irq = FIELD_PREP(KVM_ARM_IRQ_NUM_MASK, 3); + irq |= KVM_ARM_IRQ_TYPE_PPI << KVM_ARM_IRQ_TYPE_SHIFT; + + _kvm_irq_line(v.vm, irq, level); + } else if (uc.args[1] == GUEST_CMD_IS_AWAKE) { + pr_info("Guest skipping WFI due to pending IRQ\n"); + } else if (uc.args[1] == GUEST_CMD_IRQ_CDIA) { + pr_info("Guest acknowledged IRQ\n"); + } + + continue; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + TEST_ASSERT(ret == 0, "Failed to test GICv5 PPIs"); + + vm_gic_destroy(&v); +} + +/* + * Returns 0 if it's possible to create GIC device of a given type (V5). + */ +int test_kvm_device(uint32_t gic_dev_type) +{ + struct kvm_vcpu *vcpus[NR_VCPUS]; + struct vm_gic v; + int ret; + + v.vm = vm_create_with_vcpus(NR_VCPUS, guest_code, vcpus); + + /* try to create a non existing KVM device */ + ret = __kvm_test_create_device(v.vm, 0); + TEST_ASSERT(ret && errno == ENODEV, "unsupported device"); + + /* trial mode */ + ret = __kvm_test_create_device(v.vm, gic_dev_type); + if (ret) + return ret; + v.gic_fd = kvm_create_device(v.vm, gic_dev_type); + + ret = __kvm_create_device(v.vm, gic_dev_type); + TEST_ASSERT(ret < 0 && errno == EEXIST, "create GIC device twice"); + + vm_gic_destroy(&v); + + return 0; +} + +void run_tests(uint32_t gic_dev_type) +{ + pr_info("Test VGICv5 PPIs\n"); + test_vgic_v5_ppis(gic_dev_type); +} + +int main(int ac, char **av) +{ + int ret; + int pa_bits; + + test_disable_default_vgic(); + + pa_bits = vm_guest_mode_params[VM_MODE_DEFAULT].pa_bits; + max_phys_size = 1ULL << pa_bits; + + ret = test_kvm_device(KVM_DEV_TYPE_ARM_VGIC_V5); + if (ret) { + pr_info("No GICv5 support; Not running GIC_v5 tests.\n"); + exit(KSFT_SKIP); + } + + pr_info("Running VGIC_V5 tests.\n"); + run_tests(KVM_DEV_TYPE_ARM_VGIC_V5); + + return 0; +} diff --git a/tools/testing/selftests/kvm/include/arm64/gic_v5.h b/tools/testing/selftests/kvm/include/arm64/gic_v5.h new file mode 100644 index 000000000000..eb523d9277cf --- /dev/null +++ b/tools/testing/selftests/kvm/include/arm64/gic_v5.h @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __SELFTESTS_GIC_V5_H +#define __SELFTESTS_GIC_V5_H + +#include +#include + +#include + +#include "processor.h" + +/* + * Definitions for GICv5 instructions for the Current Domain + */ +#define GICV5_OP_GIC_CDAFF sys_insn(1, 0, 12, 1, 3) +#define GICV5_OP_GIC_CDDI sys_insn(1, 0, 12, 2, 0) +#define GICV5_OP_GIC_CDDIS sys_insn(1, 0, 12, 1, 0) +#define GICV5_OP_GIC_CDHM sys_insn(1, 0, 12, 2, 1) +#define GICV5_OP_GIC_CDEN sys_insn(1, 0, 12, 1, 1) +#define GICV5_OP_GIC_CDEOI sys_insn(1, 0, 12, 1, 7) +#define GICV5_OP_GIC_CDPEND sys_insn(1, 0, 12, 1, 4) +#define GICV5_OP_GIC_CDPRI sys_insn(1, 0, 12, 1, 2) +#define GICV5_OP_GIC_CDRCFG sys_insn(1, 0, 12, 1, 5) +#define GICV5_OP_GICR_CDIA sys_insn(1, 0, 12, 3, 0) +#define GICV5_OP_GICR_CDNMIA sys_insn(1, 0, 12, 3, 1) + +/* Definitions for GIC CDAFF */ +#define GICV5_GIC_CDAFF_IAFFID_MASK GENMASK_ULL(47, 32) +#define GICV5_GIC_CDAFF_TYPE_MASK GENMASK_ULL(31, 29) +#define GICV5_GIC_CDAFF_IRM_MASK BIT_ULL(28) +#define GICV5_GIC_CDAFF_ID_MASK GENMASK_ULL(23, 0) + +/* Definitions for GIC CDDI */ +#define GICV5_GIC_CDDI_TYPE_MASK GENMASK_ULL(31, 29) +#define GICV5_GIC_CDDI_ID_MASK GENMASK_ULL(23, 0) + +/* Definitions for GIC CDDIS */ +#define GICV5_GIC_CDDIS_TYPE_MASK GENMASK_ULL(31, 29) +#define GICV5_GIC_CDDIS_TYPE(r) FIELD_GET(GICV5_GIC_CDDIS_TYPE_MASK, r) +#define GICV5_GIC_CDDIS_ID_MASK GENMASK_ULL(23, 
0) +#define GICV5_GIC_CDDIS_ID(r) FIELD_GET(GICV5_GIC_CDDIS_ID_MASK, r) + +/* Definitions for GIC CDEN */ +#define GICV5_GIC_CDEN_TYPE_MASK GENMASK_ULL(31, 29) +#define GICV5_GIC_CDEN_ID_MASK GENMASK_ULL(23, 0) + +/* Definitions for GIC CDHM */ +#define GICV5_GIC_CDHM_HM_MASK BIT_ULL(32) +#define GICV5_GIC_CDHM_TYPE_MASK GENMASK_ULL(31, 29) +#define GICV5_GIC_CDHM_ID_MASK GENMASK_ULL(23, 0) + +/* Definitions for GIC CDPEND */ +#define GICV5_GIC_CDPEND_PENDING_MASK BIT_ULL(32) +#define GICV5_GIC_CDPEND_TYPE_MASK GENMASK_ULL(31, 29) +#define GICV5_GIC_CDPEND_ID_MASK GENMASK_ULL(23, 0) + +/* Definitions for GIC CDPRI */ +#define GICV5_GIC_CDPRI_PRIORITY_MASK GENMASK_ULL(39, 35) +#define GICV5_GIC_CDPRI_TYPE_MASK GENMASK_ULL(31, 29) +#define GICV5_GIC_CDPRI_ID_MASK GENMASK_ULL(23, 0) + +/* Definitions for GIC CDRCFG */ +#define GICV5_GIC_CDRCFG_TYPE_MASK GENMASK_ULL(31, 29) +#define GICV5_GIC_CDRCFG_ID_MASK GENMASK_ULL(23, 0) + +/* Definitions for GICR CDIA */ +#define GICV5_GICR_CDIA_VALID_MASK BIT_ULL(32) +#define GICV5_GICR_CDIA_VALID(r) FIELD_GET(GICV5_GICR_CDIA_VALID_MASK, r) +#define GICV5_GICR_CDIA_TYPE_MASK GENMASK_ULL(31, 29) +#define GICV5_GICR_CDIA_ID_MASK GENMASK_ULL(23, 0) +#define GICV5_GICR_CDIA_INTID GENMASK_ULL(31, 0) + +/* Definitions for GICR CDNMIA */ +#define GICV5_GICR_CDNMIA_VALID_MASK BIT_ULL(32) +#define GICV5_GICR_CDNMIA_VALID(r) FIELD_GET(GICV5_GICR_CDNMIA_VALID_MASK, r) +#define GICV5_GICR_CDNMIA_TYPE_MASK GENMASK_ULL(31, 29) +#define GICV5_GICR_CDNMIA_ID_MASK GENMASK_ULL(23, 0) + +#define gicr_insn(insn) read_sysreg_s(GICV5_OP_GICR_##insn) +#define gic_insn(v, insn) write_sysreg_s(v, GICV5_OP_GIC_##insn) + +#define __GIC_BARRIER_INSN(op0, op1, CRn, CRm, op2, Rt) \ + __emit_inst(0xd5000000 | \ + sys_insn((op0), (op1), (CRn), (CRm), (op2)) | \ + ((Rt) & 0x1f)) + +#define GSB_SYS_BARRIER_INSN __GIC_BARRIER_INSN(1, 0, 12, 0, 0, 31) +#define GSB_ACK_BARRIER_INSN __GIC_BARRIER_INSN(1, 0, 12, 0, 1, 31) + +#define gsb_ack() asm volatile(GSB_ACK_BARRIER_INSN : : : "memory") +#define gsb_sys() asm volatile(GSB_SYS_BARRIER_INSN : : : "memory") + +#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) + +#define GICV5_IRQ_DEFAULT_PRI 0b10000 + +#define GICV5_ARCH_PPI_SW_PPI 0x3 + +void gicv5_ppi_priority_init(void) +{ + write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR0_EL1); + write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR1_EL1); + write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR2_EL1); + write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR3_EL1); + write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR4_EL1); + write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR5_EL1); + write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR6_EL1); + write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR7_EL1); + write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR8_EL1); + write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR9_EL1); + write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR10_EL1); + write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR11_EL1); + write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR12_EL1); + write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR13_EL1); + write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), SYS_ICC_PPI_PRIORITYR14_EL1); + write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_DEFAULT_PRI), 
SYS_ICC_PPI_PRIORITYR15_EL1);
+
+	/*
+	 * Context synchronisation is required to make sure the system
+	 * register writes above have taken effect.
+	 */
+	isb();
+}
+
+void gicv5_cpu_disable_interrupts(void)
+{
+	u64 cr0;
+
+	cr0 = FIELD_PREP(ICC_CR0_EL1_EN, 0);
+	write_sysreg_s(cr0, SYS_ICC_CR0_EL1);
+}
+
+void gicv5_cpu_enable_interrupts(void)
+{
+	u64 cr0, pcr;
+
+	write_sysreg_s(0, SYS_ICC_PPI_ENABLER0_EL1);
+	write_sysreg_s(0, SYS_ICC_PPI_ENABLER1_EL1);
+
+	gicv5_ppi_priority_init();
+
+	pcr = FIELD_PREP(ICC_PCR_EL1_PRIORITY, GICV5_IRQ_DEFAULT_PRI);
+	write_sysreg_s(pcr, SYS_ICC_PCR_EL1);
+
+	cr0 = FIELD_PREP(ICC_CR0_EL1_EN, 1);
+	write_sysreg_s(cr0, SYS_ICC_CR0_EL1);
+}
+
+#endif
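
A note for readers: the acknowledge/deactivate sequence exercised by
guest_irq_handler() in vgic_v5.c composes the gic_v5.h helpers above as
follows. This is a minimal illustrative sketch, not part of the patch; the
function name handle_one_gicv5_irq() is made up here, and it assumes the
selftest guest environment with gic_v5.h included:

/* Illustrative only: GICv5 acknowledge/deactivate flow, as in vgic_v5.c */
static void handle_one_gicv5_irq(void)
{
	/* Acknowledge the highest-priority pending interrupt (GICR CDIA) */
	u64 ia = gicr_insn(CDIA);

	if (!GICV5_GICR_CDIA_VALID(ia))
		return;		/* nothing was pending */

	/* Order the acknowledge before consuming the INTID */
	gsb_ack();
	isb();

	/* Deactivate the interrupt, then signal end-of-interrupt */
	gic_insn(FIELD_GET(GICV5_GICR_CDIA_INTID, ia), CDDI);
	gic_insn(0, CDEOI);
}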