From 6712fcde003f780d6241d755a0fa41ff4739b9a4 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 20 Feb 2026 17:12:28 +0800 Subject: [PATCH 001/109] arm64: remove ARCH_INLINE_* Since commit 7dadeaa6e851 ("sched: Further restrict the preemption modes"), arm64 only has two preemption models: full and lazy. Both implies PREEMPTION, so !PREEMPTION is always false for arm64, it's time to remove ARCH_INLINE_*. Signed-off-by: Jisheng Zhang Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 38dba5f7e4d2..3d90f2d35a99 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -61,32 +61,6 @@ config ARM64 select ARCH_HAVE_ELF_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_HAVE_TRACE_MMIO_ACCESS - select ARCH_INLINE_READ_LOCK if !PREEMPTION - select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION - select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPTION - select ARCH_INLINE_READ_LOCK_IRQSAVE if !PREEMPTION - select ARCH_INLINE_READ_UNLOCK if !PREEMPTION - select ARCH_INLINE_READ_UNLOCK_BH if !PREEMPTION - select ARCH_INLINE_READ_UNLOCK_IRQ if !PREEMPTION - select ARCH_INLINE_READ_UNLOCK_IRQRESTORE if !PREEMPTION - select ARCH_INLINE_WRITE_LOCK if !PREEMPTION - select ARCH_INLINE_WRITE_LOCK_BH if !PREEMPTION - select ARCH_INLINE_WRITE_LOCK_IRQ if !PREEMPTION - select ARCH_INLINE_WRITE_LOCK_IRQSAVE if !PREEMPTION - select ARCH_INLINE_WRITE_UNLOCK if !PREEMPTION - select ARCH_INLINE_WRITE_UNLOCK_BH if !PREEMPTION - select ARCH_INLINE_WRITE_UNLOCK_IRQ if !PREEMPTION - select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE if !PREEMPTION - select ARCH_INLINE_SPIN_TRYLOCK if !PREEMPTION - select ARCH_INLINE_SPIN_TRYLOCK_BH if !PREEMPTION - select ARCH_INLINE_SPIN_LOCK if !PREEMPTION - select ARCH_INLINE_SPIN_LOCK_BH if !PREEMPTION - select ARCH_INLINE_SPIN_LOCK_IRQ if !PREEMPTION - select ARCH_INLINE_SPIN_LOCK_IRQSAVE if !PREEMPTION - select ARCH_INLINE_SPIN_UNLOCK if !PREEMPTION - select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPTION - select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION - select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION select ARCH_KEEP_MEMBLOCK select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE select ARCH_USE_CMPXCHG_LOCKREF From 3ce8f5860ff478d23af87bd459e76e466b483af3 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 13 Mar 2026 12:32:19 +0000 Subject: [PATCH 002/109] arm64: scs: Remove redundant save/restore of SCS SP on entry to/from EL0 When returning to userspace, the SCS is empty and so the SCS SP just points to the base address of the SCS page. Rather than saving and restoring this address in the current task, we can simply restore the SCS SP to point at the base of the stack on entry to EL1 from EL0. Cc: Ard Biesheuvel Cc: Mark Rutland Signed-off-by: Will Deacon Acked-by: Mark Rutland Acked-by: Ard Biesheuvel Reviewed-by: Sami Tolvanen Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/scs.h | 8 ++++++++ arch/arm64/kernel/entry.S | 4 +--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h index 0fbc2e7867d3..a15a2968e7b6 100644 --- a/arch/arm64/include/asm/scs.h +++ b/arch/arm64/include/asm/scs.h @@ -10,6 +10,11 @@ #ifdef CONFIG_SHADOW_CALL_STACK scs_sp .req x18 + .macro scs_load_current_base + get_current_task scs_sp + ldr scs_sp, [scs_sp, #TSK_TI_SCS_BASE] + .endm + .macro scs_load_current get_current_task scs_sp ldr scs_sp, [scs_sp, #TSK_TI_SCS_SP] @@ -19,6 +24,9 @@ str scs_sp, [\tsk, #TSK_TI_SCS_SP] .endm #else + .macro scs_load_current_base + .endm + .macro scs_load_current .endm diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index f8018b5c1f9a..ab476ba060d1 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -273,7 +273,7 @@ alternative_if ARM64_HAS_ADDRESS_AUTH alternative_else_nop_endif 1: - scs_load_current + scs_load_current_base .else add x21, sp, #PT_REGS_SIZE get_current_task tsk @@ -378,8 +378,6 @@ alternative_if ARM64_WORKAROUND_845719 alternative_else_nop_endif #endif 3: - scs_save tsk - /* Ignore asynchronous tag check faults in the uaccess routines */ ldr x0, [tsk, THREAD_SCTLR_USER] clear_mte_async_tcf x0 From 5b3fb8a6b429c33ee669d08b1a883d881e9614a1 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 2 Mar 2026 13:55:48 +0000 Subject: [PATCH 003/109] arm64: mm: Re-implement the __tlbi_level macro as a C function As part of efforts to reduce our reliance on complex preprocessor macros for TLB invalidation routines, convert the __tlbi_level macro to a C function for by-level TLB invalidation. Each specific tlbi level op is implemented as a C function and the appropriate function pointer is passed to __tlbi_level(). Since everything is declared inline and is statically resolvable, the compiler will convert the indirect function call to a direct inline execution. Suggested-by: Linus Torvalds Reviewed-by: Jonathan Cameron Signed-off-by: Ryan Roberts Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 67 +++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 1416e652612b..a0e3ebe29986 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -97,19 +97,60 @@ static inline unsigned long get_trans_granule(void) #define TLBI_TTL_UNKNOWN INT_MAX -#define __tlbi_level(op, addr, level) do { \ - u64 arg = addr; \ - \ - if (alternative_has_cap_unlikely(ARM64_HAS_ARMv8_4_TTL) && \ - level >= 0 && level <= 3) { \ - u64 ttl = level & 3; \ - ttl |= get_trans_granule() << 2; \ - arg &= ~TLBI_TTL_MASK; \ - arg |= FIELD_PREP(TLBI_TTL_MASK, ttl); \ - } \ - \ - __tlbi(op, arg); \ -} while(0) +typedef void (*tlbi_op)(u64 arg); + +static __always_inline void vae1is(u64 arg) +{ + __tlbi(vae1is, arg); +} + +static __always_inline void vae2is(u64 arg) +{ + __tlbi(vae2is, arg); +} + +static __always_inline void vale1(u64 arg) +{ + __tlbi(vale1, arg); +} + +static __always_inline void vale1is(u64 arg) +{ + __tlbi(vale1is, arg); +} + +static __always_inline void vale2is(u64 arg) +{ + __tlbi(vale2is, arg); +} + +static __always_inline void vaale1is(u64 arg) +{ + __tlbi(vaale1is, arg); +} + +static __always_inline void ipas2e1(u64 arg) +{ + __tlbi(ipas2e1, arg); +} + +static __always_inline void ipas2e1is(u64 arg) +{ + __tlbi(ipas2e1is, arg); +} + +static __always_inline void __tlbi_level(tlbi_op op, u64 addr, u32 level) +{ + u64 arg = addr; + + if (alternative_has_cap_unlikely(ARM64_HAS_ARMv8_4_TTL) && level <= 3) { + u64 ttl = level | (get_trans_granule() << 2); + + FIELD_MODIFY(TLBI_TTL_MASK, &arg, ttl); + } + + op(arg); +} #define __tlbi_user_level(op, arg, level) do { \ if (arm64_kernel_unmapped_at_el0()) \ From d2bf3226952c64d1c2ce4995cce60b3fb8ae5f33 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 2 Mar 2026 13:55:49 +0000 Subject: [PATCH 004/109] arm64: mm: Introduce a C wrapper for by-range TLB invalidation As part of efforts to reduce our reliance on complex preprocessor macros for TLB invalidation routines, introduce a new C wrapper for by-range TLB invalidation which can be used instead of the __tlbi() macro and can additionally be called from C code. Each specific tlbi range op is implemented as a C function and the appropriate function pointer is passed to __tlbi_range(). Since everything is declared inline and is statically resolvable, the compiler will convert the indirect function call to a direct inline execution. Suggested-by: Linus Torvalds Reviewed-by: Jonathan Cameron Signed-off-by: Ryan Roberts Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 32 ++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index a0e3ebe29986..b3b86e5f7034 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -468,6 +468,36 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) * operations can only span an even number of pages. We save this for last to * ensure 64KB start alignment is maintained for the LPA2 case. */ +static __always_inline void rvae1is(u64 arg) +{ + __tlbi(rvae1is, arg); +} + +static __always_inline void rvale1(u64 arg) +{ + __tlbi(rvale1, arg); +} + +static __always_inline void rvale1is(u64 arg) +{ + __tlbi(rvale1is, arg); +} + +static __always_inline void rvaale1is(u64 arg) +{ + __tlbi(rvaale1is, arg); +} + +static __always_inline void ripas2e1is(u64 arg) +{ + __tlbi(ripas2e1is, arg); +} + +static __always_inline void __tlbi_range(tlbi_op op, u64 arg) +{ + op(arg); +} + #define __flush_tlb_range_op(op, start, pages, stride, \ asid, tlb_level, tlbi_user, lpa2) \ do { \ @@ -495,7 +525,7 @@ do { \ if (num >= 0) { \ addr = __TLBI_VADDR_RANGE(__flush_start >> shift, asid, \ scale, num, tlb_level); \ - __tlbi(r##op, addr); \ + __tlbi_range(r##op, addr); \ if (tlbi_user) \ __tlbi_user(r##op, addr); \ __flush_start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; \ From edc55b7abb2547aac5521202b029dc4dd2054771 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 2 Mar 2026 13:55:50 +0000 Subject: [PATCH 005/109] arm64: mm: Implicitly invalidate user ASID based on TLBI operation When kpti is enabled, separate ASIDs are used for userspace and kernelspace, requiring ASID-qualified TLB invalidation by virtual address to invalidate both of them. Push the logic for invalidating the two ASIDs down into the low-level tlbi-op-specific functions and remove the burden from the caller to handle the kpti-specific behaviour. Co-developed-by: Will Deacon Signed-off-by: Will Deacon Reviewed-by: Jonathan Cameron Signed-off-by: Ryan Roberts Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index b3b86e5f7034..e586d9b71ea2 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -102,6 +102,7 @@ typedef void (*tlbi_op)(u64 arg); static __always_inline void vae1is(u64 arg) { __tlbi(vae1is, arg); + __tlbi_user(vae1is, arg); } static __always_inline void vae2is(u64 arg) @@ -112,11 +113,13 @@ static __always_inline void vae2is(u64 arg) static __always_inline void vale1(u64 arg) { __tlbi(vale1, arg); + __tlbi_user(vale1, arg); } static __always_inline void vale1is(u64 arg) { __tlbi(vale1is, arg); + __tlbi_user(vale1is, arg); } static __always_inline void vale2is(u64 arg) @@ -152,11 +155,6 @@ static __always_inline void __tlbi_level(tlbi_op op, u64 addr, u32 level) op(arg); } -#define __tlbi_user_level(op, arg, level) do { \ - if (arm64_kernel_unmapped_at_el0()) \ - __tlbi_level(op, (arg | USER_ASID_FLAG), level); \ -} while (0) - /* * This macro creates a properly formatted VA operand for the TLB RANGE. The * value bit assignments are: @@ -444,8 +442,6 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) * @stride: Flush granularity * @asid: The ASID of the task (0 for IPA instructions) * @tlb_level: Translation Table level hint, if known - * @tlbi_user: If 'true', call an additional __tlbi_user() - * (typically for user ASIDs). 'flase' for IPA instructions * @lpa2: If 'true', the lpa2 scheme is used as set out below * * When the CPU does not support TLB range operations, flush the TLB @@ -471,16 +467,19 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) static __always_inline void rvae1is(u64 arg) { __tlbi(rvae1is, arg); + __tlbi_user(rvae1is, arg); } static __always_inline void rvale1(u64 arg) { __tlbi(rvale1, arg); + __tlbi_user(rvale1, arg); } static __always_inline void rvale1is(u64 arg) { __tlbi(rvale1is, arg); + __tlbi_user(rvale1is, arg); } static __always_inline void rvaale1is(u64 arg) @@ -499,7 +498,7 @@ static __always_inline void __tlbi_range(tlbi_op op, u64 arg) } #define __flush_tlb_range_op(op, start, pages, stride, \ - asid, tlb_level, tlbi_user, lpa2) \ + asid, tlb_level, lpa2) \ do { \ typeof(start) __flush_start = start; \ typeof(pages) __flush_pages = pages; \ @@ -514,8 +513,6 @@ do { \ (lpa2 && __flush_start != ALIGN(__flush_start, SZ_64K))) { \ addr = __TLBI_VADDR(__flush_start, asid); \ __tlbi_level(op, addr, tlb_level); \ - if (tlbi_user) \ - __tlbi_user_level(op, addr, tlb_level); \ __flush_start += stride; \ __flush_pages -= stride >> PAGE_SHIFT; \ continue; \ @@ -526,8 +523,6 @@ do { \ addr = __TLBI_VADDR_RANGE(__flush_start >> shift, asid, \ scale, num, tlb_level); \ __tlbi_range(r##op, addr); \ - if (tlbi_user) \ - __tlbi_user(r##op, addr); \ __flush_start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; \ __flush_pages -= __TLBI_RANGE_PAGES(num, scale);\ } \ @@ -536,7 +531,7 @@ do { \ } while (0) #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \ - __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled()); + __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, kvm_lpa2_is_enabled()); static inline bool __flush_tlb_range_limit_excess(unsigned long start, unsigned long end, unsigned long pages, unsigned long stride) @@ -576,10 +571,10 @@ static inline void __flush_tlb_range_nosync(struct mm_struct *mm, if (last_level) __flush_tlb_range_op(vale1is, start, pages, stride, asid, - tlb_level, true, lpa2_is_enabled()); + tlb_level, lpa2_is_enabled()); else __flush_tlb_range_op(vae1is, start, pages, stride, asid, - tlb_level, true, lpa2_is_enabled()); + tlb_level, lpa2_is_enabled()); mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } @@ -604,7 +599,7 @@ static inline void local_flush_tlb_contpte(struct vm_area_struct *vma, dsb(nshst); asid = ASID(vma->vm_mm); __flush_tlb_range_op(vale1, addr, CONT_PTES, PAGE_SIZE, asid, - 3, true, lpa2_is_enabled()); + 3, lpa2_is_enabled()); mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, addr, addr + CONT_PTE_SIZE); dsb(nsh); @@ -638,7 +633,7 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end dsb(ishst); __flush_tlb_range_op(vaale1is, start, pages, stride, 0, - TLBI_TTL_UNKNOWN, false, lpa2_is_enabled()); + TLBI_TTL_UNKNOWN, lpa2_is_enabled()); __tlbi_sync_s1ish(); isb(); } @@ -689,6 +684,7 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) } #define huge_pmd_needs_flush huge_pmd_needs_flush +#undef __tlbi_user #endif #endif From a3710035604fdeace5c0945d98955a1624e4648d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 2 Mar 2026 13:55:51 +0000 Subject: [PATCH 006/109] arm64: mm: Push __TLBI_VADDR() into __tlbi_level() The __TLBI_VADDR() macro takes an ASID and an address and converts them into a single argument formatted correctly for a TLB invalidation instruction. Rather than have callers worry about this (especially in the case where the ASID is zero), push the macro down into __tlbi_level() via a new __tlbi_level_asid() helper. Signed-off-by: Will Deacon Reviewed-by: Linu Cherian Reviewed-by: Jonathan Cameron Signed-off-by: Ryan Roberts Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 14 ++++++++++---- arch/arm64/kernel/sys_compat.c | 2 +- arch/arm64/kvm/hyp/nvhe/mm.c | 2 +- arch/arm64/kvm/hyp/nvhe/tlb.c | 2 -- arch/arm64/kvm/hyp/pgtable.c | 4 ++-- arch/arm64/kvm/hyp/vhe/tlb.c | 2 -- 6 files changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index e586d9b71ea2..2832305606b7 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -142,9 +142,10 @@ static __always_inline void ipas2e1is(u64 arg) __tlbi(ipas2e1is, arg); } -static __always_inline void __tlbi_level(tlbi_op op, u64 addr, u32 level) +static __always_inline void __tlbi_level_asid(tlbi_op op, u64 addr, u32 level, + u16 asid) { - u64 arg = addr; + u64 arg = __TLBI_VADDR(addr, asid); if (alternative_has_cap_unlikely(ARM64_HAS_ARMv8_4_TTL) && level <= 3) { u64 ttl = level | (get_trans_granule() << 2); @@ -155,6 +156,11 @@ static __always_inline void __tlbi_level(tlbi_op op, u64 addr, u32 level) op(arg); } +static inline void __tlbi_level(tlbi_op op, u64 addr, u32 level) +{ + __tlbi_level_asid(op, addr, level, 0); +} + /* * This macro creates a properly formatted VA operand for the TLB RANGE. The * value bit assignments are: @@ -511,8 +517,7 @@ do { \ if (!system_supports_tlb_range() || \ __flush_pages == 1 || \ (lpa2 && __flush_start != ALIGN(__flush_start, SZ_64K))) { \ - addr = __TLBI_VADDR(__flush_start, asid); \ - __tlbi_level(op, addr, tlb_level); \ + __tlbi_level_asid(op, __flush_start, tlb_level, asid); \ __flush_start += stride; \ __flush_pages -= stride >> PAGE_SHIFT; \ continue; \ @@ -685,6 +690,7 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) #define huge_pmd_needs_flush huge_pmd_needs_flush #undef __tlbi_user +#undef __TLBI_VADDR #endif #endif diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c index b9d4998c97ef..7e9860143add 100644 --- a/arch/arm64/kernel/sys_compat.c +++ b/arch/arm64/kernel/sys_compat.c @@ -36,7 +36,7 @@ __do_compat_cache_op(unsigned long start, unsigned long end) * The workaround requires an inner-shareable tlbi. * We pick the reserved-ASID to minimise the impact. */ - __tlbi(aside1is, __TLBI_VADDR(0, 0)); + __tlbi(aside1is, 0UL); __tlbi_sync_s1ish(); } diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 218976287d3f..4d8fcc7a3a41 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -270,7 +270,7 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) * https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03 */ dsb(ishst); - __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level); + __tlbi_level(vale2is, addr, level); __tlbi_sync_s1ish_hyp(); isb(); } diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 3dc1ce0d27fe..b29140995d48 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -158,7 +158,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, * Instead, we invalidate Stage-2 for this IPA, and the * whole of Stage-1. Weep... */ - ipa >>= 12; __tlbi_level(ipas2e1is, ipa, level); /* @@ -188,7 +187,6 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, * Instead, we invalidate Stage-2 for this IPA, and the * whole of Stage-1. Weep... */ - ipa >>= 12; __tlbi_level(ipas2e1, ipa, level); /* diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 9b480f947da2..30226f2d5564 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -490,14 +490,14 @@ static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, kvm_clear_pte(ctx->ptep); dsb(ishst); - __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN); + __tlbi_level(vae2is, ctx->addr, TLBI_TTL_UNKNOWN); } else { if (ctx->end - ctx->addr < granule) return -EINVAL; kvm_clear_pte(ctx->ptep); dsb(ishst); - __tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level); + __tlbi_level(vale2is, ctx->addr, ctx->level); *unmapped += granule; } diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index 35855dadfb1b..f7b9dfe3f3a5 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -104,7 +104,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, * Instead, we invalidate Stage-2 for this IPA, and the * whole of Stage-1. Weep... */ - ipa >>= 12; __tlbi_level(ipas2e1is, ipa, level); /* @@ -136,7 +135,6 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, * Instead, we invalidate Stage-2 for this IPA, and the * whole of Stage-1. Weep... */ - ipa >>= 12; __tlbi_level(ipas2e1, ipa, level); /* From d4b048ca145fd31a2c6f77ae4db49e51bdec58ac Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 2 Mar 2026 13:55:52 +0000 Subject: [PATCH 007/109] arm64: mm: Inline __TLBI_VADDR_RANGE() into __tlbi_range() The __TLBI_VADDR_RANGE() macro is only used in one place and isn't something that's generally useful outside of the low-level range invalidation gubbins. Inline __TLBI_VADDR_RANGE() into the __tlbi_range() function so that the macro can be removed entirely. Signed-off-by: Will Deacon Reviewed-by: Linu Cherian Reviewed-by: Jonathan Cameron Signed-off-by: Ryan Roberts Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 32 +++++++++++++------------------ 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 2832305606b7..0f81547470ea 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -186,19 +186,6 @@ static inline void __tlbi_level(tlbi_op op, u64 addr, u32 level) #define TLBIR_TTL_MASK GENMASK_ULL(38, 37) #define TLBIR_BADDR_MASK GENMASK_ULL(36, 0) -#define __TLBI_VADDR_RANGE(baddr, asid, scale, num, ttl) \ - ({ \ - unsigned long __ta = 0; \ - unsigned long __ttl = (ttl >= 1 && ttl <= 3) ? ttl : 0; \ - __ta |= FIELD_PREP(TLBIR_BADDR_MASK, baddr); \ - __ta |= FIELD_PREP(TLBIR_TTL_MASK, __ttl); \ - __ta |= FIELD_PREP(TLBIR_NUM_MASK, num); \ - __ta |= FIELD_PREP(TLBIR_SCALE_MASK, scale); \ - __ta |= FIELD_PREP(TLBIR_TG_MASK, get_trans_granule()); \ - __ta |= FIELD_PREP(TLBIR_ASID_MASK, asid); \ - __ta; \ - }) - /* These macros are used by the TLBI RANGE feature. */ #define __TLBI_RANGE_PAGES(num, scale) \ ((unsigned long)((num) + 1) << (5 * (scale) + 1)) @@ -498,8 +485,19 @@ static __always_inline void ripas2e1is(u64 arg) __tlbi(ripas2e1is, arg); } -static __always_inline void __tlbi_range(tlbi_op op, u64 arg) +static __always_inline void __tlbi_range(tlbi_op op, u64 addr, + u16 asid, int scale, int num, + u32 level, bool lpa2) { + u64 arg = 0; + + arg |= FIELD_PREP(TLBIR_BADDR_MASK, addr >> (lpa2 ? 16 : PAGE_SHIFT)); + arg |= FIELD_PREP(TLBIR_TTL_MASK, level > 3 ? 0 : level); + arg |= FIELD_PREP(TLBIR_NUM_MASK, num); + arg |= FIELD_PREP(TLBIR_SCALE_MASK, scale); + arg |= FIELD_PREP(TLBIR_TG_MASK, get_trans_granule()); + arg |= FIELD_PREP(TLBIR_ASID_MASK, asid); + op(arg); } @@ -510,8 +508,6 @@ do { \ typeof(pages) __flush_pages = pages; \ int num = 0; \ int scale = 3; \ - int shift = lpa2 ? 16 : PAGE_SHIFT; \ - unsigned long addr; \ \ while (__flush_pages > 0) { \ if (!system_supports_tlb_range() || \ @@ -525,9 +521,7 @@ do { \ \ num = __TLBI_RANGE_NUM(__flush_pages, scale); \ if (num >= 0) { \ - addr = __TLBI_VADDR_RANGE(__flush_start >> shift, asid, \ - scale, num, tlb_level); \ - __tlbi_range(r##op, addr); \ + __tlbi_range(r##op, __flush_start, asid, scale, num, tlb_level, lpa2); \ __flush_start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; \ __flush_pages -= __TLBI_RANGE_PAGES(num, scale);\ } \ From 5e63b73f3deb1e0f7e2234f7f6743e323b60daf8 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 2 Mar 2026 13:55:53 +0000 Subject: [PATCH 008/109] arm64: mm: Re-implement the __flush_tlb_range_op macro in C The __flush_tlb_range_op() macro is horrible and has been a previous source of bugs thanks to multiple expansions of its arguments (see commit f7edb07ad7c6 ("Fix mmu notifiers for range-based invalidates")). Rewrite the thing in C. Suggested-by: Linus Torvalds Co-developed-by: Will Deacon Signed-off-by: Will Deacon Reviewed-by: Jonathan Cameron Signed-off-by: Ryan Roberts Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 84 +++++++++++++++++-------------- 1 file changed, 46 insertions(+), 38 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 0f81547470ea..3c05afdbe3a6 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -429,12 +429,13 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) /* * __flush_tlb_range_op - Perform TLBI operation upon a range * - * @op: TLBI instruction that operates on a range (has 'r' prefix) + * @lop: TLBI level operation to perform + * @rop: TLBI range operation to perform * @start: The start address of the range * @pages: Range as the number of pages from 'start' * @stride: Flush granularity * @asid: The ASID of the task (0 for IPA instructions) - * @tlb_level: Translation Table level hint, if known + * @level: Translation Table level hint, if known * @lpa2: If 'true', the lpa2 scheme is used as set out below * * When the CPU does not support TLB range operations, flush the TLB @@ -501,36 +502,44 @@ static __always_inline void __tlbi_range(tlbi_op op, u64 addr, op(arg); } -#define __flush_tlb_range_op(op, start, pages, stride, \ - asid, tlb_level, lpa2) \ -do { \ - typeof(start) __flush_start = start; \ - typeof(pages) __flush_pages = pages; \ - int num = 0; \ - int scale = 3; \ - \ - while (__flush_pages > 0) { \ - if (!system_supports_tlb_range() || \ - __flush_pages == 1 || \ - (lpa2 && __flush_start != ALIGN(__flush_start, SZ_64K))) { \ - __tlbi_level_asid(op, __flush_start, tlb_level, asid); \ - __flush_start += stride; \ - __flush_pages -= stride >> PAGE_SHIFT; \ - continue; \ - } \ - \ - num = __TLBI_RANGE_NUM(__flush_pages, scale); \ - if (num >= 0) { \ - __tlbi_range(r##op, __flush_start, asid, scale, num, tlb_level, lpa2); \ - __flush_start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; \ - __flush_pages -= __TLBI_RANGE_PAGES(num, scale);\ - } \ - scale--; \ - } \ -} while (0) +static __always_inline void __flush_tlb_range_op(tlbi_op lop, tlbi_op rop, + u64 start, size_t pages, + u64 stride, u16 asid, + u32 level, bool lpa2) +{ + u64 addr = start, end = start + pages * PAGE_SIZE; + int scale = 3; + + while (addr != end) { + int num; + + pages = (end - addr) >> PAGE_SHIFT; + + if (!system_supports_tlb_range() || pages == 1) + goto invalidate_one; + + if (lpa2 && !IS_ALIGNED(addr, SZ_64K)) + goto invalidate_one; + + num = __TLBI_RANGE_NUM(pages, scale); + if (num >= 0) { + __tlbi_range(rop, addr, asid, scale, num, level, lpa2); + addr += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; + } + + scale--; + continue; +invalidate_one: + __tlbi_level_asid(lop, addr, level, asid); + addr += stride; + } +} + +#define __flush_s1_tlb_range_op(op, start, pages, stride, asid, tlb_level) \ + __flush_tlb_range_op(op, r##op, start, pages, stride, asid, tlb_level, lpa2_is_enabled()) #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \ - __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, kvm_lpa2_is_enabled()); + __flush_tlb_range_op(op, r##op, start, pages, stride, 0, tlb_level, kvm_lpa2_is_enabled()) static inline bool __flush_tlb_range_limit_excess(unsigned long start, unsigned long end, unsigned long pages, unsigned long stride) @@ -569,11 +578,11 @@ static inline void __flush_tlb_range_nosync(struct mm_struct *mm, asid = ASID(mm); if (last_level) - __flush_tlb_range_op(vale1is, start, pages, stride, asid, - tlb_level, lpa2_is_enabled()); + __flush_s1_tlb_range_op(vale1is, start, pages, stride, + asid, tlb_level); else - __flush_tlb_range_op(vae1is, start, pages, stride, asid, - tlb_level, lpa2_is_enabled()); + __flush_s1_tlb_range_op(vae1is, start, pages, stride, + asid, tlb_level); mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } @@ -597,8 +606,7 @@ static inline void local_flush_tlb_contpte(struct vm_area_struct *vma, dsb(nshst); asid = ASID(vma->vm_mm); - __flush_tlb_range_op(vale1, addr, CONT_PTES, PAGE_SIZE, asid, - 3, lpa2_is_enabled()); + __flush_s1_tlb_range_op(vale1, addr, CONT_PTES, PAGE_SIZE, asid, 3); mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, addr, addr + CONT_PTE_SIZE); dsb(nsh); @@ -631,8 +639,8 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end } dsb(ishst); - __flush_tlb_range_op(vaale1is, start, pages, stride, 0, - TLBI_TTL_UNKNOWN, lpa2_is_enabled()); + __flush_s1_tlb_range_op(vaale1is, start, pages, stride, 0, + TLBI_TTL_UNKNOWN); __tlbi_sync_s1ish(); isb(); } From 057bbd8e06102100023b980dfdd26f8a595785cd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 2 Mar 2026 13:55:54 +0000 Subject: [PATCH 009/109] arm64: mm: Simplify __TLBI_RANGE_NUM() macro Since commit e2768b798a19 ("arm64/mm: Modify range-based tlbi to decrement scale"), we don't need to clamp the 'pages' argument to fit the range for the specified 'scale' as we know that the upper bits will have been processed in a prior iteration. Drop the clamping and simplify the __TLBI_RANGE_NUM() macro. Signed-off-by: Will Deacon Reviewed-by: Ryan Roberts Reviewed-by: Dev Jain Reviewed-by: Jonathan Cameron Signed-off-by: Ryan Roberts Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 3c05afdbe3a6..fb7e541cfdfd 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -199,11 +199,7 @@ static inline void __tlbi_level(tlbi_op op, u64 addr, u32 level) * range. */ #define __TLBI_RANGE_NUM(pages, scale) \ - ({ \ - int __pages = min((pages), \ - __TLBI_RANGE_PAGES(31, (scale))); \ - (__pages >> (5 * (scale) + 1)) - 1; \ - }) + (((pages) >> (5 * (scale) + 1)) - 1) #define __repeat_tlbi_sync(op, arg...) \ do { \ From c753d667d9596866e0b9608f52c91b5a540786cb Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 2 Mar 2026 13:55:55 +0000 Subject: [PATCH 010/109] arm64: mm: Simplify __flush_tlb_range_limit_excess() __flush_tlb_range_limit_excess() is unnecessarily complicated: - It takes a 'start', 'end' and 'pages' argument, whereas it only needs 'pages' (which the caller has computed from the other two arguments!). - It erroneously compares 'pages' with MAX_TLBI_RANGE_PAGES when the system doesn't support range-based invalidation but the range to be invalidated would result in fewer than MAX_DVM_OPS invalidations. Simplify the function so that it no longer takes the 'start' and 'end' arguments and only considers the MAX_TLBI_RANGE_PAGES threshold on systems that implement range-based invalidation. Signed-off-by: Will Deacon Reviewed-by: Jonathan Cameron Signed-off-by: Ryan Roberts Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index fb7e541cfdfd..fd86647db24b 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -537,21 +537,19 @@ invalidate_one: #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \ __flush_tlb_range_op(op, r##op, start, pages, stride, 0, tlb_level, kvm_lpa2_is_enabled()) -static inline bool __flush_tlb_range_limit_excess(unsigned long start, - unsigned long end, unsigned long pages, unsigned long stride) +static inline bool __flush_tlb_range_limit_excess(unsigned long pages, + unsigned long stride) { /* - * When the system does not support TLB range based flush - * operation, (MAX_DVM_OPS - 1) pages can be handled. But - * with TLB range based operation, MAX_TLBI_RANGE_PAGES - * pages can be handled. + * Assume that the worst case number of DVM ops required to flush a + * given range on a system that supports tlb-range is 20 (4 scales, 1 + * final page, 15 for alignment on LPA2 systems), which is much smaller + * than MAX_DVM_OPS. */ - if ((!system_supports_tlb_range() && - (end - start) >= (MAX_DVM_OPS * stride)) || - pages > MAX_TLBI_RANGE_PAGES) - return true; + if (system_supports_tlb_range()) + return pages > MAX_TLBI_RANGE_PAGES; - return false; + return pages >= (MAX_DVM_OPS * stride) >> PAGE_SHIFT; } static inline void __flush_tlb_range_nosync(struct mm_struct *mm, @@ -565,7 +563,7 @@ static inline void __flush_tlb_range_nosync(struct mm_struct *mm, end = round_up(end, stride); pages = (end - start) >> PAGE_SHIFT; - if (__flush_tlb_range_limit_excess(start, end, pages, stride)) { + if (__flush_tlb_range_limit_excess(pages, stride)) { flush_tlb_mm(mm); return; } @@ -629,7 +627,7 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end end = round_up(end, stride); pages = (end - start) >> PAGE_SHIFT; - if (__flush_tlb_range_limit_excess(start, end, pages, stride)) { + if (__flush_tlb_range_limit_excess(pages, stride)) { flush_tlb_all(); return; } From 64212d689306df27fb12363df2aab030cf767a4a Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 2 Mar 2026 13:55:56 +0000 Subject: [PATCH 011/109] arm64: mm: Refactor flush_tlb_page() to use __tlbi_level_asid() Now that we have __tlbi_level_asid(), let's refactor the *flush_tlb_page*() variants to use it rather than open coding. The emitted tlbi(s) is/are intended to be exactly the same as before; no TTL hint is provided. Although the spec for flush_tlb_page() allows for setting the TTL hint to 3, it turns out that flush_tlb_fix_spurious_fault_pmd() depends on local_flush_tlb_page_nonotify() to invalidate the level 2 entry. This will be fixed separately. Reviewed-by: Linu Cherian Reviewed-by: Jonathan Cameron Signed-off-by: Ryan Roberts Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index fd86647db24b..0a49a25a4fdc 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -346,12 +346,8 @@ static inline void flush_tlb_mm(struct mm_struct *mm) static inline void __local_flush_tlb_page_nonotify_nosync(struct mm_struct *mm, unsigned long uaddr) { - unsigned long addr; - dsb(nshst); - addr = __TLBI_VADDR(uaddr, ASID(mm)); - __tlbi(vale1, addr); - __tlbi_user(vale1, addr); + __tlbi_level_asid(vale1, uaddr, TLBI_TTL_UNKNOWN, ASID(mm)); } static inline void local_flush_tlb_page_nonotify(struct vm_area_struct *vma, @@ -373,12 +369,8 @@ static inline void local_flush_tlb_page(struct vm_area_struct *vma, static inline void __flush_tlb_page_nosync(struct mm_struct *mm, unsigned long uaddr) { - unsigned long addr; - dsb(ishst); - addr = __TLBI_VADDR(uaddr, ASID(mm)); - __tlbi(vale1is, addr); - __tlbi_user(vale1is, addr); + __tlbi_level_asid(vale1is, uaddr, TLBI_TTL_UNKNOWN, ASID(mm)); mmu_notifier_arch_invalidate_secondary_tlbs(mm, uaddr & PAGE_MASK, (uaddr & PAGE_MASK) + PAGE_SIZE); } From 11f6dd8dd2838703fe3d149118b8c785150cc92f Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 2 Mar 2026 13:55:57 +0000 Subject: [PATCH 012/109] arm64: mm: Refactor __flush_tlb_range() to take flags We have function variants with "_nosync", "_local", "_nonotify" as well as the "last_level" parameter. Let's generalize and simplify by using a flags parameter to encode all these variants. As a first step, convert the "last_level" boolean parameter to a flags parameter and create the first flag, TLBF_NOWALKCACHE. When present, walk cache entries are not evicted, which is the same as the old last_level=true. Reviewed-by: Linu Cherian Reviewed-by: Jonathan Cameron Signed-off-by: Ryan Roberts Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/hugetlb.h | 12 ++++++------ arch/arm64/include/asm/pgtable.h | 4 ++-- arch/arm64/include/asm/tlb.h | 6 +++--- arch/arm64/include/asm/tlbflush.h | 32 +++++++++++++++++++------------ arch/arm64/mm/contpte.c | 5 +++-- arch/arm64/mm/hugetlbpage.c | 4 ++-- arch/arm64/mm/mmu.c | 2 +- 7 files changed, 37 insertions(+), 28 deletions(-) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index e6f8ff3cc630..d038ff14d16c 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -71,23 +71,23 @@ static inline void __flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long stride, - bool last_level) + tlbf_t flags) { switch (stride) { #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: - __flush_tlb_range(vma, start, end, PUD_SIZE, last_level, 1); + __flush_tlb_range(vma, start, end, PUD_SIZE, 1, flags); break; #endif case CONT_PMD_SIZE: case PMD_SIZE: - __flush_tlb_range(vma, start, end, PMD_SIZE, last_level, 2); + __flush_tlb_range(vma, start, end, PMD_SIZE, 2, flags); break; case CONT_PTE_SIZE: - __flush_tlb_range(vma, start, end, PAGE_SIZE, last_level, 3); + __flush_tlb_range(vma, start, end, PAGE_SIZE, 3, flags); break; default: - __flush_tlb_range(vma, start, end, PAGE_SIZE, last_level, TLBI_TTL_UNKNOWN); + __flush_tlb_range(vma, start, end, PAGE_SIZE, TLBI_TTL_UNKNOWN, flags); } } @@ -98,7 +98,7 @@ static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, { unsigned long stride = huge_page_size(hstate_vma(vma)); - __flush_hugetlb_tlb_range(vma, start, end, stride, false); + __flush_hugetlb_tlb_range(vma, start, end, stride, TLBF_NONE); } #endif /* __ASM_HUGETLB_H */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index b3e58735c49b..88bb9275ac89 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -89,9 +89,9 @@ static inline void arch_leave_lazy_mmu_mode(void) /* Set stride and tlb_level in flush_*_tlb_range */ #define flush_pmd_tlb_range(vma, addr, end) \ - __flush_tlb_range(vma, addr, end, PMD_SIZE, false, 2) + __flush_tlb_range(vma, addr, end, PMD_SIZE, 2, TLBF_NONE) #define flush_pud_tlb_range(vma, addr, end) \ - __flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1) + __flush_tlb_range(vma, addr, end, PUD_SIZE, 1, TLBF_NONE) #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index 8d762607285c..10869d7731b8 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -53,7 +53,7 @@ static inline int tlb_get_level(struct mmu_gather *tlb) static inline void tlb_flush(struct mmu_gather *tlb) { struct vm_area_struct vma = TLB_FLUSH_VMA(tlb->mm, 0); - bool last_level = !tlb->freed_tables; + tlbf_t flags = tlb->freed_tables ? TLBF_NONE : TLBF_NOWALKCACHE; unsigned long stride = tlb_get_unmap_size(tlb); int tlb_level = tlb_get_level(tlb); @@ -63,13 +63,13 @@ static inline void tlb_flush(struct mmu_gather *tlb) * reallocate our ASID without invalidating the entire TLB. */ if (tlb->fullmm) { - if (!last_level) + if (tlb->freed_tables) flush_tlb_mm(tlb->mm); return; } __flush_tlb_range(&vma, tlb->start, tlb->end, stride, - last_level, tlb_level); + tlb_level, flags); } static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 0a49a25a4fdc..d134824ea5da 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -286,16 +286,16 @@ static inline void __tlbi_sync_s1ish_hyp(void) * CPUs, ensuring that any walk-cache entries associated with the * translation are also invalidated. * - * __flush_tlb_range(vma, start, end, stride, last_level, tlb_level) + * __flush_tlb_range(vma, start, end, stride, tlb_level, flags) * Invalidate the virtual-address range '[start, end)' on all * CPUs for the user address space corresponding to 'vma->mm'. * The invalidation operations are issued at a granularity - * determined by 'stride' and only affect any walk-cache entries - * if 'last_level' is equal to false. tlb_level is the level at + * determined by 'stride'. tlb_level is the level at * which the invalidation must take place. If the level is wrong, * no invalidation may take place. In the case where the level * cannot be easily determined, the value TLBI_TTL_UNKNOWN will - * perform a non-hinted invalidation. + * perform a non-hinted invalidation. flags may be TLBF_NONE (0) or + * TLBF_NOWALKCACHE (elide eviction of walk cache entries). * * local_flush_tlb_page(vma, addr) * Local variant of flush_tlb_page(). Stale TLB entries may @@ -544,10 +544,18 @@ static inline bool __flush_tlb_range_limit_excess(unsigned long pages, return pages >= (MAX_DVM_OPS * stride) >> PAGE_SHIFT; } +typedef unsigned __bitwise tlbf_t; + +/* No special behaviour. */ +#define TLBF_NONE ((__force tlbf_t)0) + +/* Invalidate tlb entries only, leaving the page table walk cache intact. */ +#define TLBF_NOWALKCACHE ((__force tlbf_t)BIT(0)) + static inline void __flush_tlb_range_nosync(struct mm_struct *mm, unsigned long start, unsigned long end, - unsigned long stride, bool last_level, - int tlb_level) + unsigned long stride, int tlb_level, + tlbf_t flags) { unsigned long asid, pages; @@ -563,7 +571,7 @@ static inline void __flush_tlb_range_nosync(struct mm_struct *mm, dsb(ishst); asid = ASID(mm); - if (last_level) + if (flags & TLBF_NOWALKCACHE) __flush_s1_tlb_range_op(vale1is, start, pages, stride, asid, tlb_level); else @@ -575,11 +583,11 @@ static inline void __flush_tlb_range_nosync(struct mm_struct *mm, static inline void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long stride, bool last_level, - int tlb_level) + unsigned long stride, int tlb_level, + tlbf_t flags) { __flush_tlb_range_nosync(vma->vm_mm, start, end, stride, - last_level, tlb_level); + tlb_level, flags); __tlbi_sync_s1ish(); } @@ -607,7 +615,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, * Set the tlb_level to TLBI_TTL_UNKNOWN because we can not get enough * information here. */ - __flush_tlb_range(vma, start, end, PAGE_SIZE, false, TLBI_TTL_UNKNOWN); + __flush_tlb_range(vma, start, end, PAGE_SIZE, TLBI_TTL_UNKNOWN, TLBF_NONE); } static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) @@ -648,7 +656,7 @@ static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr) static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, struct mm_struct *mm, unsigned long start, unsigned long end) { - __flush_tlb_range_nosync(mm, start, end, PAGE_SIZE, true, 3); + __flush_tlb_range_nosync(mm, start, end, PAGE_SIZE, 3, TLBF_NOWALKCACHE); } static inline bool __pte_flags_need_flush(ptdesc_t oldval, ptdesc_t newval) diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 1519d090d5ea..5f2a7b43ca32 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -225,7 +225,8 @@ static void contpte_convert(struct mm_struct *mm, unsigned long addr, */ if (!system_supports_bbml2_noabort()) - __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3); + __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, 3, + TLBF_NOWALKCACHE); __set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES); } @@ -552,7 +553,7 @@ int contpte_clear_flush_young_ptes(struct vm_area_struct *vma, * eliding the trailing DSB applies here. */ __flush_tlb_range_nosync(vma->vm_mm, addr, end, - PAGE_SIZE, true, 3); + PAGE_SIZE, 3, TLBF_NOWALKCACHE); } return young; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index a42c05cf5640..0b7ccd0cbb9e 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -181,7 +181,7 @@ static pte_t get_clear_contig_flush(struct mm_struct *mm, struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); unsigned long end = addr + (pgsize * ncontig); - __flush_hugetlb_tlb_range(&vma, addr, end, pgsize, true); + __flush_hugetlb_tlb_range(&vma, addr, end, pgsize, TLBF_NOWALKCACHE); return orig_pte; } @@ -209,7 +209,7 @@ static void clear_flush(struct mm_struct *mm, if (mm == &init_mm) flush_tlb_kernel_range(saddr, addr); else - __flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, true); + __flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, TLBF_NOWALKCACHE); } void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a6a00accf4f9..054df431846f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -2149,7 +2149,7 @@ pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr, */ if (pte_accessible(vma->vm_mm, pte) && pte_user_exec(pte)) __flush_tlb_range(vma, addr, nr * PAGE_SIZE, - PAGE_SIZE, true, 3); + PAGE_SIZE, 3, TLBF_NOWALKCACHE); } return pte; From 0477fc56960d91ac98412534188741a09e5d8a18 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 2 Mar 2026 13:55:58 +0000 Subject: [PATCH 013/109] arm64: mm: More flags for __flush_tlb_range() Refactor function variants with "_nosync", "_local" and "_nonotify" into a single __always_inline implementation that takes flags and rely on constant folding to select the parts that are actually needed at any given callsite, based on the provided flags. Flags all live in the tlbf_t (TLB flags) type; TLBF_NONE (0) continues to provide the strongest semantics (i.e. evict from walk cache, broadcast, synchronise and notify). Each flag reduces the strength in some way; TLBF_NONOTIFY, TLBF_NOSYNC and TLBF_NOBROADCAST are added to complement the existing TLBF_NOWALKCACHE. There are no users that require TLBF_NOBROADCAST without TLBF_NOWALKCACHE so implement that as BUILD_BUG() to avoid needing to introduce dead code for vae1 invalidations. The result is a clearer, simpler, more powerful API. Signed-off-by: Ryan Roberts Reviewed-by: Jonathan Cameron Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 95 ++++++++++++++++++------------- arch/arm64/mm/contpte.c | 9 ++- 2 files changed, 62 insertions(+), 42 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index d134824ea5da..5509927e45b9 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -295,7 +295,10 @@ static inline void __tlbi_sync_s1ish_hyp(void) * no invalidation may take place. In the case where the level * cannot be easily determined, the value TLBI_TTL_UNKNOWN will * perform a non-hinted invalidation. flags may be TLBF_NONE (0) or - * TLBF_NOWALKCACHE (elide eviction of walk cache entries). + * any combination of TLBF_NOWALKCACHE (elide eviction of walk + * cache entries), TLBF_NONOTIFY (don't call mmu notifiers), + * TLBF_NOSYNC (don't issue trailing dsb) and TLBF_NOBROADCAST + * (only perform the invalidation for the local cpu). * * local_flush_tlb_page(vma, addr) * Local variant of flush_tlb_page(). Stale TLB entries may @@ -305,12 +308,6 @@ static inline void __tlbi_sync_s1ish_hyp(void) * Same as local_flush_tlb_page() except MMU notifier will not be * called. * - * local_flush_tlb_contpte(vma, addr) - * Invalidate the virtual-address range - * '[addr, addr+CONT_PTE_SIZE)' mapped with contpte on local CPU - * for the user address space corresponding to 'vma->mm'. Stale - * TLB entries may remain in remote CPUs. - * * Finally, take a look at asm/tlb.h to see how tlb_flush() is implemented * on top of these routines, since that is our interface to the mmu_gather * API as used by munmap() and friends. @@ -552,15 +549,23 @@ typedef unsigned __bitwise tlbf_t; /* Invalidate tlb entries only, leaving the page table walk cache intact. */ #define TLBF_NOWALKCACHE ((__force tlbf_t)BIT(0)) -static inline void __flush_tlb_range_nosync(struct mm_struct *mm, - unsigned long start, unsigned long end, - unsigned long stride, int tlb_level, - tlbf_t flags) +/* Skip the trailing dsb after issuing tlbi. */ +#define TLBF_NOSYNC ((__force tlbf_t)BIT(1)) + +/* Suppress tlb notifier callbacks for this flush operation. */ +#define TLBF_NONOTIFY ((__force tlbf_t)BIT(2)) + +/* Perform the tlbi locally without broadcasting to other CPUs. */ +#define TLBF_NOBROADCAST ((__force tlbf_t)BIT(3)) + +static __always_inline void __do_flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long stride, int tlb_level, + tlbf_t flags) { + struct mm_struct *mm = vma->vm_mm; unsigned long asid, pages; - start = round_down(start, stride); - end = round_up(end, stride); pages = (end - start) >> PAGE_SHIFT; if (__flush_tlb_range_limit_excess(pages, stride)) { @@ -568,17 +573,41 @@ static inline void __flush_tlb_range_nosync(struct mm_struct *mm, return; } - dsb(ishst); + if (!(flags & TLBF_NOBROADCAST)) + dsb(ishst); + else + dsb(nshst); + asid = ASID(mm); - if (flags & TLBF_NOWALKCACHE) - __flush_s1_tlb_range_op(vale1is, start, pages, stride, - asid, tlb_level); - else + switch (flags & (TLBF_NOWALKCACHE | TLBF_NOBROADCAST)) { + case TLBF_NONE: __flush_s1_tlb_range_op(vae1is, start, pages, stride, - asid, tlb_level); + asid, tlb_level); + break; + case TLBF_NOWALKCACHE: + __flush_s1_tlb_range_op(vale1is, start, pages, stride, + asid, tlb_level); + break; + case TLBF_NOBROADCAST: + /* Combination unused */ + BUG(); + break; + case TLBF_NOWALKCACHE | TLBF_NOBROADCAST: + __flush_s1_tlb_range_op(vale1, start, pages, stride, + asid, tlb_level); + break; + } - mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); + if (!(flags & TLBF_NONOTIFY)) + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); + + if (!(flags & TLBF_NOSYNC)) { + if (!(flags & TLBF_NOBROADCAST)) + __tlbi_sync_s1ish(); + else + dsb(nsh); + } } static inline void __flush_tlb_range(struct vm_area_struct *vma, @@ -586,24 +615,9 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, unsigned long stride, int tlb_level, tlbf_t flags) { - __flush_tlb_range_nosync(vma->vm_mm, start, end, stride, - tlb_level, flags); - __tlbi_sync_s1ish(); -} - -static inline void local_flush_tlb_contpte(struct vm_area_struct *vma, - unsigned long addr) -{ - unsigned long asid; - - addr = round_down(addr, CONT_PTE_SIZE); - - dsb(nshst); - asid = ASID(vma->vm_mm); - __flush_s1_tlb_range_op(vale1, addr, CONT_PTES, PAGE_SIZE, asid, 3); - mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, addr, - addr + CONT_PTE_SIZE); - dsb(nsh); + start = round_down(start, stride); + end = round_up(end, stride); + __do_flush_tlb_range(vma, start, end, stride, tlb_level, flags); } static inline void flush_tlb_range(struct vm_area_struct *vma, @@ -656,7 +670,10 @@ static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr) static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, struct mm_struct *mm, unsigned long start, unsigned long end) { - __flush_tlb_range_nosync(mm, start, end, PAGE_SIZE, 3, TLBF_NOWALKCACHE); + struct vm_area_struct vma = { .vm_mm = mm, .vm_flags = 0 }; + + __flush_tlb_range(&vma, start, end, PAGE_SIZE, 3, + TLBF_NOWALKCACHE | TLBF_NOSYNC); } static inline bool __pte_flags_need_flush(ptdesc_t oldval, ptdesc_t newval) diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 5f2a7b43ca32..3970392c4326 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -552,8 +552,8 @@ int contpte_clear_flush_young_ptes(struct vm_area_struct *vma, * See comment in __ptep_clear_flush_young(); same rationale for * eliding the trailing DSB applies here. */ - __flush_tlb_range_nosync(vma->vm_mm, addr, end, - PAGE_SIZE, 3, TLBF_NOWALKCACHE); + __flush_tlb_range(vma, addr, end, PAGE_SIZE, 3, + TLBF_NOWALKCACHE | TLBF_NOSYNC); } return young; @@ -686,7 +686,10 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma, __ptep_set_access_flags(vma, addr, ptep, entry, 0); if (dirty) - local_flush_tlb_contpte(vma, start_addr); + __flush_tlb_range(vma, start_addr, + start_addr + CONT_PTE_SIZE, + PAGE_SIZE, 3, + TLBF_NOWALKCACHE | TLBF_NOBROADCAST); } else { __contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte); __ptep_set_access_flags(vma, addr, ptep, entry, dirty); From 15397e3c3850d447b7167614cbb24a98192c3693 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 2 Mar 2026 13:55:59 +0000 Subject: [PATCH 014/109] arm64: mm: Wrap flush_tlb_page() around __do_flush_tlb_range() Flushing a page from the tlb is just a special case of flushing a range. So let's rework flush_tlb_page() so that it simply wraps __do_flush_tlb_range(). While at it, let's also update the API to take the same flags that we use when flushing a range. This allows us to delete all the ugly "_nosync", "_local" and "_nonotify" variants. Thanks to constant folding, all of the complex looping and tlbi-by-range options get eliminated so that the generated code for flush_tlb_page() looks very similar to the previous version. Reviewed-by: Linu Cherian Signed-off-by: Ryan Roberts Reviewed-by: Jonathan Cameron Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 6 +-- arch/arm64/include/asm/tlbflush.h | 81 ++++++++++--------------------- arch/arm64/mm/fault.c | 2 +- 3 files changed, 29 insertions(+), 60 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 88bb9275ac89..7039931df462 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -101,10 +101,10 @@ static inline void arch_leave_lazy_mmu_mode(void) * entries exist. */ #define flush_tlb_fix_spurious_fault(vma, address, ptep) \ - local_flush_tlb_page_nonotify(vma, address) + __flush_tlb_page(vma, address, TLBF_NOBROADCAST | TLBF_NONOTIFY) #define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) \ - local_flush_tlb_page_nonotify(vma, address) + __flush_tlb_page(vma, address, TLBF_NOBROADCAST | TLBF_NONOTIFY) /* * ZERO_PAGE is a global shared page that is always zero: used @@ -1320,7 +1320,7 @@ static inline int __ptep_clear_flush_young(struct vm_area_struct *vma, * context-switch, which provides a DSB to complete the TLB * invalidation. */ - flush_tlb_page_nosync(vma, address); + __flush_tlb_page(vma, address, TLBF_NOSYNC); } return young; diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 5509927e45b9..5096ec7ab865 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -269,10 +269,7 @@ static inline void __tlbi_sync_s1ish_hyp(void) * unmapping pages from vmalloc/io space. * * flush_tlb_page(vma, addr) - * Invalidate a single user mapping for address 'addr' in the - * address space corresponding to 'vma->mm'. Note that this - * operation only invalidates a single, last-level page-table - * entry and therefore does not affect any walk-caches. + * Equivalent to __flush_tlb_page(..., flags=TLBF_NONE) * * * Next, we have some undocumented invalidation routines that you probably @@ -300,13 +297,14 @@ static inline void __tlbi_sync_s1ish_hyp(void) * TLBF_NOSYNC (don't issue trailing dsb) and TLBF_NOBROADCAST * (only perform the invalidation for the local cpu). * - * local_flush_tlb_page(vma, addr) - * Local variant of flush_tlb_page(). Stale TLB entries may - * remain in remote CPUs. - * - * local_flush_tlb_page_nonotify(vma, addr) - * Same as local_flush_tlb_page() except MMU notifier will not be - * called. + * __flush_tlb_page(vma, addr, flags) + * Invalidate a single user mapping for address 'addr' in the + * address space corresponding to 'vma->mm'. Note that this + * operation only invalidates a single, last-level page-table entry + * and therefore does not affect any walk-caches. flags may contain + * any combination of TLBF_NONOTIFY (don't call mmu notifiers), + * TLBF_NOSYNC (don't issue trailing dsb) and TLBF_NOBROADCAST + * (only perform the invalidation for the local cpu). * * Finally, take a look at asm/tlb.h to see how tlb_flush() is implemented * on top of these routines, since that is our interface to the mmu_gather @@ -340,51 +338,6 @@ static inline void flush_tlb_mm(struct mm_struct *mm) mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } -static inline void __local_flush_tlb_page_nonotify_nosync(struct mm_struct *mm, - unsigned long uaddr) -{ - dsb(nshst); - __tlbi_level_asid(vale1, uaddr, TLBI_TTL_UNKNOWN, ASID(mm)); -} - -static inline void local_flush_tlb_page_nonotify(struct vm_area_struct *vma, - unsigned long uaddr) -{ - __local_flush_tlb_page_nonotify_nosync(vma->vm_mm, uaddr); - dsb(nsh); -} - -static inline void local_flush_tlb_page(struct vm_area_struct *vma, - unsigned long uaddr) -{ - __local_flush_tlb_page_nonotify_nosync(vma->vm_mm, uaddr); - mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, uaddr & PAGE_MASK, - (uaddr & PAGE_MASK) + PAGE_SIZE); - dsb(nsh); -} - -static inline void __flush_tlb_page_nosync(struct mm_struct *mm, - unsigned long uaddr) -{ - dsb(ishst); - __tlbi_level_asid(vale1is, uaddr, TLBI_TTL_UNKNOWN, ASID(mm)); - mmu_notifier_arch_invalidate_secondary_tlbs(mm, uaddr & PAGE_MASK, - (uaddr & PAGE_MASK) + PAGE_SIZE); -} - -static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, - unsigned long uaddr) -{ - return __flush_tlb_page_nosync(vma->vm_mm, uaddr); -} - -static inline void flush_tlb_page(struct vm_area_struct *vma, - unsigned long uaddr) -{ - flush_tlb_page_nosync(vma, uaddr); - __tlbi_sync_s1ish(); -} - static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) { return true; @@ -632,6 +585,22 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, __flush_tlb_range(vma, start, end, PAGE_SIZE, TLBI_TTL_UNKNOWN, TLBF_NONE); } +static inline void __flush_tlb_page(struct vm_area_struct *vma, + unsigned long uaddr, tlbf_t flags) +{ + unsigned long start = round_down(uaddr, PAGE_SIZE); + unsigned long end = start + PAGE_SIZE; + + __do_flush_tlb_range(vma, start, end, PAGE_SIZE, TLBI_TTL_UNKNOWN, + TLBF_NOWALKCACHE | flags); +} + +static inline void flush_tlb_page(struct vm_area_struct *vma, + unsigned long uaddr) +{ + __flush_tlb_page(vma, uaddr, TLBF_NONE); +} + static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) { const unsigned long stride = PAGE_SIZE; diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index be9dab2c7d6a..f91aa686f142 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -239,7 +239,7 @@ int __ptep_set_access_flags(struct vm_area_struct *vma, * flush_tlb_fix_spurious_fault(). */ if (dirty) - local_flush_tlb_page(vma, address); + __flush_tlb_page(vma, address, TLBF_NOBROADCAST); return 1; } From 752a0d1d483e9479f5c59519256fd190139d0b39 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 2 Mar 2026 13:56:00 +0000 Subject: [PATCH 015/109] arm64: mm: Provide level hint for flush_tlb_page() Previously tlb invalidations issued by __flush_tlb_page() did not contain a level hint. From the core API documentation, this function is clearly only ever intended to target level 3 (PTE) tlb entries: | 4) ``void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)`` | | This time we need to remove the PAGE_SIZE sized translation | from the TLB. However, the arm64 documentation is more relaxed allowing any last level: | this operation only invalidates a single, last-level page-table | entry and therefore does not affect any walk-caches It turns out that the function was actually being used to invalidate a level 2 mapping via flush_tlb_fix_spurious_fault_pmd(). The bug was benign because the level hint was not set so the HW would still invalidate the PMD mapping, and also because the TLBF_NONOTIFY flag was set, the bounds of the mapping were never used for anything else. Now that we have the new and improved range-invalidation API, it is trival to fix flush_tlb_fix_spurious_fault_pmd() to explicitly flush the whole range (locally, without notification and last level only). So let's do that, and then update __flush_tlb_page() to hint level 3. Reviewed-by: Linu Cherian Signed-off-by: Ryan Roberts [catalin.marinas@arm.com: use "level 3" in the __flush_tlb_page() description] [catalin.marinas@arm.com: tweak the commit message to include the core API text] Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 5 +++-- arch/arm64/include/asm/tlbflush.h | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 7039931df462..b1a96a8f2b17 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -103,8 +103,9 @@ static inline void arch_leave_lazy_mmu_mode(void) #define flush_tlb_fix_spurious_fault(vma, address, ptep) \ __flush_tlb_page(vma, address, TLBF_NOBROADCAST | TLBF_NONOTIFY) -#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) \ - __flush_tlb_page(vma, address, TLBF_NOBROADCAST | TLBF_NONOTIFY) +#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) \ + __flush_tlb_range(vma, address, address + PMD_SIZE, PMD_SIZE, 2, \ + TLBF_NOBROADCAST | TLBF_NONOTIFY | TLBF_NOWALKCACHE) /* * ZERO_PAGE is a global shared page that is always zero: used diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 5096ec7ab865..47fa4d39a461 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -300,7 +300,7 @@ static inline void __tlbi_sync_s1ish_hyp(void) * __flush_tlb_page(vma, addr, flags) * Invalidate a single user mapping for address 'addr' in the * address space corresponding to 'vma->mm'. Note that this - * operation only invalidates a single, last-level page-table entry + * operation only invalidates a single level 3 page-table entry * and therefore does not affect any walk-caches. flags may contain * any combination of TLBF_NONOTIFY (don't call mmu notifiers), * TLBF_NOSYNC (don't issue trailing dsb) and TLBF_NOBROADCAST @@ -591,7 +591,7 @@ static inline void __flush_tlb_page(struct vm_area_struct *vma, unsigned long start = round_down(uaddr, PAGE_SIZE); unsigned long end = start + PAGE_SIZE; - __do_flush_tlb_range(vma, start, end, PAGE_SIZE, TLBI_TTL_UNKNOWN, + __do_flush_tlb_range(vma, start, end, PAGE_SIZE, 3, TLBF_NOWALKCACHE | flags); } From 2615924e45a79dacc5eb27afea45a364a6b6d8bc Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 25 Feb 2026 06:40:28 +0000 Subject: [PATCH 016/109] arm64/mm: Describe TTBR1_BADDR_4852_OFFSET TTBR1_BADDR_4852_OFFSET is a constant offset which gets added into kernel page table physical address for TTBR1_EL1 when kernel is build for 52 bit VA but found to be running on 48 bit VA capable system. Although there is no explanation on how the macro is computed. Describe TTBR1_BADDR_4852_OFFSET computation in detail via deriving from all required parameters involved thus improving clarity and readability. Cc: Will Deacon Cc: Mark Rutland Cc: Ryan Roberts Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-hwdef.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index d49180bb7cb3..e80712379dd6 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -287,9 +287,12 @@ #endif #ifdef CONFIG_ARM64_VA_BITS_52 +#define PTRS_PER_PGD_52_VA (UL(1) << (52 - PGDIR_SHIFT)) +#define PTRS_PER_PGD_48_VA (UL(1) << (48 - PGDIR_SHIFT)) +#define PTRS_PER_PGD_EXTRA (PTRS_PER_PGD_52_VA - PTRS_PER_PGD_48_VA) + /* Must be at least 64-byte aligned to prevent corruption of the TTBR */ -#define TTBR1_BADDR_4852_OFFSET (((UL(1) << (52 - PGDIR_SHIFT)) - \ - (UL(1) << (48 - PGDIR_SHIFT))) * 8) +#define TTBR1_BADDR_4852_OFFSET (PTRS_PER_PGD_EXTRA << PTDESC_ORDER) #endif #endif From d989010bbecad295cc66737a30b05319f5e7507b Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 2 Mar 2026 06:44:36 +0000 Subject: [PATCH 017/109] arm64/mm: Directly use TTBRx_EL1_ASID_MASK Replace all TTBR_ASID_MASK macro instances with TTBRx_EL1_ASID_MASK which is a standard field mask from tools sysreg format. Drop the now redundant custom macro TTBR_ASID_MASK. No functional change. Cc: Will Deacon Cc: Marc Zyngier Cc: Oliver Upton Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: kvmarm@lists.linux.dev Signed-off-by: Anshuman Khandual Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/asm-uaccess.h | 2 +- arch/arm64/include/asm/mmu.h | 1 - arch/arm64/include/asm/mmu_context.h | 3 ++- arch/arm64/include/asm/uaccess.h | 6 +++--- arch/arm64/kernel/entry.S | 2 +- arch/arm64/mm/context.c | 6 +++--- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index 9148f5a31968..12aa6a283249 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -15,7 +15,7 @@ #ifdef CONFIG_ARM64_SW_TTBR0_PAN .macro __uaccess_ttbr0_disable, tmp1 mrs \tmp1, ttbr1_el1 // swapper_pg_dir - bic \tmp1, \tmp1, #TTBR_ASID_MASK + bic \tmp1, \tmp1, #TTBRx_EL1_ASID_MASK sub \tmp1, \tmp1, #RESERVED_SWAPPER_OFFSET // reserved_pg_dir msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 add \tmp1, \tmp1, #RESERVED_SWAPPER_OFFSET diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 137a173df1ff..019b36cda380 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -10,7 +10,6 @@ #define MMCF_AARCH32 0x1 /* mm context flag for AArch32 executables */ #define USER_ASID_BIT 48 #define USER_ASID_FLAG (UL(1) << USER_ASID_BIT) -#define TTBR_ASID_MASK (UL(0xffff) << 48) #ifndef __ASSEMBLER__ diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index cc80af59c69e..803b68758152 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -210,7 +210,8 @@ static inline void update_saved_ttbr0(struct task_struct *tsk, if (mm == &init_mm) ttbr = phys_to_ttbr(__pa_symbol(reserved_pg_dir)); else - ttbr = phys_to_ttbr(virt_to_phys(mm->pgd)) | ASID(mm) << 48; + ttbr = phys_to_ttbr(virt_to_phys(mm->pgd)) | + FIELD_PREP(TTBRx_EL1_ASID_MASK, ASID(mm)); WRITE_ONCE(task_thread_info(tsk)->ttbr0, ttbr); } diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 9810106a3f66..86dfc356ee6e 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -62,7 +62,7 @@ static inline void __uaccess_ttbr0_disable(void) local_irq_save(flags); ttbr = read_sysreg(ttbr1_el1); - ttbr &= ~TTBR_ASID_MASK; + ttbr &= ~TTBRx_EL1_ASID_MASK; /* reserved_pg_dir placed before swapper_pg_dir */ write_sysreg(ttbr - RESERVED_SWAPPER_OFFSET, ttbr0_el1); /* Set reserved ASID */ @@ -85,8 +85,8 @@ static inline void __uaccess_ttbr0_enable(void) /* Restore active ASID */ ttbr1 = read_sysreg(ttbr1_el1); - ttbr1 &= ~TTBR_ASID_MASK; /* safety measure */ - ttbr1 |= ttbr0 & TTBR_ASID_MASK; + ttbr1 &= ~TTBRx_EL1_ASID_MASK; /* safety measure */ + ttbr1 |= ttbr0 & TTBRx_EL1_ASID_MASK; write_sysreg(ttbr1, ttbr1_el1); /* Restore user page table */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index f8018b5c1f9a..9e1bcc821a16 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -473,7 +473,7 @@ alternative_else_nop_endif */ SYM_CODE_START_LOCAL(__swpan_entry_el1) mrs x21, ttbr0_el1 - tst x21, #TTBR_ASID_MASK // Check for the reserved ASID + tst x21, #TTBRx_EL1_ASID_MASK // Check for the reserved ASID orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR b.eq 1f // TTBR0 access already disabled and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index b2ac06246327..718c495832d0 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -358,11 +358,11 @@ void cpu_do_switch_mm(phys_addr_t pgd_phys, struct mm_struct *mm) /* SW PAN needs a copy of the ASID in TTBR0 for entry */ if (IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN)) - ttbr0 |= FIELD_PREP(TTBR_ASID_MASK, asid); + ttbr0 |= FIELD_PREP(TTBRx_EL1_ASID_MASK, asid); /* Set ASID in TTBR1 since TCR.A1 is set */ - ttbr1 &= ~TTBR_ASID_MASK; - ttbr1 |= FIELD_PREP(TTBR_ASID_MASK, asid); + ttbr1 &= ~TTBRx_EL1_ASID_MASK; + ttbr1 |= FIELD_PREP(TTBRx_EL1_ASID_MASK, asid); cpu_set_reserved_ttbr0_nosync(); write_sysreg(ttbr1, ttbr1_el1); From be6e9dee0e978ddf7b4b6b8b8dba37413ae1a393 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 2 Mar 2026 06:44:37 +0000 Subject: [PATCH 018/109] arm64/mm: Directly use TTBRx_EL1_CnP Replace all TTBR_CNP_BIT macro instances with TTBRx_EL1_CNP_BIT which is a standard field from tools sysreg format. Drop the now redundant custom macro TTBR_CNP_BIT. No functional change. Cc: Will Deacon Cc: Marc Zyngier Cc: Oliver Upton Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: kvmarm@lists.linux.dev Signed-off-by: Anshuman Khandual Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-hwdef.h | 2 -- arch/arm64/kernel/mte.c | 4 ++-- arch/arm64/kvm/hyp/nvhe/hyp-init.S | 4 ++-- arch/arm64/mm/context.c | 2 +- arch/arm64/mm/mmu.c | 2 +- 5 files changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index e80712379dd6..72f31800c703 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -223,8 +223,6 @@ */ #define S1_TABLE_AP (_AT(pmdval_t, 3) << 61) -#define TTBR_CNP_BIT (UL(1) << 0) - /* * TCR flags. */ diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 32148bf09c1d..eceead1686f2 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -315,8 +315,8 @@ void mte_cpu_setup(void) * CnP is not a boot feature so MTE gets enabled before CnP, but let's * make sure that is the case. */ - BUG_ON(read_sysreg(ttbr0_el1) & TTBR_CNP_BIT); - BUG_ON(read_sysreg(ttbr1_el1) & TTBR_CNP_BIT); + BUG_ON(read_sysreg(ttbr0_el1) & TTBRx_EL1_CnP); + BUG_ON(read_sysreg(ttbr1_el1) & TTBRx_EL1_CnP); /* Normal Tagged memory type at the corresponding MAIR index */ sysreg_clear_set(mair_el1, diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 0d42eedc7167..445eb0743af2 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -130,7 +130,7 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init) ldr x1, [x0, #NVHE_INIT_PGD_PA] phys_to_ttbr x2, x1 alternative_if ARM64_HAS_CNP - orr x2, x2, #TTBR_CNP_BIT + orr x2, x2, #TTBRx_EL1_CnP alternative_else_nop_endif msr ttbr0_el2, x2 @@ -291,7 +291,7 @@ SYM_TYPED_FUNC_START(__pkvm_init_switch_pgd) /* Install the new pgtables */ phys_to_ttbr x5, x0 alternative_if ARM64_HAS_CNP - orr x5, x5, #TTBR_CNP_BIT + orr x5, x5, #TTBRx_EL1_CnP alternative_else_nop_endif msr ttbr0_el2, x5 diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 718c495832d0..0f4a28b87469 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -354,7 +354,7 @@ void cpu_do_switch_mm(phys_addr_t pgd_phys, struct mm_struct *mm) /* Skip CNP for the reserved ASID */ if (system_supports_cnp() && asid) - ttbr0 |= TTBR_CNP_BIT; + ttbr0 |= TTBRx_EL1_CnP; /* SW PAN needs a copy of the ASID in TTBR0 for entry */ if (IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN)) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a6a00accf4f9..c22678769c37 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -2188,7 +2188,7 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp)); if (cnp) - ttbr1 |= TTBR_CNP_BIT; + ttbr1 |= TTBRx_EL1_CnP; replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1); From 74cd4e0e5399480e3fab2cd6a6cbdb17f673c335 Mon Sep 17 00:00:00 2001 From: Yifan Wu Date: Thu, 5 Mar 2026 09:36:38 +0800 Subject: [PATCH 019/109] selftests/arm64: Implement cmpbr_sigill() to hwcap test The function executes a CBEQ instruction which is valid if the CPU supports the CMPBR extension. The CBEQ branches to skip the following UDF instruction, and no SIGILL is generated. Otherwise, it will generate a SIGILL. Signed-off-by: Yifan Wu Reviewed-by: Mark Brown Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/abi/hwcap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index c2661a312fc9..e22703d6b97c 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -56,7 +56,8 @@ static void atomics_sigill(void) static void cmpbr_sigill(void) { - /* Not implemented, too complicated and unreliable anyway */ + asm volatile(".inst 0x74C00040\n" /* CBEQ w0, w0, +8 */ + "udf #0" : : : "cc"); /* UDF #0 */ } static void crc32_sigill(void) From 2e30447b233a8e7a561bb51995b8d4944282bf62 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 5 Mar 2026 16:28:17 +0000 Subject: [PATCH 020/109] KVM: arm64: Read PMUVer as unsigned ID_AA64DFR0_EL1.PMUVer is an unsigned field, so this skips initialization of host_data_ptr(nr_event_counters) for PMUv3 for Armv8.8 onwards as they appear as negative values. Fix it by reading it as unsigned. Now ID_AA64DFR0_EL1_PMUVer_IMP_DEF needs to be special cased, so use pmuv3_implemented() which already does it. Fixes: 2417218f2f23 ("KVM: arm64: Get rid of __kvm_get_mdcr_el2() and related warts") Signed-off-by: James Clark Reviewed-by: Marc Zyngier Reviewed-by: Colton Lewis Signed-off-by: Will Deacon --- arch/arm64/kvm/debug.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 3ad6b7c6e4ba..f4d7b12045e8 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -75,8 +76,10 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) void kvm_init_host_debug_data(void) { u64 dfr0 = read_sysreg(id_aa64dfr0_el1); + unsigned int pmuver = cpuid_feature_extract_unsigned_field(dfr0, + ID_AA64DFR0_EL1_PMUVer_SHIFT); - if (cpuid_feature_extract_signed_field(dfr0, ID_AA64DFR0_EL1_PMUVer_SHIFT) > 0) + if (pmuv3_implemented(pmuver)) *host_data_ptr(nr_event_counters) = FIELD_GET(ARMV8_PMU_PMCR_N, read_sysreg(pmcr_el0)); From d1dcc20bcc40efe1f1c71639376c91dafa489222 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 5 Mar 2026 16:28:18 +0000 Subject: [PATCH 021/109] arm64: cpufeature: Make PMUVer and PerfMon unsigned On the host, this change doesn't make a difference because the fields are defined as FTR_EXACT. However, KVM allows userspace to set these fields for a guest and overrides the type to be FTR_LOWER_SAFE. And while KVM used to do an unsigned comparison to validate that the new value is lower than what the hardware provides, since the linked commit it uses the generic sanitization framework which does a signed comparison. Fix it by defining these fields as unsigned. In theory, without this fix, userspace could set a higher PMU version than the hardware supports by providing any value with the top bit set. Fixes: c118cead07a7 ("KVM: arm64: Use generic sanitisation for ID_(AA64)DFR0_EL1") Signed-off-by: James Clark Reviewed-by: Marc Zyngier Reviewed-by: Colton Lewis Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index c31f8e17732a..07abdfd40756 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -565,7 +565,7 @@ static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = { * We can instantiate multiple PMU instances with different levels * of support. */ - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_EL1_PMUVer_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_EL1_PMUVer_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_EL1_DebugVer_SHIFT, 4, 0x6), ARM64_FTR_END, }; @@ -709,7 +709,7 @@ static const struct arm64_ftr_bits ftr_id_pfr2[] = { static const struct arm64_ftr_bits ftr_id_dfr0[] = { /* [31:28] TraceFilt */ - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_DFR0_EL1_PerfMon_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_DFR0_EL1_PerfMon_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MProfDbg_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MMapTrc_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_CopTrc_SHIFT, 4, 0), From 15ed3fa23cbcb9b9db842947698c3b4b2aeb706c Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 5 Mar 2026 16:28:19 +0000 Subject: [PATCH 022/109] arm64: cpufeature: Use pmuv3_implemented() function Other places that are doing this version comparison are already using pmuv3_implemented(), so might as well use it here too for consistency. Signed-off-by: James Clark Reviewed-by: Marc Zyngier Reviewed-by: Colton Lewis Signed-off-by: Will Deacon --- arch/arm/include/asm/arm_pmuv3.h | 7 +++++++ arch/arm64/kernel/cpufeature.c | 12 ++---------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h index 2ec0e5e83fc9..ecfede0c0348 100644 --- a/arch/arm/include/asm/arm_pmuv3.h +++ b/arch/arm/include/asm/arm_pmuv3.h @@ -238,6 +238,13 @@ static inline void kvm_vcpu_pmu_resync_el0(void) {} static inline bool pmuv3_implemented(int pmuver) { + /* + * PMUVer follows the standard ID scheme for an unsigned field with the + * exception of 0xF (IMP_DEF) which is treated specially and implies + * FEAT_PMUv3 is not implemented. + * + * See DDI0487L.a D24.1.3.2 for more details. + */ return !(pmuver == ARMV8_PMU_DFR_VER_IMP_DEF || pmuver == ARMV8_PMU_DFR_VER_NI); } diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 07abdfd40756..cc9783b26443 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -77,6 +77,7 @@ #include #include +#include #include #include #include @@ -1927,19 +1928,10 @@ static bool has_pmuv3(const struct arm64_cpu_capabilities *entry, int scope) u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); unsigned int pmuver; - /* - * PMUVer follows the standard ID scheme for an unsigned field with the - * exception of 0xF (IMP_DEF) which is treated specially and implies - * FEAT_PMUv3 is not implemented. - * - * See DDI0487L.a D24.1.3.2 for more details. - */ pmuver = cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMUVer_SHIFT); - if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) - return false; - return pmuver >= ID_AA64DFR0_EL1_PMUVer_IMP; + return pmuv3_implemented(pmuver); } #endif From 5394396ff5488f007248727988b722c5d4f0638b Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 19 Feb 2026 17:27:53 +0000 Subject: [PATCH 023/109] perf/arm-cmn: Stop claiming entire iomem region So far, the PMU has been the only thing of interest in the vast mass of CMN registers, so we've gotten away with simply claiming the entire iomem region. However, now that we can support other features like MPAM controllers for the system caches, the PMU driver needs to stop being selfish and learn to share. Similarly to arm-ni, requesting just the DTC node(s) should suffice for staking our exclusive claim to the PMU features, as requesting hundreds of tiny regions for all the individual pmu_event_sel registers is definitely not worth the considerable bother. As a consequence, we can also streamline the annoying CMN-600 special cases even more. The ACPI binding has in fact always specified a strict order for all resources, so we can reasonably drop the ancient pretence of swapping base and cfg, which IIRC was more just a moment of doubt on my part than anything else. Cc: James Morse Signed-off-by: Robin Murphy Tested-by: Ben Horgan Reviewed-by: Ilkka Koskinen Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 70 +++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 39 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 40c05c519a1d..1ac91cda6780 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -2132,6 +2132,8 @@ static void arm_cmn_init_dtm(struct arm_cmn_dtm *dtm, struct arm_cmn_node *xp, i static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int idx) { struct arm_cmn_dtc *dtc = cmn->dtc + idx; + const struct resource *cfg; + resource_size_t base, size; dtc->pmu_base = dn->pmu_base; dtc->base = dtc->pmu_base - arm_cmn_pmu_offset(cmn, dn); @@ -2139,6 +2141,13 @@ static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int id if (dtc->irq < 0) return dtc->irq; + cfg = platform_get_resource(to_platform_device(cmn->dev), IORESOURCE_MEM, 0); + base = dtc->base - cmn->base + cfg->start; + size = cmn->part == PART_CMN600 ? SZ_16K : SZ_64K; + if (!devm_request_mem_region(cmn->dev, base, size, dev_name(cmn->dev))) + return dev_err_probe(cmn->dev, -EBUSY, + "Failed to request DTC region 0x%llx\n", base); + writel_relaxed(CMN_DT_DTC_CTL_DT_EN, dtc->base + CMN_DT_DTC_CTL); writel_relaxed(CMN_DT_PMCR_PMU_EN | CMN_DT_PMCR_OVFL_INTR_EN, CMN_DT_PMCR(dtc)); writeq_relaxed(0, CMN_DT_PMCCNTR(dtc)); @@ -2525,43 +2534,26 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) return 0; } -static int arm_cmn600_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn) -{ - struct resource *cfg, *root; - - cfg = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!cfg) - return -EINVAL; - - root = platform_get_resource(pdev, IORESOURCE_MEM, 1); - if (!root) - return -EINVAL; - - if (!resource_contains(cfg, root)) - swap(cfg, root); - /* - * Note that devm_ioremap_resource() is dumb and won't let the platform - * device claim cfg when the ACPI companion device has already claimed - * root within it. But since they *are* already both claimed in the - * appropriate name, we don't really need to do it again here anyway. - */ - cmn->base = devm_ioremap(cmn->dev, cfg->start, resource_size(cfg)); - if (!cmn->base) - return -ENOMEM; - - return root->start - cfg->start; -} - -static int arm_cmn600_of_probe(struct device_node *np) +static int arm_cmn_get_root(struct arm_cmn *cmn, const struct resource *cfg) { + const struct device_node *np = cmn->dev->of_node; + const struct resource *root; u32 rootnode; - return of_property_read_u32(np, "arm,root-node", &rootnode) ?: rootnode; + if (cmn->part != PART_CMN600) + return 0; + + if (np) + return of_property_read_u32(np, "arm,root-node", &rootnode) ?: rootnode; + + root = platform_get_resource(to_platform_device(cmn->dev), IORESOURCE_MEM, 1); + return root ? root->start - cfg->start : -EINVAL; } static int arm_cmn_probe(struct platform_device *pdev) { struct arm_cmn *cmn; + const struct resource *cfg; const char *name; static atomic_t id; int err, rootnode, this_id; @@ -2575,16 +2567,16 @@ static int arm_cmn_probe(struct platform_device *pdev) cmn->cpu = cpumask_local_spread(0, dev_to_node(cmn->dev)); platform_set_drvdata(pdev, cmn); - if (cmn->part == PART_CMN600 && has_acpi_companion(cmn->dev)) { - rootnode = arm_cmn600_acpi_probe(pdev, cmn); - } else { - rootnode = 0; - cmn->base = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(cmn->base)) - return PTR_ERR(cmn->base); - if (cmn->part == PART_CMN600) - rootnode = arm_cmn600_of_probe(pdev->dev.of_node); - } + cfg = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!cfg) + return -EINVAL; + + /* Map the whole region now, claim the DTCs once we've found them */ + cmn->base = devm_ioremap(cmn->dev, cfg->start, resource_size(cfg)); + if (IS_ERR(cmn->base)) + return PTR_ERR(cmn->base); + + rootnode = arm_cmn_get_root(cmn, cfg); if (rootnode < 0) return rootnode; From d332424d1d06a9fb03ca04ba3f1092c3990125e8 Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Tue, 24 Mar 2026 01:29:45 +0000 Subject: [PATCH 024/109] perf/arm_cspmu: nvidia: Rename doc to Tegra241 The documentation in nvidia-pmu.rst contains PMUs specific to NVIDIA Tegra241 SoC. Rename the file for this specific SoC to have better distinction with other NVIDIA SoC. Signed-off-by: Besar Wicaksono Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/index.rst | 2 +- .../perf/{nvidia-pmu.rst => nvidia-tegra241-pmu.rst} | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) rename Documentation/admin-guide/perf/{nvidia-pmu.rst => nvidia-tegra241-pmu.rst} (98%) diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst index 47d9a3df6329..c407bb44b08e 100644 --- a/Documentation/admin-guide/perf/index.rst +++ b/Documentation/admin-guide/perf/index.rst @@ -24,7 +24,7 @@ Performance monitor support thunderx2-pmu alibaba_pmu dwc_pcie_pmu - nvidia-pmu + nvidia-tegra241-pmu meson-ddr-pmu cxl ampere_cspmu diff --git a/Documentation/admin-guide/perf/nvidia-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra241-pmu.rst similarity index 98% rename from Documentation/admin-guide/perf/nvidia-pmu.rst rename to Documentation/admin-guide/perf/nvidia-tegra241-pmu.rst index f538ef67e0e8..fad5bc4cee6c 100644 --- a/Documentation/admin-guide/perf/nvidia-pmu.rst +++ b/Documentation/admin-guide/perf/nvidia-tegra241-pmu.rst @@ -1,8 +1,8 @@ -========================================================= -NVIDIA Tegra SoC Uncore Performance Monitoring Unit (PMU) -========================================================= +============================================================ +NVIDIA Tegra241 SoC Uncore Performance Monitoring Unit (PMU) +============================================================ -The NVIDIA Tegra SoC includes various system PMUs to measure key performance +The NVIDIA Tegra241 SoC includes various system PMUs to measure key performance metrics like memory bandwidth, latency, and utilization: * Scalable Coherency Fabric (SCF) From f5caf26fd6c71294d0fb254404ed66f8cff6f7f7 Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Tue, 24 Mar 2026 01:29:46 +0000 Subject: [PATCH 025/109] perf/arm_cspmu: nvidia: Add Tegra410 UCF PMU The Unified Coherence Fabric (UCF) contains last level cache and cache coherent interconnect in Tegra410 SOC. The PMU in this device can be used to capture events related to access to the last level cache and memory from different sources. Reviewed-by: Ilkka Koskinen Signed-off-by: Besar Wicaksono Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/index.rst | 1 + .../admin-guide/perf/nvidia-tegra410-pmu.rst | 106 ++++++++++++++++++ drivers/perf/arm_cspmu/nvidia_cspmu.c | 87 +++++++++++++- 3 files changed, 193 insertions(+), 1 deletion(-) create mode 100644 Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst index c407bb44b08e..aa12708ddb96 100644 --- a/Documentation/admin-guide/perf/index.rst +++ b/Documentation/admin-guide/perf/index.rst @@ -25,6 +25,7 @@ Performance monitor support alibaba_pmu dwc_pcie_pmu nvidia-tegra241-pmu + nvidia-tegra410-pmu meson-ddr-pmu cxl ampere_cspmu diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst new file mode 100644 index 000000000000..7b7ba5700ca1 --- /dev/null +++ b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst @@ -0,0 +1,106 @@ +===================================================================== +NVIDIA Tegra410 SoC Uncore Performance Monitoring Unit (PMU) +===================================================================== + +The NVIDIA Tegra410 SoC includes various system PMUs to measure key performance +metrics like memory bandwidth, latency, and utilization: + +* Unified Coherence Fabric (UCF) + +PMU Driver +---------- + +The PMU driver describes the available events and configuration of each PMU in +sysfs. Please see the sections below to get the sysfs path of each PMU. Like +other uncore PMU drivers, the driver provides "cpumask" sysfs attribute to show +the CPU id used to handle the PMU event. There is also "associated_cpus" +sysfs attribute, which contains a list of CPUs associated with the PMU instance. + +UCF PMU +------- + +The Unified Coherence Fabric (UCF) in the NVIDIA Tegra410 SoC serves as a +distributed cache, last level for CPU Memory and CXL Memory, and cache coherent +interconnect that supports hardware coherence across multiple coherently caching +agents, including: + + * CPU clusters + * GPU + * PCIe Ordering Controller Unit (OCU) + * Other IO-coherent requesters + +The events and configuration options of this PMU device are described in sysfs, +see /sys/bus/event_source/devices/nvidia_ucf_pmu_. + +Some of the events available in this PMU can be used to measure bandwidth and +utilization: + + * slc_access_rd: count the number of read requests to SLC. + * slc_access_wr: count the number of write requests to SLC. + * slc_bytes_rd: count the number of bytes transferred by slc_access_rd. + * slc_bytes_wr: count the number of bytes transferred by slc_access_wr. + * mem_access_rd: count the number of read requests to local or remote memory. + * mem_access_wr: count the number of write requests to local or remote memory. + * mem_bytes_rd: count the number of bytes transferred by mem_access_rd. + * mem_bytes_wr: count the number of bytes transferred by mem_access_wr. + * cycles: counts the UCF cycles. + +The average bandwidth is calculated as:: + + AVG_SLC_READ_BANDWIDTH_IN_GBPS = SLC_BYTES_RD / ELAPSED_TIME_IN_NS + AVG_SLC_WRITE_BANDWIDTH_IN_GBPS = SLC_BYTES_WR / ELAPSED_TIME_IN_NS + AVG_MEM_READ_BANDWIDTH_IN_GBPS = MEM_BYTES_RD / ELAPSED_TIME_IN_NS + AVG_MEM_WRITE_BANDWIDTH_IN_GBPS = MEM_BYTES_WR / ELAPSED_TIME_IN_NS + +The average request rate is calculated as:: + + AVG_SLC_READ_REQUEST_RATE = SLC_ACCESS_RD / CYCLES + AVG_SLC_WRITE_REQUEST_RATE = SLC_ACCESS_WR / CYCLES + AVG_MEM_READ_REQUEST_RATE = MEM_ACCESS_RD / CYCLES + AVG_MEM_WRITE_REQUEST_RATE = MEM_ACCESS_WR / CYCLES + +More details about what other events are available can be found in Tegra410 SoC +technical reference manual. + +The events can be filtered based on source or destination. The source filter +indicates the traffic initiator to the SLC, e.g local CPU, non-CPU device, or +remote socket. The destination filter specifies the destination memory type, +e.g. local system memory (CMEM), local GPU memory (GMEM), or remote memory. The +local/remote classification of the destination filter is based on the home +socket of the address, not where the data actually resides. The available +filters are described in +/sys/bus/event_source/devices/nvidia_ucf_pmu_/format/. + +The list of UCF PMU event filters: + +* Source filter: + + * src_loc_cpu: if set, count events from local CPU + * src_loc_noncpu: if set, count events from local non-CPU device + * src_rem: if set, count events from CPU, GPU, PCIE devices of remote socket + +* Destination filter: + + * dst_loc_cmem: if set, count events to local system memory (CMEM) address + * dst_loc_gmem: if set, count events to local GPU memory (GMEM) address + * dst_loc_other: if set, count events to local CXL memory address + * dst_rem: if set, count events to CPU, GPU, and CXL memory address of remote socket + +If the source is not specified, the PMU will count events from all sources. If +the destination is not specified, the PMU will count events to all destinations. + +Example usage: + +* Count event id 0x0 in socket 0 from all sources and to all destinations:: + + perf stat -a -e nvidia_ucf_pmu_0/event=0x0/ + +* Count event id 0x0 in socket 0 with source filter = local CPU and destination + filter = local system memory (CMEM):: + + perf stat -a -e nvidia_ucf_pmu_0/event=0x0,src_loc_cpu=0x1,dst_loc_cmem=0x1/ + +* Count event id 0x0 in socket 1 with source filter = local non-CPU device and + destination filter = remote memory:: + + perf stat -a -e nvidia_ucf_pmu_1/event=0x0,src_loc_noncpu=0x1,dst_rem=0x1/ diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c index e06a06d3407b..8e37cbe3bae9 100644 --- a/drivers/perf/arm_cspmu/nvidia_cspmu.c +++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * */ @@ -21,6 +21,13 @@ #define NV_CNVL_PORT_COUNT 4ULL #define NV_CNVL_FILTER_ID_MASK GENMASK_ULL(NV_CNVL_PORT_COUNT - 1, 0) +#define NV_UCF_SRC_COUNT 3ULL +#define NV_UCF_DST_COUNT 4ULL +#define NV_UCF_FILTER_ID_MASK GENMASK_ULL(11, 0) +#define NV_UCF_FILTER_SRC GENMASK_ULL(2, 0) +#define NV_UCF_FILTER_DST GENMASK_ULL(11, 8) +#define NV_UCF_FILTER_DEFAULT (NV_UCF_FILTER_SRC | NV_UCF_FILTER_DST) + #define NV_GENERIC_FILTER_ID_MASK GENMASK_ULL(31, 0) #define NV_PRODID_MASK (PMIIDR_PRODUCTID | PMIIDR_VARIANT | PMIIDR_REVISION) @@ -124,6 +131,36 @@ static struct attribute *mcf_pmu_event_attrs[] = { NULL, }; +static struct attribute *ucf_pmu_event_attrs[] = { + ARM_CSPMU_EVENT_ATTR(bus_cycles, 0x1D), + + ARM_CSPMU_EVENT_ATTR(slc_allocate, 0xF0), + ARM_CSPMU_EVENT_ATTR(slc_wb, 0xF3), + ARM_CSPMU_EVENT_ATTR(slc_refill_rd, 0x109), + ARM_CSPMU_EVENT_ATTR(slc_refill_wr, 0x10A), + ARM_CSPMU_EVENT_ATTR(slc_hit_rd, 0x119), + + ARM_CSPMU_EVENT_ATTR(slc_access_dataless, 0x183), + ARM_CSPMU_EVENT_ATTR(slc_access_atomic, 0x184), + + ARM_CSPMU_EVENT_ATTR(slc_access_rd, 0x111), + ARM_CSPMU_EVENT_ATTR(slc_access_wr, 0x112), + ARM_CSPMU_EVENT_ATTR(slc_bytes_rd, 0x113), + ARM_CSPMU_EVENT_ATTR(slc_bytes_wr, 0x114), + + ARM_CSPMU_EVENT_ATTR(mem_access_rd, 0x121), + ARM_CSPMU_EVENT_ATTR(mem_access_wr, 0x122), + ARM_CSPMU_EVENT_ATTR(mem_bytes_rd, 0x123), + ARM_CSPMU_EVENT_ATTR(mem_bytes_wr, 0x124), + + ARM_CSPMU_EVENT_ATTR(local_snoop, 0x180), + ARM_CSPMU_EVENT_ATTR(ext_snp_access, 0x181), + ARM_CSPMU_EVENT_ATTR(ext_snp_evict, 0x182), + + ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), + NULL +}; + static struct attribute *generic_pmu_event_attrs[] = { ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), NULL, @@ -152,6 +189,18 @@ static struct attribute *cnvlink_pmu_format_attrs[] = { NULL, }; +static struct attribute *ucf_pmu_format_attrs[] = { + ARM_CSPMU_FORMAT_EVENT_ATTR, + ARM_CSPMU_FORMAT_ATTR(src_loc_noncpu, "config1:0"), + ARM_CSPMU_FORMAT_ATTR(src_loc_cpu, "config1:1"), + ARM_CSPMU_FORMAT_ATTR(src_rem, "config1:2"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_cmem, "config1:8"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_gmem, "config1:9"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_other, "config1:10"), + ARM_CSPMU_FORMAT_ATTR(dst_rem, "config1:11"), + NULL +}; + static struct attribute *generic_pmu_format_attrs[] = { ARM_CSPMU_FORMAT_EVENT_ATTR, ARM_CSPMU_FORMAT_FILTER_ATTR, @@ -236,6 +285,27 @@ static void nv_cspmu_set_cc_filter(struct arm_cspmu *cspmu, writel(filter, cspmu->base0 + PMCCFILTR); } +static u32 ucf_pmu_event_filter(const struct perf_event *event) +{ + u32 ret, filter, src, dst; + + filter = nv_cspmu_event_filter(event); + + /* Monitor all sources if none is selected. */ + src = FIELD_GET(NV_UCF_FILTER_SRC, filter); + if (src == 0) + src = GENMASK_ULL(NV_UCF_SRC_COUNT - 1, 0); + + /* Monitor all destinations if none is selected. */ + dst = FIELD_GET(NV_UCF_FILTER_DST, filter); + if (dst == 0) + dst = GENMASK_ULL(NV_UCF_DST_COUNT - 1, 0); + + ret = FIELD_PREP(NV_UCF_FILTER_SRC, src); + ret |= FIELD_PREP(NV_UCF_FILTER_DST, dst); + + return ret; +} enum nv_cspmu_name_fmt { NAME_FMT_GENERIC, @@ -342,6 +412,21 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { .init_data = NULL }, }, + { + .prodid = 0x2CF20000, + .prodid_mask = NV_PRODID_MASK, + .name_pattern = "nvidia_ucf_pmu_%u", + .name_fmt = NAME_FMT_SOCKET, + .template_ctx = { + .event_attr = ucf_pmu_event_attrs, + .format_attr = ucf_pmu_format_attrs, + .filter_mask = NV_UCF_FILTER_ID_MASK, + .filter_default_val = NV_UCF_FILTER_DEFAULT, + .filter2_mask = 0x0, + .filter2_default_val = 0x0, + .get_filter = ucf_pmu_event_filter, + }, + }, { .prodid = 0, .prodid_mask = 0, From bc86281fe4bd5d4a78be2f370e8319c9517e40ff Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Tue, 24 Mar 2026 01:29:47 +0000 Subject: [PATCH 026/109] perf/arm_cspmu: Add arm_cspmu_acpi_dev_get Add interface to get ACPI device associated with the PMU. This ACPI device may contain additional properties not covered by the standard properties. Reviewed-by: Ilkka Koskinen Signed-off-by: Besar Wicaksono Signed-off-by: Will Deacon --- drivers/perf/arm_cspmu/arm_cspmu.c | 19 ++++++++++++++++++- drivers/perf/arm_cspmu/arm_cspmu.h | 17 ++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index 34430b68f602..49e8a1f38131 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -16,7 +16,7 @@ * The user should refer to the vendor technical documentation to get details * about the supported events. * - * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * */ @@ -1132,6 +1132,23 @@ static int arm_cspmu_acpi_get_cpus(struct arm_cspmu *cspmu) return 0; } + +struct acpi_device *arm_cspmu_acpi_dev_get(const struct arm_cspmu *cspmu) +{ + char hid[16] = {}; + char uid[16] = {}; + const struct acpi_apmt_node *apmt_node; + + apmt_node = arm_cspmu_apmt_node(cspmu->dev); + if (!apmt_node || apmt_node->type != ACPI_APMT_NODE_TYPE_ACPI) + return NULL; + + memcpy(hid, &apmt_node->inst_primary, sizeof(apmt_node->inst_primary)); + snprintf(uid, sizeof(uid), "%u", apmt_node->inst_secondary); + + return acpi_dev_get_first_match_dev(hid, uid, -1); +} +EXPORT_SYMBOL_GPL(arm_cspmu_acpi_dev_get); #else static int arm_cspmu_acpi_get_cpus(struct arm_cspmu *cspmu) { diff --git a/drivers/perf/arm_cspmu/arm_cspmu.h b/drivers/perf/arm_cspmu/arm_cspmu.h index cd65a58dbd88..3fc5c8d77266 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.h +++ b/drivers/perf/arm_cspmu/arm_cspmu.h @@ -1,13 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 * * ARM CoreSight Architecture PMU driver. - * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * */ #ifndef __ARM_CSPMU_H__ #define __ARM_CSPMU_H__ +#include #include #include #include @@ -255,4 +256,18 @@ int arm_cspmu_impl_register(const struct arm_cspmu_impl_match *impl_match); /* Unregister vendor backend. */ void arm_cspmu_impl_unregister(const struct arm_cspmu_impl_match *impl_match); +#if defined(CONFIG_ACPI) && defined(CONFIG_ARM64) +/** + * Get ACPI device associated with the PMU. + * The caller is responsible for calling acpi_dev_put() on the returned device. + */ +struct acpi_device *arm_cspmu_acpi_dev_get(const struct arm_cspmu *cspmu); +#else +static inline struct acpi_device * +arm_cspmu_acpi_dev_get(const struct arm_cspmu *cspmu) +{ + return NULL; +} +#endif + #endif /* __ARM_CSPMU_H__ */ From bf585ba14726788335c640512d11186dab573612 Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Tue, 24 Mar 2026 01:29:48 +0000 Subject: [PATCH 027/109] perf/arm_cspmu: nvidia: Add Tegra410 PCIE PMU Adds PCIE PMU support in Tegra410 SOC. This PMU is instanced in each root complex in the SOC and can capture traffic from PCIE device to various memory types. This PMU can filter traffic based on the originating root port or BDF and the target memory types (CPU DRAM, GPU Memory, CXL Memory, or remote Memory). Reviewed-by: Ilkka Koskinen Signed-off-by: Besar Wicaksono Signed-off-by: Will Deacon --- .../admin-guide/perf/nvidia-tegra410-pmu.rst | 163 ++++++++++++++ drivers/perf/arm_cspmu/nvidia_cspmu.c | 210 +++++++++++++++++- 2 files changed, 368 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst index 7b7ba5700ca1..b8cfbb80be1c 100644 --- a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst +++ b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst @@ -6,6 +6,7 @@ The NVIDIA Tegra410 SoC includes various system PMUs to measure key performance metrics like memory bandwidth, latency, and utilization: * Unified Coherence Fabric (UCF) +* PCIE PMU Driver ---------- @@ -104,3 +105,165 @@ Example usage: destination filter = remote memory:: perf stat -a -e nvidia_ucf_pmu_1/event=0x0,src_loc_noncpu=0x1,dst_rem=0x1/ + +PCIE PMU +-------- + +This PMU is located in the SOC fabric connecting the PCIE root complex (RC) and +the memory subsystem. It monitors all read/write traffic from the root port(s) +or a particular BDF in a PCIE RC to local or remote memory. There is one PMU per +PCIE RC in the SoC. Each RC can have up to 16 lanes that can be bifurcated into +up to 8 root ports. The traffic from each root port can be filtered using RP or +BDF filter. For example, specifying "src_rp_mask=0xFF" means the PMU counter will +capture traffic from all RPs. Please see below for more details. + +The events and configuration options of this PMU device are described in sysfs, +see /sys/bus/event_source/devices/nvidia_pcie_pmu__rc_. + +The events in this PMU can be used to measure bandwidth, utilization, and +latency: + + * rd_req: count the number of read requests by PCIE device. + * wr_req: count the number of write requests by PCIE device. + * rd_bytes: count the number of bytes transferred by rd_req. + * wr_bytes: count the number of bytes transferred by wr_req. + * rd_cum_outs: count outstanding rd_req each cycle. + * cycles: count the clock cycles of SOC fabric connected to the PCIE interface. + +The average bandwidth is calculated as:: + + AVG_RD_BANDWIDTH_IN_GBPS = RD_BYTES / ELAPSED_TIME_IN_NS + AVG_WR_BANDWIDTH_IN_GBPS = WR_BYTES / ELAPSED_TIME_IN_NS + +The average request rate is calculated as:: + + AVG_RD_REQUEST_RATE = RD_REQ / CYCLES + AVG_WR_REQUEST_RATE = WR_REQ / CYCLES + + +The average latency is calculated as:: + + FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS + AVG_LATENCY_IN_CYCLES = RD_CUM_OUTS / RD_REQ + AVERAGE_LATENCY_IN_NS = AVG_LATENCY_IN_CYCLES / FREQ_IN_GHZ + +The PMU events can be filtered based on the traffic source and destination. +The source filter indicates the PCIE devices that will be monitored. The +destination filter specifies the destination memory type, e.g. local system +memory (CMEM), local GPU memory (GMEM), or remote memory. The local/remote +classification of the destination filter is based on the home socket of the +address, not where the data actually resides. These filters can be found in +/sys/bus/event_source/devices/nvidia_pcie_pmu__rc_/format/. + +The list of event filters: + +* Source filter: + + * src_rp_mask: bitmask of root ports that will be monitored. Each bit in this + bitmask represents the RP index in the RC. If the bit is set, all devices under + the associated RP will be monitored. E.g "src_rp_mask=0xF" will monitor + devices in root port 0 to 3. + * src_bdf: the BDF that will be monitored. This is a 16-bit value that + follows formula: (bus << 8) + (device << 3) + (function). For example, the + value of BDF 27:01.1 is 0x2781. + * src_bdf_en: enable the BDF filter. If this is set, the BDF filter value in + "src_bdf" is used to filter the traffic. + + Note that Root-Port and BDF filters are mutually exclusive and the PMU in + each RC can only have one BDF filter for the whole counters. If BDF filter + is enabled, the BDF filter value will be applied to all events. + +* Destination filter: + + * dst_loc_cmem: if set, count events to local system memory (CMEM) address + * dst_loc_gmem: if set, count events to local GPU memory (GMEM) address + * dst_loc_pcie_p2p: if set, count events to local PCIE peer address + * dst_loc_pcie_cxl: if set, count events to local CXL memory address + * dst_rem: if set, count events to remote memory address + +If the source filter is not specified, the PMU will count events from all root +ports. If the destination filter is not specified, the PMU will count events +to all destinations. + +Example usage: + +* Count event id 0x0 from root port 0 of PCIE RC-0 on socket 0 targeting all + destinations:: + + perf stat -a -e nvidia_pcie_pmu_0_rc_0/event=0x0,src_rp_mask=0x1/ + +* Count event id 0x1 from root port 0 and 1 of PCIE RC-1 on socket 0 and + targeting just local CMEM of socket 0:: + + perf stat -a -e nvidia_pcie_pmu_0_rc_1/event=0x1,src_rp_mask=0x3,dst_loc_cmem=0x1/ + +* Count event id 0x2 from root port 0 of PCIE RC-2 on socket 1 targeting all + destinations:: + + perf stat -a -e nvidia_pcie_pmu_1_rc_2/event=0x2,src_rp_mask=0x1/ + +* Count event id 0x3 from root port 0 and 1 of PCIE RC-3 on socket 1 and + targeting just local CMEM of socket 1:: + + perf stat -a -e nvidia_pcie_pmu_1_rc_3/event=0x3,src_rp_mask=0x3,dst_loc_cmem=0x1/ + +* Count event id 0x4 from BDF 01:01.0 of PCIE RC-4 on socket 0 targeting all + destinations:: + + perf stat -a -e nvidia_pcie_pmu_0_rc_4/event=0x4,src_bdf=0x0180,src_bdf_en=0x1/ + +Mapping the RC# to lspci segment number can be non-trivial; hence a new NVIDIA +Designated Vendor Specific Capability (DVSEC) register is added into the PCIE config space +for each RP. This DVSEC has vendor id "10de" and DVSEC id of "0x4". The DVSEC register +contains the following information to map PCIE devices under the RP back to its RC# : + + - Bus# (byte 0xc) : bus number as reported by the lspci output + - Segment# (byte 0xd) : segment number as reported by the lspci output + - RP# (byte 0xe) : port number as reported by LnkCap attribute from lspci for a device with Root Port capability + - RC# (byte 0xf): root complex number associated with the RP + - Socket# (byte 0x10): socket number associated with the RP + +Example script for mapping lspci BDF to RC# and socket#:: + + #!/bin/bash + while read bdf rest; do + dvsec4_reg=$(lspci -vv -s $bdf | awk ' + /Designated Vendor-Specific: Vendor=10de ID=0004/ { + match($0, /\[([0-9a-fA-F]+)/, arr); + print "0x" arr[1]; + exit + } + ') + if [ -n "$dvsec4_reg" ]; then + bus=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xc))).b) + segment=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xd))).b) + rp=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xe))).b) + rc=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xf))).b) + socket=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0x10))).b) + echo "$bdf: Bus=$bus, Segment=$segment, RP=$rp, RC=$rc, Socket=$socket" + fi + done < <(lspci -d 10de:) + +Example output:: + + 0001:00:00.0: Bus=00, Segment=01, RP=00, RC=00, Socket=00 + 0002:80:00.0: Bus=80, Segment=02, RP=01, RC=01, Socket=00 + 0002:a0:00.0: Bus=a0, Segment=02, RP=02, RC=01, Socket=00 + 0002:c0:00.0: Bus=c0, Segment=02, RP=03, RC=01, Socket=00 + 0002:e0:00.0: Bus=e0, Segment=02, RP=04, RC=01, Socket=00 + 0003:00:00.0: Bus=00, Segment=03, RP=00, RC=02, Socket=00 + 0004:00:00.0: Bus=00, Segment=04, RP=00, RC=03, Socket=00 + 0005:00:00.0: Bus=00, Segment=05, RP=00, RC=04, Socket=00 + 0005:40:00.0: Bus=40, Segment=05, RP=01, RC=04, Socket=00 + 0005:c0:00.0: Bus=c0, Segment=05, RP=02, RC=04, Socket=00 + 0006:00:00.0: Bus=00, Segment=06, RP=00, RC=05, Socket=00 + 0009:00:00.0: Bus=00, Segment=09, RP=00, RC=00, Socket=01 + 000a:80:00.0: Bus=80, Segment=0a, RP=01, RC=01, Socket=01 + 000a:a0:00.0: Bus=a0, Segment=0a, RP=02, RC=01, Socket=01 + 000a:e0:00.0: Bus=e0, Segment=0a, RP=03, RC=01, Socket=01 + 000b:00:00.0: Bus=00, Segment=0b, RP=00, RC=02, Socket=01 + 000c:00:00.0: Bus=00, Segment=0c, RP=00, RC=03, Socket=01 + 000d:00:00.0: Bus=00, Segment=0d, RP=00, RC=04, Socket=01 + 000d:40:00.0: Bus=40, Segment=0d, RP=01, RC=04, Socket=01 + 000d:c0:00.0: Bus=c0, Segment=0d, RP=02, RC=04, Socket=01 + 000e:00:00.0: Bus=00, Segment=0e, RP=00, RC=05, Socket=01 diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c index 8e37cbe3bae9..61fde84ea343 100644 --- a/drivers/perf/arm_cspmu/nvidia_cspmu.c +++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c @@ -8,6 +8,7 @@ #include #include +#include #include #include "arm_cspmu.h" @@ -28,6 +29,19 @@ #define NV_UCF_FILTER_DST GENMASK_ULL(11, 8) #define NV_UCF_FILTER_DEFAULT (NV_UCF_FILTER_SRC | NV_UCF_FILTER_DST) +#define NV_PCIE_V2_PORT_COUNT 8ULL +#define NV_PCIE_V2_FILTER_ID_MASK GENMASK_ULL(24, 0) +#define NV_PCIE_V2_FILTER_PORT GENMASK_ULL(NV_PCIE_V2_PORT_COUNT - 1, 0) +#define NV_PCIE_V2_FILTER_BDF_VAL GENMASK_ULL(23, NV_PCIE_V2_PORT_COUNT) +#define NV_PCIE_V2_FILTER_BDF_EN BIT(24) +#define NV_PCIE_V2_FILTER_BDF_VAL_EN GENMASK_ULL(24, NV_PCIE_V2_PORT_COUNT) +#define NV_PCIE_V2_FILTER_DEFAULT NV_PCIE_V2_FILTER_PORT + +#define NV_PCIE_V2_DST_COUNT 5ULL +#define NV_PCIE_V2_FILTER2_ID_MASK GENMASK_ULL(4, 0) +#define NV_PCIE_V2_FILTER2_DST GENMASK_ULL(NV_PCIE_V2_DST_COUNT - 1, 0) +#define NV_PCIE_V2_FILTER2_DEFAULT NV_PCIE_V2_FILTER2_DST + #define NV_GENERIC_FILTER_ID_MASK GENMASK_ULL(31, 0) #define NV_PRODID_MASK (PMIIDR_PRODUCTID | PMIIDR_VARIANT | PMIIDR_REVISION) @@ -161,6 +175,16 @@ static struct attribute *ucf_pmu_event_attrs[] = { NULL }; +static struct attribute *pcie_v2_pmu_event_attrs[] = { + ARM_CSPMU_EVENT_ATTR(rd_bytes, 0x0), + ARM_CSPMU_EVENT_ATTR(wr_bytes, 0x1), + ARM_CSPMU_EVENT_ATTR(rd_req, 0x2), + ARM_CSPMU_EVENT_ATTR(wr_req, 0x3), + ARM_CSPMU_EVENT_ATTR(rd_cum_outs, 0x4), + ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), + NULL +}; + static struct attribute *generic_pmu_event_attrs[] = { ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), NULL, @@ -201,6 +225,19 @@ static struct attribute *ucf_pmu_format_attrs[] = { NULL }; +static struct attribute *pcie_v2_pmu_format_attrs[] = { + ARM_CSPMU_FORMAT_EVENT_ATTR, + ARM_CSPMU_FORMAT_ATTR(src_rp_mask, "config1:0-7"), + ARM_CSPMU_FORMAT_ATTR(src_bdf, "config1:8-23"), + ARM_CSPMU_FORMAT_ATTR(src_bdf_en, "config1:24"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_cmem, "config2:0"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_gmem, "config2:1"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_pcie_p2p, "config2:2"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_pcie_cxl, "config2:3"), + ARM_CSPMU_FORMAT_ATTR(dst_rem, "config2:4"), + NULL +}; + static struct attribute *generic_pmu_format_attrs[] = { ARM_CSPMU_FORMAT_EVENT_ATTR, ARM_CSPMU_FORMAT_FILTER_ATTR, @@ -232,6 +269,32 @@ nv_cspmu_get_name(const struct arm_cspmu *cspmu) return ctx->name; } +#if defined(CONFIG_ACPI) && defined(CONFIG_ARM64) +static int nv_cspmu_get_inst_id(const struct arm_cspmu *cspmu, u32 *id) +{ + struct fwnode_handle *fwnode; + struct acpi_device *adev; + int ret; + + adev = arm_cspmu_acpi_dev_get(cspmu); + if (!adev) + return -ENODEV; + + fwnode = acpi_fwnode_handle(adev); + ret = fwnode_property_read_u32(fwnode, "instance_id", id); + if (ret) + dev_err(cspmu->dev, "Failed to get instance ID\n"); + + acpi_dev_put(adev); + return ret; +} +#else +static int nv_cspmu_get_inst_id(const struct arm_cspmu *cspmu, u32 *id) +{ + return -EINVAL; +} +#endif + static u32 nv_cspmu_event_filter(const struct perf_event *event) { const struct nv_cspmu_ctx *ctx = @@ -277,6 +340,20 @@ static void nv_cspmu_set_ev_filter(struct arm_cspmu *cspmu, } } +static void nv_cspmu_reset_ev_filter(struct arm_cspmu *cspmu, + const struct perf_event *event) +{ + const struct nv_cspmu_ctx *ctx = + to_nv_cspmu_ctx(to_arm_cspmu(event->pmu)); + const u32 offset = 4 * event->hw.idx; + + if (ctx->get_filter) + writel(0, cspmu->base0 + PMEVFILTR + offset); + + if (ctx->get_filter2) + writel(0, cspmu->base0 + PMEVFILT2R + offset); +} + static void nv_cspmu_set_cc_filter(struct arm_cspmu *cspmu, const struct perf_event *event) { @@ -307,9 +384,103 @@ static u32 ucf_pmu_event_filter(const struct perf_event *event) return ret; } +static u32 pcie_v2_pmu_bdf_val_en(u32 filter) +{ + const u32 bdf_en = FIELD_GET(NV_PCIE_V2_FILTER_BDF_EN, filter); + + /* Returns both BDF value and enable bit if BDF filtering is enabled. */ + if (bdf_en) + return FIELD_GET(NV_PCIE_V2_FILTER_BDF_VAL_EN, filter); + + /* Ignore the BDF value if BDF filter is not enabled. */ + return 0; +} + +static u32 pcie_v2_pmu_event_filter(const struct perf_event *event) +{ + u32 filter, lead_filter, lead_bdf; + struct perf_event *leader; + const struct nv_cspmu_ctx *ctx = + to_nv_cspmu_ctx(to_arm_cspmu(event->pmu)); + + filter = event->attr.config1 & ctx->filter_mask; + if (filter != 0) + return filter; + + leader = event->group_leader; + + /* Use leader's filter value if its BDF filtering is enabled. */ + if (event != leader) { + lead_filter = pcie_v2_pmu_event_filter(leader); + lead_bdf = pcie_v2_pmu_bdf_val_en(lead_filter); + if (lead_bdf != 0) + return lead_filter; + } + + /* Otherwise, return default filter value. */ + return ctx->filter_default_val; +} + +static int pcie_v2_pmu_validate_event(struct arm_cspmu *cspmu, + struct perf_event *new_ev) +{ + /* + * Make sure the events are using same BDF filter since the PCIE-SRC PMU + * only supports one common BDF filter setting for all of the counters. + */ + + int idx; + u32 new_filter, new_rp, new_bdf, new_lead_filter, new_lead_bdf; + struct perf_event *new_leader; + + if (cspmu->impl.ops.is_cycle_counter_event(new_ev)) + return 0; + + new_leader = new_ev->group_leader; + + new_filter = pcie_v2_pmu_event_filter(new_ev); + new_lead_filter = pcie_v2_pmu_event_filter(new_leader); + + new_bdf = pcie_v2_pmu_bdf_val_en(new_filter); + new_lead_bdf = pcie_v2_pmu_bdf_val_en(new_lead_filter); + + new_rp = FIELD_GET(NV_PCIE_V2_FILTER_PORT, new_filter); + + if (new_rp != 0 && new_bdf != 0) { + dev_err(cspmu->dev, + "RP and BDF filtering are mutually exclusive\n"); + return -EINVAL; + } + + if (new_bdf != new_lead_bdf) { + dev_err(cspmu->dev, + "sibling and leader BDF value should be equal\n"); + return -EINVAL; + } + + /* Compare BDF filter on existing events. */ + idx = find_first_bit(cspmu->hw_events.used_ctrs, + cspmu->cycle_counter_logical_idx); + + if (idx != cspmu->cycle_counter_logical_idx) { + struct perf_event *leader = cspmu->hw_events.events[idx]->group_leader; + + const u32 lead_filter = pcie_v2_pmu_event_filter(leader); + const u32 lead_bdf = pcie_v2_pmu_bdf_val_en(lead_filter); + + if (new_lead_bdf != lead_bdf) { + dev_err(cspmu->dev, "only one BDF value is supported\n"); + return -EINVAL; + } + } + + return 0; +} + enum nv_cspmu_name_fmt { NAME_FMT_GENERIC, - NAME_FMT_SOCKET + NAME_FMT_SOCKET, + NAME_FMT_SOCKET_INST, }; struct nv_cspmu_match { @@ -427,6 +598,26 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { .get_filter = ucf_pmu_event_filter, }, }, + { + .prodid = 0x10301000, + .prodid_mask = NV_PRODID_MASK, + .name_pattern = "nvidia_pcie_pmu_%u_rc_%u", + .name_fmt = NAME_FMT_SOCKET_INST, + .template_ctx = { + .event_attr = pcie_v2_pmu_event_attrs, + .format_attr = pcie_v2_pmu_format_attrs, + .filter_mask = NV_PCIE_V2_FILTER_ID_MASK, + .filter_default_val = NV_PCIE_V2_FILTER_DEFAULT, + .filter2_mask = NV_PCIE_V2_FILTER2_ID_MASK, + .filter2_default_val = NV_PCIE_V2_FILTER2_DEFAULT, + .get_filter = pcie_v2_pmu_event_filter, + .get_filter2 = nv_cspmu_event_filter2, + }, + .ops = { + .validate_event = pcie_v2_pmu_validate_event, + .reset_ev_filter = nv_cspmu_reset_ev_filter, + } + }, { .prodid = 0, .prodid_mask = 0, @@ -450,7 +641,7 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { static char *nv_cspmu_format_name(const struct arm_cspmu *cspmu, const struct nv_cspmu_match *match) { - char *name; + char *name = NULL; struct device *dev = cspmu->dev; static atomic_t pmu_generic_idx = {0}; @@ -464,13 +655,20 @@ static char *nv_cspmu_format_name(const struct arm_cspmu *cspmu, socket); break; } + case NAME_FMT_SOCKET_INST: { + const int cpu = cpumask_first(&cspmu->associated_cpus); + const int socket = cpu_to_node(cpu); + u32 inst_id; + + if (!nv_cspmu_get_inst_id(cspmu, &inst_id)) + name = devm_kasprintf(dev, GFP_KERNEL, + match->name_pattern, socket, inst_id); + break; + } case NAME_FMT_GENERIC: name = devm_kasprintf(dev, GFP_KERNEL, match->name_pattern, atomic_fetch_inc(&pmu_generic_idx)); break; - default: - name = NULL; - break; } return name; @@ -511,8 +709,10 @@ static int nv_cspmu_init_ops(struct arm_cspmu *cspmu) cspmu->impl.ctx = ctx; /* NVIDIA specific callbacks. */ + SET_OP(validate_event, impl_ops, match, NULL); SET_OP(set_cc_filter, impl_ops, match, nv_cspmu_set_cc_filter); SET_OP(set_ev_filter, impl_ops, match, nv_cspmu_set_ev_filter); + SET_OP(reset_ev_filter, impl_ops, match, NULL); SET_OP(get_event_attrs, impl_ops, match, nv_cspmu_get_event_attrs); SET_OP(get_format_attrs, impl_ops, match, nv_cspmu_get_format_attrs); SET_OP(get_name, impl_ops, match, nv_cspmu_get_name); From 3dd73022306bfdb29b1c33cb106fe337f46a6105 Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Tue, 24 Mar 2026 01:29:49 +0000 Subject: [PATCH 028/109] perf/arm_cspmu: nvidia: Add Tegra410 PCIE-TGT PMU Adds PCIE-TGT PMU support in Tegra410 SOC. This PMU is instanced in each root complex in the SOC and it captures traffic originating from any source towards PCIE BAR and CXL HDM range. The traffic can be filtered based on the destination root port or target address range. Reviewed-by: Ilkka Koskinen Signed-off-by: Besar Wicaksono Signed-off-by: Will Deacon --- .../admin-guide/perf/nvidia-tegra410-pmu.rst | 77 +++++ drivers/perf/arm_cspmu/nvidia_cspmu.c | 321 ++++++++++++++++++ 2 files changed, 398 insertions(+) diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst index b8cfbb80be1c..c065764d41fe 100644 --- a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst +++ b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst @@ -7,6 +7,7 @@ metrics like memory bandwidth, latency, and utilization: * Unified Coherence Fabric (UCF) * PCIE +* PCIE-TGT PMU Driver ---------- @@ -212,6 +213,11 @@ Example usage: perf stat -a -e nvidia_pcie_pmu_0_rc_4/event=0x4,src_bdf=0x0180,src_bdf_en=0x1/ +.. _NVIDIA_T410_PCIE_PMU_RC_Mapping_Section: + +Mapping the RC# to lspci segment number +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Mapping the RC# to lspci segment number can be non-trivial; hence a new NVIDIA Designated Vendor Specific Capability (DVSEC) register is added into the PCIE config space for each RP. This DVSEC has vendor id "10de" and DVSEC id of "0x4". The DVSEC register @@ -267,3 +273,74 @@ Example output:: 000d:40:00.0: Bus=40, Segment=0d, RP=01, RC=04, Socket=01 000d:c0:00.0: Bus=c0, Segment=0d, RP=02, RC=04, Socket=01 000e:00:00.0: Bus=00, Segment=0e, RP=00, RC=05, Socket=01 + +PCIE-TGT PMU +------------ + +This PMU is located in the SOC fabric connecting the PCIE root complex (RC) and +the memory subsystem. It monitors traffic targeting PCIE BAR and CXL HDM ranges. +There is one PCIE-TGT PMU per PCIE RC in the SoC. Each RC in Tegra410 SoC can +have up to 16 lanes that can be bifurcated into up to 8 root ports (RP). The PMU +provides RP filter to count PCIE BAR traffic to each RP and address filter to +count access to PCIE BAR or CXL HDM ranges. The details of the filters are +described in the following sections. + +Mapping the RC# to lspci segment number is similar to the PCIE PMU. Please see +:ref:`NVIDIA_T410_PCIE_PMU_RC_Mapping_Section` for more info. + +The events and configuration options of this PMU device are available in sysfs, +see /sys/bus/event_source/devices/nvidia_pcie_tgt_pmu__rc_. + +The events in this PMU can be used to measure bandwidth and utilization: + + * rd_req: count the number of read requests to PCIE. + * wr_req: count the number of write requests to PCIE. + * rd_bytes: count the number of bytes transferred by rd_req. + * wr_bytes: count the number of bytes transferred by wr_req. + * cycles: count the clock cycles of SOC fabric connected to the PCIE interface. + +The average bandwidth is calculated as:: + + AVG_RD_BANDWIDTH_IN_GBPS = RD_BYTES / ELAPSED_TIME_IN_NS + AVG_WR_BANDWIDTH_IN_GBPS = WR_BYTES / ELAPSED_TIME_IN_NS + +The average request rate is calculated as:: + + AVG_RD_REQUEST_RATE = RD_REQ / CYCLES + AVG_WR_REQUEST_RATE = WR_REQ / CYCLES + +The PMU events can be filtered based on the destination root port or target +address range. Filtering based on RP is only available for PCIE BAR traffic. +Address filter works for both PCIE BAR and CXL HDM ranges. These filters can be +found in sysfs, see +/sys/bus/event_source/devices/nvidia_pcie_tgt_pmu__rc_/format/. + +Destination filter settings: + +* dst_rp_mask: bitmask to select the root port(s) to monitor. E.g. "dst_rp_mask=0xFF" + corresponds to all root ports (from 0 to 7) in the PCIE RC. Note that this filter is + only available for PCIE BAR traffic. +* dst_addr_base: BAR or CXL HDM filter base address. +* dst_addr_mask: BAR or CXL HDM filter address mask. +* dst_addr_en: enable BAR or CXL HDM address range filter. If this is set, the + address range specified by "dst_addr_base" and "dst_addr_mask" will be used to filter + the PCIE BAR and CXL HDM traffic address. The PMU uses the following comparison + to determine if the traffic destination address falls within the filter range:: + + (txn's addr & dst_addr_mask) == (dst_addr_base & dst_addr_mask) + + If the comparison succeeds, then the event will be counted. + +If the destination filter is not specified, the RP filter will be configured by default +to count PCIE BAR traffic to all root ports. + +Example usage: + +* Count event id 0x0 to root port 0 and 1 of PCIE RC-0 on socket 0:: + + perf stat -a -e nvidia_pcie_tgt_pmu_0_rc_0/event=0x0,dst_rp_mask=0x3/ + +* Count event id 0x1 for accesses to PCIE BAR or CXL HDM address range + 0x10000 to 0x100FF on socket 0's PCIE RC-1:: + + perf stat -a -e nvidia_pcie_tgt_pmu_0_rc_1/event=0x1,dst_addr_base=0x10000,dst_addr_mask=0xFFF00,dst_addr_en=0x1/ diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c index 61fde84ea343..bac83e424d6d 100644 --- a/drivers/perf/arm_cspmu/nvidia_cspmu.c +++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c @@ -42,6 +42,24 @@ #define NV_PCIE_V2_FILTER2_DST GENMASK_ULL(NV_PCIE_V2_DST_COUNT - 1, 0) #define NV_PCIE_V2_FILTER2_DEFAULT NV_PCIE_V2_FILTER2_DST +#define NV_PCIE_TGT_PORT_COUNT 8ULL +#define NV_PCIE_TGT_EV_TYPE_CC 0x4 +#define NV_PCIE_TGT_EV_TYPE_COUNT 3ULL +#define NV_PCIE_TGT_EV_TYPE_MASK GENMASK_ULL(NV_PCIE_TGT_EV_TYPE_COUNT - 1, 0) +#define NV_PCIE_TGT_FILTER2_MASK GENMASK_ULL(NV_PCIE_TGT_PORT_COUNT, 0) +#define NV_PCIE_TGT_FILTER2_PORT GENMASK_ULL(NV_PCIE_TGT_PORT_COUNT - 1, 0) +#define NV_PCIE_TGT_FILTER2_ADDR_EN BIT(NV_PCIE_TGT_PORT_COUNT) +#define NV_PCIE_TGT_FILTER2_ADDR GENMASK_ULL(15, NV_PCIE_TGT_PORT_COUNT) +#define NV_PCIE_TGT_FILTER2_DEFAULT NV_PCIE_TGT_FILTER2_PORT + +#define NV_PCIE_TGT_ADDR_COUNT 8ULL +#define NV_PCIE_TGT_ADDR_STRIDE 20 +#define NV_PCIE_TGT_ADDR_CTRL 0xD38 +#define NV_PCIE_TGT_ADDR_BASE_LO 0xD3C +#define NV_PCIE_TGT_ADDR_BASE_HI 0xD40 +#define NV_PCIE_TGT_ADDR_MASK_LO 0xD44 +#define NV_PCIE_TGT_ADDR_MASK_HI 0xD48 + #define NV_GENERIC_FILTER_ID_MASK GENMASK_ULL(31, 0) #define NV_PRODID_MASK (PMIIDR_PRODUCTID | PMIIDR_VARIANT | PMIIDR_REVISION) @@ -185,6 +203,15 @@ static struct attribute *pcie_v2_pmu_event_attrs[] = { NULL }; +static struct attribute *pcie_tgt_pmu_event_attrs[] = { + ARM_CSPMU_EVENT_ATTR(rd_bytes, 0x0), + ARM_CSPMU_EVENT_ATTR(wr_bytes, 0x1), + ARM_CSPMU_EVENT_ATTR(rd_req, 0x2), + ARM_CSPMU_EVENT_ATTR(wr_req, 0x3), + ARM_CSPMU_EVENT_ATTR(cycles, NV_PCIE_TGT_EV_TYPE_CC), + NULL +}; + static struct attribute *generic_pmu_event_attrs[] = { ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), NULL, @@ -238,6 +265,15 @@ static struct attribute *pcie_v2_pmu_format_attrs[] = { NULL }; +static struct attribute *pcie_tgt_pmu_format_attrs[] = { + ARM_CSPMU_FORMAT_ATTR(event, "config:0-2"), + ARM_CSPMU_FORMAT_ATTR(dst_rp_mask, "config:3-10"), + ARM_CSPMU_FORMAT_ATTR(dst_addr_en, "config:11"), + ARM_CSPMU_FORMAT_ATTR(dst_addr_base, "config1:0-63"), + ARM_CSPMU_FORMAT_ATTR(dst_addr_mask, "config2:0-63"), + NULL +}; + static struct attribute *generic_pmu_format_attrs[] = { ARM_CSPMU_FORMAT_EVENT_ATTR, ARM_CSPMU_FORMAT_FILTER_ATTR, @@ -477,6 +513,267 @@ static int pcie_v2_pmu_validate_event(struct arm_cspmu *cspmu, return 0; } +struct pcie_tgt_addr_filter { + u32 refcount; + u64 base; + u64 mask; +}; + +struct pcie_tgt_data { + struct pcie_tgt_addr_filter addr_filter[NV_PCIE_TGT_ADDR_COUNT]; + void __iomem *addr_filter_reg; +}; + +#if defined(CONFIG_ACPI) && defined(CONFIG_ARM64) +static int pcie_tgt_init_data(struct arm_cspmu *cspmu) +{ + int ret; + struct acpi_device *adev; + struct pcie_tgt_data *data; + struct list_head resource_list; + struct resource_entry *rentry; + struct nv_cspmu_ctx *ctx = to_nv_cspmu_ctx(cspmu); + struct device *dev = cspmu->dev; + + data = devm_kzalloc(dev, sizeof(struct pcie_tgt_data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + adev = arm_cspmu_acpi_dev_get(cspmu); + if (!adev) { + dev_err(dev, "failed to get associated PCIE-TGT device\n"); + return -ENODEV; + } + + INIT_LIST_HEAD(&resource_list); + ret = acpi_dev_get_memory_resources(adev, &resource_list); + if (ret < 0) { + dev_err(dev, "failed to get PCIE-TGT device memory resources\n"); + acpi_dev_put(adev); + return ret; + } + + rentry = list_first_entry_or_null( + &resource_list, struct resource_entry, node); + if (rentry) { + data->addr_filter_reg = devm_ioremap_resource(dev, rentry->res); + ret = 0; + } + + if (IS_ERR(data->addr_filter_reg)) { + dev_err(dev, "failed to get address filter resource\n"); + ret = PTR_ERR(data->addr_filter_reg); + } + + acpi_dev_free_resource_list(&resource_list); + acpi_dev_put(adev); + + ctx->data = data; + + return ret; +} +#else +static int pcie_tgt_init_data(struct arm_cspmu *cspmu) +{ + return -ENODEV; +} +#endif + +static struct pcie_tgt_data *pcie_tgt_get_data(struct arm_cspmu *cspmu) +{ + struct nv_cspmu_ctx *ctx = to_nv_cspmu_ctx(cspmu); + + return ctx->data; +} + +/* Find the first available address filter slot. */ +static int pcie_tgt_find_addr_idx(struct arm_cspmu *cspmu, u64 base, u64 mask, + bool is_reset) +{ + int i; + struct pcie_tgt_data *data = pcie_tgt_get_data(cspmu); + + for (i = 0; i < NV_PCIE_TGT_ADDR_COUNT; i++) { + if (!is_reset && data->addr_filter[i].refcount == 0) + return i; + + if (data->addr_filter[i].base == base && + data->addr_filter[i].mask == mask) + return i; + } + + return -ENODEV; +} + +static u32 pcie_tgt_pmu_event_filter(const struct perf_event *event) +{ + u32 filter; + + filter = (event->attr.config >> NV_PCIE_TGT_EV_TYPE_COUNT) & + NV_PCIE_TGT_FILTER2_MASK; + + return filter; +} + +static bool pcie_tgt_pmu_addr_en(const struct perf_event *event) +{ + u32 filter = pcie_tgt_pmu_event_filter(event); + + return FIELD_GET(NV_PCIE_TGT_FILTER2_ADDR_EN, filter) != 0; +} + +static u32 pcie_tgt_pmu_port_filter(const struct perf_event *event) +{ + u32 filter = pcie_tgt_pmu_event_filter(event); + + return FIELD_GET(NV_PCIE_TGT_FILTER2_PORT, filter); +} + +static u64 pcie_tgt_pmu_dst_addr_base(const struct perf_event *event) +{ + return event->attr.config1; +} + +static u64 pcie_tgt_pmu_dst_addr_mask(const struct perf_event *event) +{ + return event->attr.config2; +} + +static int pcie_tgt_pmu_validate_event(struct arm_cspmu *cspmu, + struct perf_event *new_ev) +{ + u64 base, mask; + int idx; + + if (!pcie_tgt_pmu_addr_en(new_ev)) + return 0; + + /* Make sure there is a slot available for the address filter. */ + base = pcie_tgt_pmu_dst_addr_base(new_ev); + mask = pcie_tgt_pmu_dst_addr_mask(new_ev); + idx = pcie_tgt_find_addr_idx(cspmu, base, mask, false); + if (idx < 0) + return -EINVAL; + + return 0; +} + +static void pcie_tgt_pmu_config_addr_filter(struct arm_cspmu *cspmu, + bool en, u64 base, u64 mask, int idx) +{ + struct pcie_tgt_data *data; + struct pcie_tgt_addr_filter *filter; + void __iomem *filter_reg; + + data = pcie_tgt_get_data(cspmu); + filter = &data->addr_filter[idx]; + filter_reg = data->addr_filter_reg + (idx * NV_PCIE_TGT_ADDR_STRIDE); + + if (en) { + filter->refcount++; + if (filter->refcount == 1) { + filter->base = base; + filter->mask = mask; + + writel(lower_32_bits(base), filter_reg + NV_PCIE_TGT_ADDR_BASE_LO); + writel(upper_32_bits(base), filter_reg + NV_PCIE_TGT_ADDR_BASE_HI); + writel(lower_32_bits(mask), filter_reg + NV_PCIE_TGT_ADDR_MASK_LO); + writel(upper_32_bits(mask), filter_reg + NV_PCIE_TGT_ADDR_MASK_HI); + writel(1, filter_reg + NV_PCIE_TGT_ADDR_CTRL); + } + } else { + filter->refcount--; + if (filter->refcount == 0) { + writel(0, filter_reg + NV_PCIE_TGT_ADDR_CTRL); + writel(0, filter_reg + NV_PCIE_TGT_ADDR_BASE_LO); + writel(0, filter_reg + NV_PCIE_TGT_ADDR_BASE_HI); + writel(0, filter_reg + NV_PCIE_TGT_ADDR_MASK_LO); + writel(0, filter_reg + NV_PCIE_TGT_ADDR_MASK_HI); + + filter->base = 0; + filter->mask = 0; + } + } +} + +static void pcie_tgt_pmu_set_ev_filter(struct arm_cspmu *cspmu, + const struct perf_event *event) +{ + bool addr_filter_en; + int idx; + u32 filter2_val, filter2_offset, port_filter; + u64 base, mask; + + filter2_val = 0; + filter2_offset = PMEVFILT2R + (4 * event->hw.idx); + + addr_filter_en = pcie_tgt_pmu_addr_en(event); + if (addr_filter_en) { + base = pcie_tgt_pmu_dst_addr_base(event); + mask = pcie_tgt_pmu_dst_addr_mask(event); + idx = pcie_tgt_find_addr_idx(cspmu, base, mask, false); + + if (idx < 0) { + dev_err(cspmu->dev, + "Unable to find a slot for address filtering\n"); + writel(0, cspmu->base0 + filter2_offset); + return; + } + + /* Configure address range filter registers.*/ + pcie_tgt_pmu_config_addr_filter(cspmu, true, base, mask, idx); + + /* Config the counter to use the selected address filter slot. */ + filter2_val |= FIELD_PREP(NV_PCIE_TGT_FILTER2_ADDR, 1U << idx); + } + + port_filter = pcie_tgt_pmu_port_filter(event); + + /* Monitor all ports if no filter is selected. */ + if (!addr_filter_en && port_filter == 0) + port_filter = NV_PCIE_TGT_FILTER2_PORT; + + filter2_val |= FIELD_PREP(NV_PCIE_TGT_FILTER2_PORT, port_filter); + + writel(filter2_val, cspmu->base0 + filter2_offset); +} + +static void pcie_tgt_pmu_reset_ev_filter(struct arm_cspmu *cspmu, + const struct perf_event *event) +{ + bool addr_filter_en; + u64 base, mask; + int idx; + + addr_filter_en = pcie_tgt_pmu_addr_en(event); + if (!addr_filter_en) + return; + + base = pcie_tgt_pmu_dst_addr_base(event); + mask = pcie_tgt_pmu_dst_addr_mask(event); + idx = pcie_tgt_find_addr_idx(cspmu, base, mask, true); + + if (idx < 0) { + dev_err(cspmu->dev, + "Unable to find the address filter slot to reset\n"); + return; + } + + pcie_tgt_pmu_config_addr_filter(cspmu, false, base, mask, idx); +} + +static u32 pcie_tgt_pmu_event_type(const struct perf_event *event) +{ + return event->attr.config & NV_PCIE_TGT_EV_TYPE_MASK; +} + +static bool pcie_tgt_pmu_is_cycle_counter_event(const struct perf_event *event) +{ + u32 event_type = pcie_tgt_pmu_event_type(event); + + return event_type == NV_PCIE_TGT_EV_TYPE_CC; +} + enum nv_cspmu_name_fmt { NAME_FMT_GENERIC, NAME_FMT_SOCKET, @@ -618,6 +915,28 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { .reset_ev_filter = nv_cspmu_reset_ev_filter, } }, + { + .prodid = 0x10700000, + .prodid_mask = NV_PRODID_MASK, + .name_pattern = "nvidia_pcie_tgt_pmu_%u_rc_%u", + .name_fmt = NAME_FMT_SOCKET_INST, + .template_ctx = { + .event_attr = pcie_tgt_pmu_event_attrs, + .format_attr = pcie_tgt_pmu_format_attrs, + .filter_mask = 0x0, + .filter_default_val = 0x0, + .filter2_mask = NV_PCIE_TGT_FILTER2_MASK, + .filter2_default_val = NV_PCIE_TGT_FILTER2_DEFAULT, + .init_data = pcie_tgt_init_data + }, + .ops = { + .is_cycle_counter_event = pcie_tgt_pmu_is_cycle_counter_event, + .event_type = pcie_tgt_pmu_event_type, + .validate_event = pcie_tgt_pmu_validate_event, + .set_ev_filter = pcie_tgt_pmu_set_ev_filter, + .reset_ev_filter = pcie_tgt_pmu_reset_ev_filter, + } + }, { .prodid = 0, .prodid_mask = 0, @@ -710,6 +1029,8 @@ static int nv_cspmu_init_ops(struct arm_cspmu *cspmu) /* NVIDIA specific callbacks. */ SET_OP(validate_event, impl_ops, match, NULL); + SET_OP(event_type, impl_ops, match, NULL); + SET_OP(is_cycle_counter_event, impl_ops, match, NULL); SET_OP(set_cc_filter, impl_ops, match, nv_cspmu_set_cc_filter); SET_OP(set_ev_filter, impl_ops, match, nv_cspmu_set_ev_filter); SET_OP(reset_ev_filter, impl_ops, match, NULL); From 429b7638b2df5538e945aaa2cc189cf0d6e8fb3a Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Tue, 24 Mar 2026 01:29:50 +0000 Subject: [PATCH 029/109] perf: add NVIDIA Tegra410 CPU Memory Latency PMU Adds CPU Memory (CMEM) Latency PMU support in Tegra410 SOC. The PMU is used to measure latency between the edge of the Unified Coherence Fabric to the local system DRAM. Reviewed-by: Ilkka Koskinen Signed-off-by: Besar Wicaksono Signed-off-by: Will Deacon --- .../admin-guide/perf/nvidia-tegra410-pmu.rst | 25 + drivers/perf/Kconfig | 7 + drivers/perf/Makefile | 1 + drivers/perf/nvidia_t410_cmem_latency_pmu.c | 736 ++++++++++++++++++ 4 files changed, 769 insertions(+) create mode 100644 drivers/perf/nvidia_t410_cmem_latency_pmu.c diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst index c065764d41fe..9945c43f6a7a 100644 --- a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst +++ b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst @@ -8,6 +8,7 @@ metrics like memory bandwidth, latency, and utilization: * Unified Coherence Fabric (UCF) * PCIE * PCIE-TGT +* CPU Memory (CMEM) Latency PMU Driver ---------- @@ -344,3 +345,27 @@ Example usage: 0x10000 to 0x100FF on socket 0's PCIE RC-1:: perf stat -a -e nvidia_pcie_tgt_pmu_0_rc_1/event=0x1,dst_addr_base=0x10000,dst_addr_mask=0xFFF00,dst_addr_en=0x1/ + +CPU Memory (CMEM) Latency PMU +----------------------------- + +This PMU monitors latency events of memory read requests from the edge of the +Unified Coherence Fabric (UCF) to local CPU DRAM: + + * RD_REQ counters: count read requests (32B per request). + * RD_CUM_OUTS counters: accumulated outstanding request counter, which track + how many cycles the read requests are in flight. + * CYCLES counter: counts the number of elapsed cycles. + +The average latency is calculated as:: + + FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS + AVG_LATENCY_IN_CYCLES = RD_CUM_OUTS / RD_REQ + AVERAGE_LATENCY_IN_NS = AVG_LATENCY_IN_CYCLES / FREQ_IN_GHZ + +The events and configuration options of this PMU device are described in sysfs, +see /sys/bus/event_source/devices/nvidia_cmem_latency_pmu_. + +Example usage:: + + perf stat -a -e '{nvidia_cmem_latency_pmu_0/rd_req/,nvidia_cmem_latency_pmu_0/rd_cum_outs/,nvidia_cmem_latency_pmu_0/cycles/}' diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 638321fc9800..26e86067d8f9 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -311,4 +311,11 @@ config MARVELL_PEM_PMU Enable support for PCIe Interface performance monitoring on Marvell platform. +config NVIDIA_TEGRA410_CMEM_LATENCY_PMU + tristate "NVIDIA Tegra410 CPU Memory Latency PMU" + depends on ARM64 && ACPI + help + Enable perf support for CPU memory latency counters monitoring on + NVIDIA Tegra410 SoC. + endmenu diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index ea52711a87e3..4aa6aad393c2 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -35,3 +35,4 @@ obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o obj-$(CONFIG_ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += arm_cspmu/ obj-$(CONFIG_MESON_DDR_PMU) += amlogic/ obj-$(CONFIG_CXL_PMU) += cxl_pmu.o +obj-$(CONFIG_NVIDIA_TEGRA410_CMEM_LATENCY_PMU) += nvidia_t410_cmem_latency_pmu.o diff --git a/drivers/perf/nvidia_t410_cmem_latency_pmu.c b/drivers/perf/nvidia_t410_cmem_latency_pmu.c new file mode 100644 index 000000000000..acb8f5571522 --- /dev/null +++ b/drivers/perf/nvidia_t410_cmem_latency_pmu.c @@ -0,0 +1,736 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVIDIA Tegra410 CPU Memory (CMEM) Latency PMU driver. + * + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NUM_INSTANCES 14 + +/* Register offsets. */ +#define CMEM_LAT_CG_CTRL 0x800 +#define CMEM_LAT_CTRL 0x808 +#define CMEM_LAT_STATUS 0x810 +#define CMEM_LAT_CYCLE_CNTR 0x818 +#define CMEM_LAT_MC0_REQ_CNTR 0x820 +#define CMEM_LAT_MC0_AOR_CNTR 0x830 +#define CMEM_LAT_MC1_REQ_CNTR 0x838 +#define CMEM_LAT_MC1_AOR_CNTR 0x848 +#define CMEM_LAT_MC2_REQ_CNTR 0x850 +#define CMEM_LAT_MC2_AOR_CNTR 0x860 + +/* CMEM_LAT_CTRL values. */ +#define CMEM_LAT_CTRL_DISABLE 0x0ULL +#define CMEM_LAT_CTRL_ENABLE 0x1ULL +#define CMEM_LAT_CTRL_CLR 0x2ULL + +/* CMEM_LAT_CG_CTRL values. */ +#define CMEM_LAT_CG_CTRL_DISABLE 0x0ULL +#define CMEM_LAT_CG_CTRL_ENABLE 0x1ULL + +/* CMEM_LAT_STATUS register field. */ +#define CMEM_LAT_STATUS_CYCLE_OVF BIT(0) +#define CMEM_LAT_STATUS_MC0_AOR_OVF BIT(1) +#define CMEM_LAT_STATUS_MC0_REQ_OVF BIT(3) +#define CMEM_LAT_STATUS_MC1_AOR_OVF BIT(4) +#define CMEM_LAT_STATUS_MC1_REQ_OVF BIT(6) +#define CMEM_LAT_STATUS_MC2_AOR_OVF BIT(7) +#define CMEM_LAT_STATUS_MC2_REQ_OVF BIT(9) + +/* Events. */ +#define CMEM_LAT_EVENT_CYCLES 0x0 +#define CMEM_LAT_EVENT_REQ 0x1 +#define CMEM_LAT_EVENT_AOR 0x2 + +#define CMEM_LAT_NUM_EVENTS 0x3 +#define CMEM_LAT_MASK_EVENT 0x3 +#define CMEM_LAT_MAX_ACTIVE_EVENTS 32 + +#define CMEM_LAT_ACTIVE_CPU_MASK 0x0 +#define CMEM_LAT_ASSOCIATED_CPU_MASK 0x1 + +static unsigned long cmem_lat_pmu_cpuhp_state; + +struct cmem_lat_pmu_hw_events { + struct perf_event *events[CMEM_LAT_MAX_ACTIVE_EVENTS]; + DECLARE_BITMAP(used_ctrs, CMEM_LAT_MAX_ACTIVE_EVENTS); +}; + +struct cmem_lat_pmu { + struct pmu pmu; + struct device *dev; + const char *name; + const char *identifier; + void __iomem *base_broadcast; + void __iomem *base[NUM_INSTANCES]; + cpumask_t associated_cpus; + cpumask_t active_cpu; + struct hlist_node node; + struct cmem_lat_pmu_hw_events hw_events; +}; + +#define to_cmem_lat_pmu(p) \ + container_of(p, struct cmem_lat_pmu, pmu) + + +/* Get event type from perf_event. */ +static inline u32 get_event_type(struct perf_event *event) +{ + return (event->attr.config) & CMEM_LAT_MASK_EVENT; +} + +/* PMU operations. */ +static int cmem_lat_pmu_get_event_idx(struct cmem_lat_pmu_hw_events *hw_events, + struct perf_event *event) +{ + unsigned int idx; + + idx = find_first_zero_bit(hw_events->used_ctrs, CMEM_LAT_MAX_ACTIVE_EVENTS); + if (idx >= CMEM_LAT_MAX_ACTIVE_EVENTS) + return -EAGAIN; + + set_bit(idx, hw_events->used_ctrs); + + return idx; +} + +static bool cmem_lat_pmu_validate_event(struct pmu *pmu, + struct cmem_lat_pmu_hw_events *hw_events, + struct perf_event *event) +{ + int ret; + + if (is_software_event(event)) + return true; + + /* Reject groups spanning multiple HW PMUs. */ + if (event->pmu != pmu) + return false; + + ret = cmem_lat_pmu_get_event_idx(hw_events, event); + if (ret < 0) + return false; + + return true; +} + +/* Make sure the group of events can be scheduled at once on the PMU. */ +static bool cmem_lat_pmu_validate_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + struct cmem_lat_pmu_hw_events fake_hw_events; + + if (event->group_leader == event) + return true; + + memset(&fake_hw_events, 0, sizeof(fake_hw_events)); + + if (!cmem_lat_pmu_validate_event(event->pmu, &fake_hw_events, leader)) + return false; + + for_each_sibling_event(sibling, leader) { + if (!cmem_lat_pmu_validate_event(event->pmu, &fake_hw_events, sibling)) + return false; + } + + return cmem_lat_pmu_validate_event(event->pmu, &fake_hw_events, event); +} + +static int cmem_lat_pmu_event_init(struct perf_event *event) +{ + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + u32 event_type = get_event_type(event); + + if (event->attr.type != event->pmu->type || + event_type >= CMEM_LAT_NUM_EVENTS) + return -ENOENT; + + /* + * Sampling, per-process mode, and per-task counters are not supported + * since this PMU is shared across all CPUs. + */ + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) { + dev_dbg(cmem_lat_pmu->pmu.dev, + "Can't support sampling and per-process mode\n"); + return -EOPNOTSUPP; + } + + if (event->cpu < 0) { + dev_dbg(cmem_lat_pmu->pmu.dev, "Can't support per-task counters\n"); + return -EINVAL; + } + + /* + * Make sure the CPU assignment is on one of the CPUs associated with + * this PMU. + */ + if (!cpumask_test_cpu(event->cpu, &cmem_lat_pmu->associated_cpus)) { + dev_dbg(cmem_lat_pmu->pmu.dev, + "Requested cpu is not associated with the PMU\n"); + return -EINVAL; + } + + /* Enforce the current active CPU to handle the events in this PMU. */ + event->cpu = cpumask_first(&cmem_lat_pmu->active_cpu); + if (event->cpu >= nr_cpu_ids) + return -EINVAL; + + if (!cmem_lat_pmu_validate_group(event)) + return -EINVAL; + + hwc->idx = -1; + hwc->config = event_type; + + return 0; +} + +static u64 cmem_lat_pmu_read_status(struct cmem_lat_pmu *cmem_lat_pmu, + unsigned int inst) +{ + return readq(cmem_lat_pmu->base[inst] + CMEM_LAT_STATUS); +} + +static u64 cmem_lat_pmu_read_cycle_counter(struct perf_event *event) +{ + const unsigned int instance = 0; + u64 status; + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); + struct device *dev = cmem_lat_pmu->dev; + + /* + * Use the reading from first instance since all instances are + * identical. + */ + status = cmem_lat_pmu_read_status(cmem_lat_pmu, instance); + if (status & CMEM_LAT_STATUS_CYCLE_OVF) + dev_warn(dev, "Cycle counter overflow\n"); + + return readq(cmem_lat_pmu->base[instance] + CMEM_LAT_CYCLE_CNTR); +} + +static u64 cmem_lat_pmu_read_req_counter(struct perf_event *event) +{ + unsigned int i; + u64 status, val = 0; + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); + struct device *dev = cmem_lat_pmu->dev; + + /* Sum up the counts from all instances. */ + for (i = 0; i < NUM_INSTANCES; i++) { + status = cmem_lat_pmu_read_status(cmem_lat_pmu, i); + if (status & CMEM_LAT_STATUS_MC0_REQ_OVF) + dev_warn(dev, "MC0 request counter overflow\n"); + if (status & CMEM_LAT_STATUS_MC1_REQ_OVF) + dev_warn(dev, "MC1 request counter overflow\n"); + if (status & CMEM_LAT_STATUS_MC2_REQ_OVF) + dev_warn(dev, "MC2 request counter overflow\n"); + + val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC0_REQ_CNTR); + val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC1_REQ_CNTR); + val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC2_REQ_CNTR); + } + + return val; +} + +static u64 cmem_lat_pmu_read_aor_counter(struct perf_event *event) +{ + unsigned int i; + u64 status, val = 0; + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); + struct device *dev = cmem_lat_pmu->dev; + + /* Sum up the counts from all instances. */ + for (i = 0; i < NUM_INSTANCES; i++) { + status = cmem_lat_pmu_read_status(cmem_lat_pmu, i); + if (status & CMEM_LAT_STATUS_MC0_AOR_OVF) + dev_warn(dev, "MC0 AOR counter overflow\n"); + if (status & CMEM_LAT_STATUS_MC1_AOR_OVF) + dev_warn(dev, "MC1 AOR counter overflow\n"); + if (status & CMEM_LAT_STATUS_MC2_AOR_OVF) + dev_warn(dev, "MC2 AOR counter overflow\n"); + + val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC0_AOR_CNTR); + val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC1_AOR_CNTR); + val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC2_AOR_CNTR); + } + + return val; +} + +static u64 (*read_counter_fn[CMEM_LAT_NUM_EVENTS])(struct perf_event *) = { + [CMEM_LAT_EVENT_CYCLES] = cmem_lat_pmu_read_cycle_counter, + [CMEM_LAT_EVENT_REQ] = cmem_lat_pmu_read_req_counter, + [CMEM_LAT_EVENT_AOR] = cmem_lat_pmu_read_aor_counter, +}; + +static void cmem_lat_pmu_event_update(struct perf_event *event) +{ + u32 event_type; + u64 prev, now; + struct hw_perf_event *hwc = &event->hw; + + if (hwc->state & PERF_HES_STOPPED) + return; + + event_type = hwc->config; + + do { + prev = local64_read(&hwc->prev_count); + now = read_counter_fn[event_type](event); + } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev); + + local64_add(now - prev, &event->count); + + hwc->state |= PERF_HES_UPTODATE; +} + +static void cmem_lat_pmu_start(struct perf_event *event, int pmu_flags) +{ + event->hw.state = 0; +} + +static void cmem_lat_pmu_stop(struct perf_event *event, int pmu_flags) +{ + event->hw.state |= PERF_HES_STOPPED; +} + +static int cmem_lat_pmu_add(struct perf_event *event, int flags) +{ + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); + struct cmem_lat_pmu_hw_events *hw_events = &cmem_lat_pmu->hw_events; + struct hw_perf_event *hwc = &event->hw; + int idx; + + if (WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), + &cmem_lat_pmu->associated_cpus))) + return -ENOENT; + + idx = cmem_lat_pmu_get_event_idx(hw_events, event); + if (idx < 0) + return idx; + + hw_events->events[idx] = event; + hwc->idx = idx; + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + + if (flags & PERF_EF_START) + cmem_lat_pmu_start(event, PERF_EF_RELOAD); + + /* Propagate changes to the userspace mapping. */ + perf_event_update_userpage(event); + + return 0; +} + +static void cmem_lat_pmu_del(struct perf_event *event, int flags) +{ + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); + struct cmem_lat_pmu_hw_events *hw_events = &cmem_lat_pmu->hw_events; + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + cmem_lat_pmu_stop(event, PERF_EF_UPDATE); + + hw_events->events[idx] = NULL; + + clear_bit(idx, hw_events->used_ctrs); + + perf_event_update_userpage(event); +} + +static void cmem_lat_pmu_read(struct perf_event *event) +{ + cmem_lat_pmu_event_update(event); +} + +static inline void cmem_lat_pmu_cg_ctrl(struct cmem_lat_pmu *cmem_lat_pmu, + u64 val) +{ + writeq(val, cmem_lat_pmu->base_broadcast + CMEM_LAT_CG_CTRL); +} + +static inline void cmem_lat_pmu_ctrl(struct cmem_lat_pmu *cmem_lat_pmu, u64 val) +{ + writeq(val, cmem_lat_pmu->base_broadcast + CMEM_LAT_CTRL); +} + +static void cmem_lat_pmu_enable(struct pmu *pmu) +{ + bool disabled; + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu); + + disabled = bitmap_empty(cmem_lat_pmu->hw_events.used_ctrs, + CMEM_LAT_MAX_ACTIVE_EVENTS); + + if (disabled) + return; + + /* Enable all the counters. */ + cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_ENABLE); + cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_ENABLE); +} + +static void cmem_lat_pmu_disable(struct pmu *pmu) +{ + int idx; + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu); + + /* Disable all the counters. */ + cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_DISABLE); + + /* + * The counters will start from 0 again on restart. + * Update the events immediately to avoid losing the counts. + */ + for_each_set_bit(idx, cmem_lat_pmu->hw_events.used_ctrs, + CMEM_LAT_MAX_ACTIVE_EVENTS) { + struct perf_event *event = cmem_lat_pmu->hw_events.events[idx]; + + if (!event) + continue; + + cmem_lat_pmu_event_update(event); + + local64_set(&event->hw.prev_count, 0ULL); + } + + cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_CLR); + cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_DISABLE); +} + +/* PMU identifier attribute. */ + +static ssize_t cmem_lat_pmu_identifier_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(dev_get_drvdata(dev)); + + return sysfs_emit(page, "%s\n", cmem_lat_pmu->identifier); +} + +static struct device_attribute cmem_lat_pmu_identifier_attr = + __ATTR(identifier, 0444, cmem_lat_pmu_identifier_show, NULL); + +static struct attribute *cmem_lat_pmu_identifier_attrs[] = { + &cmem_lat_pmu_identifier_attr.attr, + NULL +}; + +static struct attribute_group cmem_lat_pmu_identifier_attr_group = { + .attrs = cmem_lat_pmu_identifier_attrs, +}; + +/* Format attributes. */ + +#define NV_PMU_EXT_ATTR(_name, _func, _config) \ + (&((struct dev_ext_attribute[]){ \ + { \ + .attr = __ATTR(_name, 0444, _func, NULL), \ + .var = (void *)_config \ + } \ + })[0].attr.attr) + +static struct attribute *cmem_lat_pmu_formats[] = { + NV_PMU_EXT_ATTR(event, device_show_string, "config:0-1"), + NULL +}; + +static const struct attribute_group cmem_lat_pmu_format_group = { + .name = "format", + .attrs = cmem_lat_pmu_formats, +}; + +/* Event attributes. */ + +static ssize_t cmem_lat_pmu_sysfs_event_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct perf_pmu_events_attr *pmu_attr; + + pmu_attr = container_of(attr, typeof(*pmu_attr), attr); + return sysfs_emit(buf, "event=0x%llx\n", pmu_attr->id); +} + +#define NV_PMU_EVENT_ATTR(_name, _config) \ + PMU_EVENT_ATTR_ID(_name, cmem_lat_pmu_sysfs_event_show, _config) + +static struct attribute *cmem_lat_pmu_events[] = { + NV_PMU_EVENT_ATTR(cycles, CMEM_LAT_EVENT_CYCLES), + NV_PMU_EVENT_ATTR(rd_req, CMEM_LAT_EVENT_REQ), + NV_PMU_EVENT_ATTR(rd_cum_outs, CMEM_LAT_EVENT_AOR), + NULL +}; + +static const struct attribute_group cmem_lat_pmu_events_group = { + .name = "events", + .attrs = cmem_lat_pmu_events, +}; + +/* Cpumask attributes. */ + +static ssize_t cmem_lat_pmu_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu); + struct dev_ext_attribute *eattr = + container_of(attr, struct dev_ext_attribute, attr); + unsigned long mask_id = (unsigned long)eattr->var; + const cpumask_t *cpumask; + + switch (mask_id) { + case CMEM_LAT_ACTIVE_CPU_MASK: + cpumask = &cmem_lat_pmu->active_cpu; + break; + case CMEM_LAT_ASSOCIATED_CPU_MASK: + cpumask = &cmem_lat_pmu->associated_cpus; + break; + default: + return 0; + } + return cpumap_print_to_pagebuf(true, buf, cpumask); +} + +#define NV_PMU_CPUMASK_ATTR(_name, _config) \ + NV_PMU_EXT_ATTR(_name, cmem_lat_pmu_cpumask_show, \ + (unsigned long)_config) + +static struct attribute *cmem_lat_pmu_cpumask_attrs[] = { + NV_PMU_CPUMASK_ATTR(cpumask, CMEM_LAT_ACTIVE_CPU_MASK), + NV_PMU_CPUMASK_ATTR(associated_cpus, CMEM_LAT_ASSOCIATED_CPU_MASK), + NULL +}; + +static const struct attribute_group cmem_lat_pmu_cpumask_attr_group = { + .attrs = cmem_lat_pmu_cpumask_attrs, +}; + +/* Per PMU device attribute groups. */ + +static const struct attribute_group *cmem_lat_pmu_attr_groups[] = { + &cmem_lat_pmu_identifier_attr_group, + &cmem_lat_pmu_format_group, + &cmem_lat_pmu_events_group, + &cmem_lat_pmu_cpumask_attr_group, + NULL +}; + +static int cmem_lat_pmu_cpu_online(unsigned int cpu, struct hlist_node *node) +{ + struct cmem_lat_pmu *cmem_lat_pmu = + hlist_entry_safe(node, struct cmem_lat_pmu, node); + + if (!cpumask_test_cpu(cpu, &cmem_lat_pmu->associated_cpus)) + return 0; + + /* If the PMU is already managed, there is nothing to do */ + if (!cpumask_empty(&cmem_lat_pmu->active_cpu)) + return 0; + + /* Use this CPU for event counting */ + cpumask_set_cpu(cpu, &cmem_lat_pmu->active_cpu); + + return 0; +} + +static int cmem_lat_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node) +{ + unsigned int dst; + + struct cmem_lat_pmu *cmem_lat_pmu = + hlist_entry_safe(node, struct cmem_lat_pmu, node); + + /* Nothing to do if this CPU doesn't own the PMU */ + if (!cpumask_test_and_clear_cpu(cpu, &cmem_lat_pmu->active_cpu)) + return 0; + + /* Choose a new CPU to migrate ownership of the PMU to */ + dst = cpumask_any_and_but(&cmem_lat_pmu->associated_cpus, + cpu_online_mask, cpu); + if (dst >= nr_cpu_ids) + return 0; + + /* Use this CPU for event counting */ + perf_pmu_migrate_context(&cmem_lat_pmu->pmu, cpu, dst); + cpumask_set_cpu(dst, &cmem_lat_pmu->active_cpu); + + return 0; +} + +static int cmem_lat_pmu_get_cpus(struct cmem_lat_pmu *cmem_lat_pmu, + unsigned int socket) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (cpu_to_node(cpu) == socket) + cpumask_set_cpu(cpu, &cmem_lat_pmu->associated_cpus); + } + + if (cpumask_empty(&cmem_lat_pmu->associated_cpus)) { + dev_dbg(cmem_lat_pmu->dev, + "No cpu associated with PMU socket-%u\n", socket); + return -ENODEV; + } + + return 0; +} + +static int cmem_lat_pmu_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct acpi_device *acpi_dev; + struct cmem_lat_pmu *cmem_lat_pmu; + char *name, *uid_str; + int ret, i; + u32 socket; + + acpi_dev = ACPI_COMPANION(dev); + if (!acpi_dev) + return -ENODEV; + + uid_str = acpi_device_uid(acpi_dev); + if (!uid_str) + return -ENODEV; + + ret = kstrtou32(uid_str, 0, &socket); + if (ret) + return ret; + + cmem_lat_pmu = devm_kzalloc(dev, sizeof(*cmem_lat_pmu), GFP_KERNEL); + name = devm_kasprintf(dev, GFP_KERNEL, "nvidia_cmem_latency_pmu_%u", socket); + if (!cmem_lat_pmu || !name) + return -ENOMEM; + + cmem_lat_pmu->dev = dev; + cmem_lat_pmu->name = name; + cmem_lat_pmu->identifier = acpi_device_hid(acpi_dev); + platform_set_drvdata(pdev, cmem_lat_pmu); + + cmem_lat_pmu->pmu = (struct pmu) { + .parent = &pdev->dev, + .task_ctx_nr = perf_invalid_context, + .pmu_enable = cmem_lat_pmu_enable, + .pmu_disable = cmem_lat_pmu_disable, + .event_init = cmem_lat_pmu_event_init, + .add = cmem_lat_pmu_add, + .del = cmem_lat_pmu_del, + .start = cmem_lat_pmu_start, + .stop = cmem_lat_pmu_stop, + .read = cmem_lat_pmu_read, + .attr_groups = cmem_lat_pmu_attr_groups, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE | + PERF_PMU_CAP_NO_INTERRUPT, + }; + + /* Map the address of all the instances. */ + for (i = 0; i < NUM_INSTANCES; i++) { + cmem_lat_pmu->base[i] = devm_platform_ioremap_resource(pdev, i); + if (IS_ERR(cmem_lat_pmu->base[i])) { + dev_err(dev, "Failed map address for instance %d\n", i); + return PTR_ERR(cmem_lat_pmu->base[i]); + } + } + + /* Map broadcast address. */ + cmem_lat_pmu->base_broadcast = devm_platform_ioremap_resource(pdev, + NUM_INSTANCES); + if (IS_ERR(cmem_lat_pmu->base_broadcast)) { + dev_err(dev, "Failed map broadcast address\n"); + return PTR_ERR(cmem_lat_pmu->base_broadcast); + } + + ret = cmem_lat_pmu_get_cpus(cmem_lat_pmu, socket); + if (ret) + return ret; + + ret = cpuhp_state_add_instance(cmem_lat_pmu_cpuhp_state, + &cmem_lat_pmu->node); + if (ret) { + dev_err(&pdev->dev, "Error %d registering hotplug\n", ret); + return ret; + } + + cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_ENABLE); + cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_CLR); + cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_DISABLE); + + ret = perf_pmu_register(&cmem_lat_pmu->pmu, name, -1); + if (ret) { + dev_err(&pdev->dev, "Failed to register PMU: %d\n", ret); + cpuhp_state_remove_instance(cmem_lat_pmu_cpuhp_state, + &cmem_lat_pmu->node); + return ret; + } + + dev_dbg(&pdev->dev, "Registered %s PMU\n", name); + + return 0; +} + +static void cmem_lat_pmu_device_remove(struct platform_device *pdev) +{ + struct cmem_lat_pmu *cmem_lat_pmu = platform_get_drvdata(pdev); + + perf_pmu_unregister(&cmem_lat_pmu->pmu); + cpuhp_state_remove_instance(cmem_lat_pmu_cpuhp_state, + &cmem_lat_pmu->node); +} + +static const struct acpi_device_id cmem_lat_pmu_acpi_match[] = { + { "NVDA2021" }, + { } +}; +MODULE_DEVICE_TABLE(acpi, cmem_lat_pmu_acpi_match); + +static struct platform_driver cmem_lat_pmu_driver = { + .driver = { + .name = "nvidia-t410-cmem-latency-pmu", + .acpi_match_table = ACPI_PTR(cmem_lat_pmu_acpi_match), + .suppress_bind_attrs = true, + }, + .probe = cmem_lat_pmu_probe, + .remove = cmem_lat_pmu_device_remove, +}; + +static int __init cmem_lat_pmu_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "perf/nvidia/cmem_latency:online", + cmem_lat_pmu_cpu_online, + cmem_lat_pmu_cpu_teardown); + if (ret < 0) + return ret; + + cmem_lat_pmu_cpuhp_state = ret; + + return platform_driver_register(&cmem_lat_pmu_driver); +} + +static void __exit cmem_lat_pmu_exit(void) +{ + platform_driver_unregister(&cmem_lat_pmu_driver); + cpuhp_remove_multi_state(cmem_lat_pmu_cpuhp_state); +} + +module_init(cmem_lat_pmu_init); +module_exit(cmem_lat_pmu_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("NVIDIA Tegra410 CPU Memory Latency PMU driver"); +MODULE_AUTHOR("Besar Wicaksono "); From 2f89b7f78c50ca973ca035ceb30426f78d9e0996 Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Tue, 24 Mar 2026 01:29:51 +0000 Subject: [PATCH 030/109] perf: add NVIDIA Tegra410 C2C PMU Adds NVIDIA C2C PMU support in Tegra410 SOC. This PMU is used to measure memory latency between the SOC and device memory, e.g GPU Memory (GMEM), CXL Memory, or memory on remote Tegra410 SOC. Reviewed-by: Ilkka Koskinen Signed-off-by: Besar Wicaksono Signed-off-by: Will Deacon --- .../admin-guide/perf/nvidia-tegra410-pmu.rst | 151 +++ drivers/perf/Kconfig | 7 + drivers/perf/Makefile | 1 + drivers/perf/nvidia_t410_c2c_pmu.c | 1051 +++++++++++++++++ 4 files changed, 1210 insertions(+) create mode 100644 drivers/perf/nvidia_t410_c2c_pmu.c diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst index 9945c43f6a7a..0656223b61d4 100644 --- a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst +++ b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst @@ -9,6 +9,9 @@ metrics like memory bandwidth, latency, and utilization: * PCIE * PCIE-TGT * CPU Memory (CMEM) Latency +* NVLink-C2C +* NV-CLink +* NV-DLink PMU Driver ---------- @@ -369,3 +372,151 @@ see /sys/bus/event_source/devices/nvidia_cmem_latency_pmu_. Example usage:: perf stat -a -e '{nvidia_cmem_latency_pmu_0/rd_req/,nvidia_cmem_latency_pmu_0/rd_cum_outs/,nvidia_cmem_latency_pmu_0/cycles/}' + +NVLink-C2C PMU +-------------- + +This PMU monitors latency events of memory read/write requests that pass through +the NVIDIA Chip-to-Chip (C2C) interface. Bandwidth events are not available +in this PMU, unlike the C2C PMU in Grace (Tegra241 SoC). + +The events and configuration options of this PMU device are available in sysfs, +see /sys/bus/event_source/devices/nvidia_nvlink_c2c_pmu_. + +The list of events: + + * IN_RD_CUM_OUTS: accumulated outstanding request (in cycles) of incoming read requests. + * IN_RD_REQ: the number of incoming read requests. + * IN_WR_CUM_OUTS: accumulated outstanding request (in cycles) of incoming write requests. + * IN_WR_REQ: the number of incoming write requests. + * OUT_RD_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing read requests. + * OUT_RD_REQ: the number of outgoing read requests. + * OUT_WR_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing write requests. + * OUT_WR_REQ: the number of outgoing write requests. + * CYCLES: NVLink-C2C interface cycle counts. + +The incoming events count the reads/writes from remote device to the SoC. +The outgoing events count the reads/writes from the SoC to remote device. + +The sysfs /sys/bus/event_source/devices/nvidia_nvlink_c2c_pmu_/peer +contains the information about the connected device. + +When the C2C interface is connected to GPU(s), the user can use the +"gpu_mask" parameter to filter traffic to/from specific GPU(s). Each bit represents the GPU +index, e.g. "gpu_mask=0x1" corresponds to GPU 0 and "gpu_mask=0x3" is for GPU 0 and 1. +The PMU will monitor all GPUs by default if not specified. + +When connected to another SoC, only the read events are available. + +The events can be used to calculate the average latency of the read/write requests:: + + C2C_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS + + IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ + IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ + + IN_WR_AVG_LATENCY_IN_CYCLES = IN_WR_CUM_OUTS / IN_WR_REQ + IN_WR_AVG_LATENCY_IN_NS = IN_WR_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ + + OUT_RD_AVG_LATENCY_IN_CYCLES = OUT_RD_CUM_OUTS / OUT_RD_REQ + OUT_RD_AVG_LATENCY_IN_NS = OUT_RD_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ + + OUT_WR_AVG_LATENCY_IN_CYCLES = OUT_WR_CUM_OUTS / OUT_WR_REQ + OUT_WR_AVG_LATENCY_IN_NS = OUT_WR_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ + +Example usage: + + * Count incoming traffic from all GPUs connected via NVLink-C2C:: + + perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_req/ + + * Count incoming traffic from GPU 0 connected via NVLink-C2C:: + + perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_cum_outs,gpu_mask=0x1/ + + * Count incoming traffic from GPU 1 connected via NVLink-C2C:: + + perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_cum_outs,gpu_mask=0x2/ + + * Count outgoing traffic to all GPUs connected via NVLink-C2C:: + + perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_req/ + + * Count outgoing traffic to GPU 0 connected via NVLink-C2C:: + + perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_cum_outs,gpu_mask=0x1/ + + * Count outgoing traffic to GPU 1 connected via NVLink-C2C:: + + perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_cum_outs,gpu_mask=0x2/ + +NV-CLink PMU +------------ + +This PMU monitors latency events of memory read requests that pass through +the NV-CLINK interface. Bandwidth events are not available in this PMU. +In Tegra410 SoC, the NV-CLink interface is used to connect to another Tegra410 +SoC and this PMU only counts read traffic. + +The events and configuration options of this PMU device are available in sysfs, +see /sys/bus/event_source/devices/nvidia_nvclink_pmu_. + +The list of events: + + * IN_RD_CUM_OUTS: accumulated outstanding request (in cycles) of incoming read requests. + * IN_RD_REQ: the number of incoming read requests. + * OUT_RD_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing read requests. + * OUT_RD_REQ: the number of outgoing read requests. + * CYCLES: NV-CLINK interface cycle counts. + +The incoming events count the reads from remote device to the SoC. +The outgoing events count the reads from the SoC to remote device. + +The events can be used to calculate the average latency of the read requests:: + + CLINK_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS + + IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ + IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / CLINK_FREQ_IN_GHZ + + OUT_RD_AVG_LATENCY_IN_CYCLES = OUT_RD_CUM_OUTS / OUT_RD_REQ + OUT_RD_AVG_LATENCY_IN_NS = OUT_RD_AVG_LATENCY_IN_CYCLES / CLINK_FREQ_IN_GHZ + +Example usage: + + * Count incoming read traffic from remote SoC connected via NV-CLINK:: + + perf stat -a -e nvidia_nvclink_pmu_0/in_rd_req/ + + * Count outgoing read traffic to remote SoC connected via NV-CLINK:: + + perf stat -a -e nvidia_nvclink_pmu_0/out_rd_req/ + +NV-DLink PMU +------------ + +This PMU monitors latency events of memory read requests that pass through +the NV-DLINK interface. Bandwidth events are not available in this PMU. +In Tegra410 SoC, this PMU only counts CXL memory read traffic. + +The events and configuration options of this PMU device are available in sysfs, +see /sys/bus/event_source/devices/nvidia_nvdlink_pmu_. + +The list of events: + + * IN_RD_CUM_OUTS: accumulated outstanding read requests (in cycles) to CXL memory. + * IN_RD_REQ: the number of read requests to CXL memory. + * CYCLES: NV-DLINK interface cycle counts. + +The events can be used to calculate the average latency of the read requests:: + + DLINK_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS + + IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ + IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / DLINK_FREQ_IN_GHZ + +Example usage: + + * Count read events to CXL memory:: + + perf stat -a -e '{nvidia_nvdlink_pmu_0/in_rd_req/,nvidia_nvdlink_pmu_0/in_rd_cum_outs/}' diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 26e86067d8f9..ab90932fc2d0 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -318,4 +318,11 @@ config NVIDIA_TEGRA410_CMEM_LATENCY_PMU Enable perf support for CPU memory latency counters monitoring on NVIDIA Tegra410 SoC. +config NVIDIA_TEGRA410_C2C_PMU + tristate "NVIDIA Tegra410 C2C PMU" + depends on ARM64 && ACPI + help + Enable perf support for counters in NVIDIA C2C interface of NVIDIA + Tegra410 SoC. + endmenu diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index 4aa6aad393c2..eb8a022dad9a 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -36,3 +36,4 @@ obj-$(CONFIG_ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += arm_cspmu/ obj-$(CONFIG_MESON_DDR_PMU) += amlogic/ obj-$(CONFIG_CXL_PMU) += cxl_pmu.o obj-$(CONFIG_NVIDIA_TEGRA410_CMEM_LATENCY_PMU) += nvidia_t410_cmem_latency_pmu.o +obj-$(CONFIG_NVIDIA_TEGRA410_C2C_PMU) += nvidia_t410_c2c_pmu.o diff --git a/drivers/perf/nvidia_t410_c2c_pmu.c b/drivers/perf/nvidia_t410_c2c_pmu.c new file mode 100644 index 000000000000..411987153ff3 --- /dev/null +++ b/drivers/perf/nvidia_t410_c2c_pmu.c @@ -0,0 +1,1051 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVIDIA Tegra410 C2C PMU driver. + * + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* The C2C interface types in Tegra410. */ +#define C2C_TYPE_NVLINK 0x0 +#define C2C_TYPE_NVCLINK 0x1 +#define C2C_TYPE_NVDLINK 0x2 +#define C2C_TYPE_COUNT 0x3 + +/* The type of the peer device connected to the C2C interface. */ +#define C2C_PEER_TYPE_CPU 0x0 +#define C2C_PEER_TYPE_GPU 0x1 +#define C2C_PEER_TYPE_CXLMEM 0x2 +#define C2C_PEER_TYPE_COUNT 0x3 + +/* The number of peer devices can be connected to the C2C interface. */ +#define C2C_NR_PEER_CPU 0x1 +#define C2C_NR_PEER_GPU 0x2 +#define C2C_NR_PEER_CXLMEM 0x1 +#define C2C_NR_PEER_MAX 0x2 + +/* Number of instances on each interface. */ +#define C2C_NR_INST_NVLINK 14 +#define C2C_NR_INST_NVCLINK 12 +#define C2C_NR_INST_NVDLINK 16 +#define C2C_NR_INST_MAX 16 + +/* Register offsets. */ +#define C2C_CTRL 0x864 +#define C2C_IN_STATUS 0x868 +#define C2C_CYCLE_CNTR 0x86c +#define C2C_IN_RD_CUM_OUTS_CNTR 0x874 +#define C2C_IN_RD_REQ_CNTR 0x87c +#define C2C_IN_WR_CUM_OUTS_CNTR 0x884 +#define C2C_IN_WR_REQ_CNTR 0x88c +#define C2C_OUT_STATUS 0x890 +#define C2C_OUT_RD_CUM_OUTS_CNTR 0x898 +#define C2C_OUT_RD_REQ_CNTR 0x8a0 +#define C2C_OUT_WR_CUM_OUTS_CNTR 0x8a8 +#define C2C_OUT_WR_REQ_CNTR 0x8b0 + +/* C2C_IN_STATUS register field. */ +#define C2C_IN_STATUS_CYCLE_OVF BIT(0) +#define C2C_IN_STATUS_IN_RD_CUM_OUTS_OVF BIT(1) +#define C2C_IN_STATUS_IN_RD_REQ_OVF BIT(2) +#define C2C_IN_STATUS_IN_WR_CUM_OUTS_OVF BIT(3) +#define C2C_IN_STATUS_IN_WR_REQ_OVF BIT(4) + +/* C2C_OUT_STATUS register field. */ +#define C2C_OUT_STATUS_OUT_RD_CUM_OUTS_OVF BIT(0) +#define C2C_OUT_STATUS_OUT_RD_REQ_OVF BIT(1) +#define C2C_OUT_STATUS_OUT_WR_CUM_OUTS_OVF BIT(2) +#define C2C_OUT_STATUS_OUT_WR_REQ_OVF BIT(3) + +/* Events. */ +#define C2C_EVENT_CYCLES 0x0 +#define C2C_EVENT_IN_RD_CUM_OUTS 0x1 +#define C2C_EVENT_IN_RD_REQ 0x2 +#define C2C_EVENT_IN_WR_CUM_OUTS 0x3 +#define C2C_EVENT_IN_WR_REQ 0x4 +#define C2C_EVENT_OUT_RD_CUM_OUTS 0x5 +#define C2C_EVENT_OUT_RD_REQ 0x6 +#define C2C_EVENT_OUT_WR_CUM_OUTS 0x7 +#define C2C_EVENT_OUT_WR_REQ 0x8 + +#define C2C_NUM_EVENTS 0x9 +#define C2C_MASK_EVENT 0xFF +#define C2C_MAX_ACTIVE_EVENTS 32 + +#define C2C_ACTIVE_CPU_MASK 0x0 +#define C2C_ASSOCIATED_CPU_MASK 0x1 + +/* + * Maximum poll count for reading counter value using high-low-high sequence. + */ +#define HILOHI_MAX_POLL 1000 + +static unsigned long nv_c2c_pmu_cpuhp_state; + +/* PMU descriptor. */ + +/* C2C type information. */ +struct nv_c2c_pmu_data { + unsigned int c2c_type; + unsigned int nr_inst; + const char *name_fmt; +}; + +static const struct nv_c2c_pmu_data nv_c2c_pmu_data[] = { + [C2C_TYPE_NVLINK] = { + .c2c_type = C2C_TYPE_NVLINK, + .nr_inst = C2C_NR_INST_NVLINK, + .name_fmt = "nvidia_nvlink_c2c_pmu_%u", + }, + [C2C_TYPE_NVCLINK] = { + .c2c_type = C2C_TYPE_NVCLINK, + .nr_inst = C2C_NR_INST_NVCLINK, + .name_fmt = "nvidia_nvclink_pmu_%u", + }, + [C2C_TYPE_NVDLINK] = { + .c2c_type = C2C_TYPE_NVDLINK, + .nr_inst = C2C_NR_INST_NVDLINK, + .name_fmt = "nvidia_nvdlink_pmu_%u", + }, +}; + +/* Tracks the events assigned to the PMU for a given logical index. */ +struct nv_c2c_pmu_hw_events { + /* The events that are active. */ + struct perf_event *events[C2C_MAX_ACTIVE_EVENTS]; + + /* + * Each bit indicates a logical counter is being used (or not) for an + * event. + */ + DECLARE_BITMAP(used_ctrs, C2C_MAX_ACTIVE_EVENTS); +}; + +struct nv_c2c_pmu { + struct pmu pmu; + struct device *dev; + struct acpi_device *acpi_dev; + + const char *name; + const char *identifier; + + const struct nv_c2c_pmu_data *data; + unsigned int peer_type; + unsigned int socket; + unsigned int nr_peer; + unsigned long peer_insts[C2C_NR_PEER_MAX][BITS_TO_LONGS(C2C_NR_INST_MAX)]; + u32 filter_default; + + struct nv_c2c_pmu_hw_events hw_events; + + cpumask_t associated_cpus; + cpumask_t active_cpu; + + struct hlist_node cpuhp_node; + + const struct attribute_group **attr_groups; + + void __iomem *base_broadcast; + void __iomem *base[C2C_NR_INST_MAX]; +}; + +#define to_c2c_pmu(p) (container_of(p, struct nv_c2c_pmu, pmu)) + +/* Get event type from perf_event. */ +static inline u32 get_event_type(struct perf_event *event) +{ + return (event->attr.config) & C2C_MASK_EVENT; +} + +static inline u32 get_filter_mask(struct perf_event *event) +{ + u32 filter; + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu); + + filter = ((u32)event->attr.config1) & c2c_pmu->filter_default; + if (filter == 0) + filter = c2c_pmu->filter_default; + + return filter; +} + +/* PMU operations. */ + +static int nv_c2c_pmu_get_event_idx(struct nv_c2c_pmu_hw_events *hw_events, + struct perf_event *event) +{ + u32 idx; + + idx = find_first_zero_bit(hw_events->used_ctrs, C2C_MAX_ACTIVE_EVENTS); + if (idx >= C2C_MAX_ACTIVE_EVENTS) + return -EAGAIN; + + set_bit(idx, hw_events->used_ctrs); + + return idx; +} + +static bool +nv_c2c_pmu_validate_event(struct pmu *pmu, + struct nv_c2c_pmu_hw_events *hw_events, + struct perf_event *event) +{ + if (is_software_event(event)) + return true; + + /* Reject groups spanning multiple HW PMUs. */ + if (event->pmu != pmu) + return false; + + return nv_c2c_pmu_get_event_idx(hw_events, event) >= 0; +} + +/* + * Make sure the group of events can be scheduled at once + * on the PMU. + */ +static bool nv_c2c_pmu_validate_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + struct nv_c2c_pmu_hw_events fake_hw_events; + + if (event->group_leader == event) + return true; + + memset(&fake_hw_events, 0, sizeof(fake_hw_events)); + + if (!nv_c2c_pmu_validate_event(event->pmu, &fake_hw_events, leader)) + return false; + + for_each_sibling_event(sibling, leader) { + if (!nv_c2c_pmu_validate_event(event->pmu, &fake_hw_events, + sibling)) + return false; + } + + return nv_c2c_pmu_validate_event(event->pmu, &fake_hw_events, event); +} + +static int nv_c2c_pmu_event_init(struct perf_event *event) +{ + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + u32 event_type = get_event_type(event); + + if (event->attr.type != event->pmu->type || + event_type >= C2C_NUM_EVENTS) + return -ENOENT; + + /* + * Following other "uncore" PMUs, we do not support sampling mode or + * attach to a task (per-process mode). + */ + if (is_sampling_event(event)) { + dev_dbg(c2c_pmu->pmu.dev, "Can't support sampling events\n"); + return -EOPNOTSUPP; + } + + if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK) { + dev_dbg(c2c_pmu->pmu.dev, "Can't support per-task counters\n"); + return -EINVAL; + } + + /* + * Make sure the CPU assignment is on one of the CPUs associated with + * this PMU. + */ + if (!cpumask_test_cpu(event->cpu, &c2c_pmu->associated_cpus)) { + dev_dbg(c2c_pmu->pmu.dev, + "Requested cpu is not associated with the PMU\n"); + return -EINVAL; + } + + /* Enforce the current active CPU to handle the events in this PMU. */ + event->cpu = cpumask_first(&c2c_pmu->active_cpu); + if (event->cpu >= nr_cpu_ids) + return -EINVAL; + + if (!nv_c2c_pmu_validate_group(event)) + return -EINVAL; + + hwc->idx = -1; + hwc->config = event_type; + + return 0; +} + +/* + * Read 64-bit register as a pair of 32-bit registers using hi-lo-hi sequence. + */ +static u64 read_reg64_hilohi(const void __iomem *addr, u32 max_poll_count) +{ + u32 val_lo, val_hi; + u64 val; + + /* Use high-low-high sequence to avoid tearing */ + do { + if (max_poll_count-- == 0) { + pr_err("NV C2C PMU: timeout hi-low-high sequence\n"); + return 0; + } + + val_hi = readl(addr + 4); + val_lo = readl(addr); + } while (val_hi != readl(addr + 4)); + + val = (((u64)val_hi << 32) | val_lo); + + return val; +} + +static void nv_c2c_pmu_check_status(struct nv_c2c_pmu *c2c_pmu, u32 instance) +{ + u32 in_status, out_status; + + in_status = readl(c2c_pmu->base[instance] + C2C_IN_STATUS); + out_status = readl(c2c_pmu->base[instance] + C2C_OUT_STATUS); + + if (in_status || out_status) + dev_warn(c2c_pmu->dev, + "C2C PMU overflow in: 0x%x, out: 0x%x\n", + in_status, out_status); +} + +static u32 nv_c2c_ctr_offset[C2C_NUM_EVENTS] = { + [C2C_EVENT_CYCLES] = C2C_CYCLE_CNTR, + [C2C_EVENT_IN_RD_CUM_OUTS] = C2C_IN_RD_CUM_OUTS_CNTR, + [C2C_EVENT_IN_RD_REQ] = C2C_IN_RD_REQ_CNTR, + [C2C_EVENT_IN_WR_CUM_OUTS] = C2C_IN_WR_CUM_OUTS_CNTR, + [C2C_EVENT_IN_WR_REQ] = C2C_IN_WR_REQ_CNTR, + [C2C_EVENT_OUT_RD_CUM_OUTS] = C2C_OUT_RD_CUM_OUTS_CNTR, + [C2C_EVENT_OUT_RD_REQ] = C2C_OUT_RD_REQ_CNTR, + [C2C_EVENT_OUT_WR_CUM_OUTS] = C2C_OUT_WR_CUM_OUTS_CNTR, + [C2C_EVENT_OUT_WR_REQ] = C2C_OUT_WR_REQ_CNTR, +}; + +static u64 nv_c2c_pmu_read_counter(struct perf_event *event) +{ + u32 ctr_id, ctr_offset, filter_mask, filter_idx, inst_idx; + unsigned long *inst_mask; + DECLARE_BITMAP(filter_bitmap, C2C_NR_PEER_MAX); + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu); + u64 val = 0; + + filter_mask = get_filter_mask(event); + bitmap_from_arr32(filter_bitmap, &filter_mask, c2c_pmu->nr_peer); + + ctr_id = event->hw.config; + ctr_offset = nv_c2c_ctr_offset[ctr_id]; + + for_each_set_bit(filter_idx, filter_bitmap, c2c_pmu->nr_peer) { + inst_mask = c2c_pmu->peer_insts[filter_idx]; + for_each_set_bit(inst_idx, inst_mask, c2c_pmu->data->nr_inst) { + nv_c2c_pmu_check_status(c2c_pmu, inst_idx); + + /* + * Each instance share same clock and the driver always + * enables all instances. So we can use the counts from + * one instance for cycle counter. + */ + if (ctr_id == C2C_EVENT_CYCLES) + return read_reg64_hilohi( + c2c_pmu->base[inst_idx] + ctr_offset, + HILOHI_MAX_POLL); + + /* + * For other events, sum up the counts from all instances. + */ + val += read_reg64_hilohi( + c2c_pmu->base[inst_idx] + ctr_offset, + HILOHI_MAX_POLL); + } + } + + return val; +} + +static void nv_c2c_pmu_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev, now; + + do { + prev = local64_read(&hwc->prev_count); + now = nv_c2c_pmu_read_counter(event); + } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev); + + local64_add(now - prev, &event->count); +} + +static void nv_c2c_pmu_start(struct perf_event *event, int pmu_flags) +{ + event->hw.state = 0; +} + +static void nv_c2c_pmu_stop(struct perf_event *event, int pmu_flags) +{ + event->hw.state |= PERF_HES_STOPPED; +} + +static int nv_c2c_pmu_add(struct perf_event *event, int flags) +{ + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu); + struct nv_c2c_pmu_hw_events *hw_events = &c2c_pmu->hw_events; + struct hw_perf_event *hwc = &event->hw; + int idx; + + if (WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), + &c2c_pmu->associated_cpus))) + return -ENOENT; + + idx = nv_c2c_pmu_get_event_idx(hw_events, event); + if (idx < 0) + return idx; + + hw_events->events[idx] = event; + hwc->idx = idx; + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + + if (flags & PERF_EF_START) + nv_c2c_pmu_start(event, PERF_EF_RELOAD); + + /* Propagate changes to the userspace mapping. */ + perf_event_update_userpage(event); + + return 0; +} + +static void nv_c2c_pmu_del(struct perf_event *event, int flags) +{ + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu); + struct nv_c2c_pmu_hw_events *hw_events = &c2c_pmu->hw_events; + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + nv_c2c_pmu_stop(event, PERF_EF_UPDATE); + + hw_events->events[idx] = NULL; + + clear_bit(idx, hw_events->used_ctrs); + + perf_event_update_userpage(event); +} + +static void nv_c2c_pmu_read(struct perf_event *event) +{ + nv_c2c_pmu_event_update(event); +} + +static void nv_c2c_pmu_enable(struct pmu *pmu) +{ + void __iomem *bcast; + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(pmu); + + /* Check if any filter is enabled. */ + if (bitmap_empty(c2c_pmu->hw_events.used_ctrs, C2C_MAX_ACTIVE_EVENTS)) + return; + + /* Enable all the counters. */ + bcast = c2c_pmu->base_broadcast; + writel(0x1UL, bcast + C2C_CTRL); +} + +static void nv_c2c_pmu_disable(struct pmu *pmu) +{ + unsigned int idx; + void __iomem *bcast; + struct perf_event *event; + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(pmu); + + /* Disable all the counters. */ + bcast = c2c_pmu->base_broadcast; + writel(0x0UL, bcast + C2C_CTRL); + + /* + * The counters will start from 0 again on restart. + * Update the events immediately to avoid losing the counts. + */ + for_each_set_bit(idx, c2c_pmu->hw_events.used_ctrs, + C2C_MAX_ACTIVE_EVENTS) { + event = c2c_pmu->hw_events.events[idx]; + + if (!event) + continue; + + nv_c2c_pmu_event_update(event); + + local64_set(&event->hw.prev_count, 0ULL); + } +} + +/* PMU identifier attribute. */ + +static ssize_t nv_c2c_pmu_identifier_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(dev_get_drvdata(dev)); + + return sysfs_emit(page, "%s\n", c2c_pmu->identifier); +} + +static struct device_attribute nv_c2c_pmu_identifier_attr = + __ATTR(identifier, 0444, nv_c2c_pmu_identifier_show, NULL); + +static struct attribute *nv_c2c_pmu_identifier_attrs[] = { + &nv_c2c_pmu_identifier_attr.attr, + NULL, +}; + +static struct attribute_group nv_c2c_pmu_identifier_attr_group = { + .attrs = nv_c2c_pmu_identifier_attrs, +}; + +/* Peer attribute. */ + +static ssize_t nv_c2c_pmu_peer_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + const char *peer_type[C2C_PEER_TYPE_COUNT] = { + [C2C_PEER_TYPE_CPU] = "cpu", + [C2C_PEER_TYPE_GPU] = "gpu", + [C2C_PEER_TYPE_CXLMEM] = "cxlmem", + }; + + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(dev_get_drvdata(dev)); + return sysfs_emit(page, "nr_%s=%u\n", peer_type[c2c_pmu->peer_type], + c2c_pmu->nr_peer); +} + +static struct device_attribute nv_c2c_pmu_peer_attr = + __ATTR(peer, 0444, nv_c2c_pmu_peer_show, NULL); + +static struct attribute *nv_c2c_pmu_peer_attrs[] = { + &nv_c2c_pmu_peer_attr.attr, + NULL, +}; + +static struct attribute_group nv_c2c_pmu_peer_attr_group = { + .attrs = nv_c2c_pmu_peer_attrs, +}; + +/* Format attributes. */ + +#define NV_C2C_PMU_EXT_ATTR(_name, _func, _config) \ + (&((struct dev_ext_attribute[]){ \ + { \ + .attr = __ATTR(_name, 0444, _func, NULL), \ + .var = (void *)_config \ + } \ + })[0].attr.attr) + +#define NV_C2C_PMU_FORMAT_ATTR(_name, _config) \ + NV_C2C_PMU_EXT_ATTR(_name, device_show_string, _config) + +#define NV_C2C_PMU_FORMAT_EVENT_ATTR \ + NV_C2C_PMU_FORMAT_ATTR(event, "config:0-3") + +static struct attribute *nv_c2c_pmu_gpu_formats[] = { + NV_C2C_PMU_FORMAT_EVENT_ATTR, + NV_C2C_PMU_FORMAT_ATTR(gpu_mask, "config1:0-1"), + NULL, +}; + +static const struct attribute_group nv_c2c_pmu_gpu_format_group = { + .name = "format", + .attrs = nv_c2c_pmu_gpu_formats, +}; + +static struct attribute *nv_c2c_pmu_formats[] = { + NV_C2C_PMU_FORMAT_EVENT_ATTR, + NULL, +}; + +static const struct attribute_group nv_c2c_pmu_format_group = { + .name = "format", + .attrs = nv_c2c_pmu_formats, +}; + +/* Event attributes. */ + +static ssize_t nv_c2c_pmu_sysfs_event_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct perf_pmu_events_attr *pmu_attr; + + pmu_attr = container_of(attr, typeof(*pmu_attr), attr); + return sysfs_emit(buf, "event=0x%llx\n", pmu_attr->id); +} + +#define NV_C2C_PMU_EVENT_ATTR(_name, _config) \ + PMU_EVENT_ATTR_ID(_name, nv_c2c_pmu_sysfs_event_show, _config) + +static struct attribute *nv_c2c_pmu_gpu_events[] = { + NV_C2C_PMU_EVENT_ATTR(cycles, C2C_EVENT_CYCLES), + NV_C2C_PMU_EVENT_ATTR(in_rd_cum_outs, C2C_EVENT_IN_RD_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(in_rd_req, C2C_EVENT_IN_RD_REQ), + NV_C2C_PMU_EVENT_ATTR(in_wr_cum_outs, C2C_EVENT_IN_WR_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(in_wr_req, C2C_EVENT_IN_WR_REQ), + NV_C2C_PMU_EVENT_ATTR(out_rd_cum_outs, C2C_EVENT_OUT_RD_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(out_rd_req, C2C_EVENT_OUT_RD_REQ), + NV_C2C_PMU_EVENT_ATTR(out_wr_cum_outs, C2C_EVENT_OUT_WR_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(out_wr_req, C2C_EVENT_OUT_WR_REQ), + NULL +}; + +static const struct attribute_group nv_c2c_pmu_gpu_events_group = { + .name = "events", + .attrs = nv_c2c_pmu_gpu_events, +}; + +static struct attribute *nv_c2c_pmu_cpu_events[] = { + NV_C2C_PMU_EVENT_ATTR(cycles, C2C_EVENT_CYCLES), + NV_C2C_PMU_EVENT_ATTR(in_rd_cum_outs, C2C_EVENT_IN_RD_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(in_rd_req, C2C_EVENT_IN_RD_REQ), + NV_C2C_PMU_EVENT_ATTR(out_rd_cum_outs, C2C_EVENT_OUT_RD_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(out_rd_req, C2C_EVENT_OUT_RD_REQ), + NULL +}; + +static const struct attribute_group nv_c2c_pmu_cpu_events_group = { + .name = "events", + .attrs = nv_c2c_pmu_cpu_events, +}; + +static struct attribute *nv_c2c_pmu_cxlmem_events[] = { + NV_C2C_PMU_EVENT_ATTR(cycles, C2C_EVENT_CYCLES), + NV_C2C_PMU_EVENT_ATTR(in_rd_cum_outs, C2C_EVENT_IN_RD_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(in_rd_req, C2C_EVENT_IN_RD_REQ), + NULL +}; + +static const struct attribute_group nv_c2c_pmu_cxlmem_events_group = { + .name = "events", + .attrs = nv_c2c_pmu_cxlmem_events, +}; + +/* Cpumask attributes. */ + +static ssize_t nv_c2c_pmu_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(pmu); + struct dev_ext_attribute *eattr = + container_of(attr, struct dev_ext_attribute, attr); + unsigned long mask_id = (unsigned long)eattr->var; + const cpumask_t *cpumask; + + switch (mask_id) { + case C2C_ACTIVE_CPU_MASK: + cpumask = &c2c_pmu->active_cpu; + break; + case C2C_ASSOCIATED_CPU_MASK: + cpumask = &c2c_pmu->associated_cpus; + break; + default: + return 0; + } + return cpumap_print_to_pagebuf(true, buf, cpumask); +} + +#define NV_C2C_PMU_CPUMASK_ATTR(_name, _config) \ + NV_C2C_PMU_EXT_ATTR(_name, nv_c2c_pmu_cpumask_show, \ + (unsigned long)_config) + +static struct attribute *nv_c2c_pmu_cpumask_attrs[] = { + NV_C2C_PMU_CPUMASK_ATTR(cpumask, C2C_ACTIVE_CPU_MASK), + NV_C2C_PMU_CPUMASK_ATTR(associated_cpus, C2C_ASSOCIATED_CPU_MASK), + NULL, +}; + +static const struct attribute_group nv_c2c_pmu_cpumask_attr_group = { + .attrs = nv_c2c_pmu_cpumask_attrs, +}; + +/* Attribute groups for C2C PMU connecting SoC and GPU */ +static const struct attribute_group *nv_c2c_pmu_gpu_attr_groups[] = { + &nv_c2c_pmu_gpu_format_group, + &nv_c2c_pmu_gpu_events_group, + &nv_c2c_pmu_cpumask_attr_group, + &nv_c2c_pmu_identifier_attr_group, + &nv_c2c_pmu_peer_attr_group, + NULL +}; + +/* Attribute groups for C2C PMU connecting multiple SoCs */ +static const struct attribute_group *nv_c2c_pmu_cpu_attr_groups[] = { + &nv_c2c_pmu_format_group, + &nv_c2c_pmu_cpu_events_group, + &nv_c2c_pmu_cpumask_attr_group, + &nv_c2c_pmu_identifier_attr_group, + &nv_c2c_pmu_peer_attr_group, + NULL +}; + +/* Attribute groups for C2C PMU connecting SoC and CXLMEM */ +static const struct attribute_group *nv_c2c_pmu_cxlmem_attr_groups[] = { + &nv_c2c_pmu_format_group, + &nv_c2c_pmu_cxlmem_events_group, + &nv_c2c_pmu_cpumask_attr_group, + &nv_c2c_pmu_identifier_attr_group, + &nv_c2c_pmu_peer_attr_group, + NULL +}; + +static int nv_c2c_pmu_online_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct nv_c2c_pmu *c2c_pmu = + hlist_entry_safe(node, struct nv_c2c_pmu, cpuhp_node); + + if (!cpumask_test_cpu(cpu, &c2c_pmu->associated_cpus)) + return 0; + + /* If the PMU is already managed, there is nothing to do */ + if (!cpumask_empty(&c2c_pmu->active_cpu)) + return 0; + + /* Use this CPU for event counting */ + cpumask_set_cpu(cpu, &c2c_pmu->active_cpu); + + return 0; +} + +static int nv_c2c_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node) +{ + unsigned int dst; + + struct nv_c2c_pmu *c2c_pmu = + hlist_entry_safe(node, struct nv_c2c_pmu, cpuhp_node); + + /* Nothing to do if this CPU doesn't own the PMU */ + if (!cpumask_test_and_clear_cpu(cpu, &c2c_pmu->active_cpu)) + return 0; + + /* Choose a new CPU to migrate ownership of the PMU to */ + dst = cpumask_any_and_but(&c2c_pmu->associated_cpus, + cpu_online_mask, cpu); + if (dst >= nr_cpu_ids) + return 0; + + /* Use this CPU for event counting */ + perf_pmu_migrate_context(&c2c_pmu->pmu, cpu, dst); + cpumask_set_cpu(dst, &c2c_pmu->active_cpu); + + return 0; +} + +static int nv_c2c_pmu_get_cpus(struct nv_c2c_pmu *c2c_pmu) +{ + int socket = c2c_pmu->socket, cpu; + + for_each_possible_cpu(cpu) { + if (cpu_to_node(cpu) == socket) + cpumask_set_cpu(cpu, &c2c_pmu->associated_cpus); + } + + if (cpumask_empty(&c2c_pmu->associated_cpus)) { + dev_dbg(c2c_pmu->dev, + "No cpu associated with C2C PMU socket-%u\n", socket); + return -ENODEV; + } + + return 0; +} + +static int nv_c2c_pmu_init_socket(struct nv_c2c_pmu *c2c_pmu) +{ + const char *uid_str; + int ret, socket; + + uid_str = acpi_device_uid(c2c_pmu->acpi_dev); + if (!uid_str) { + dev_err(c2c_pmu->dev, "No ACPI device UID\n"); + return -ENODEV; + } + + ret = kstrtou32(uid_str, 0, &socket); + if (ret) { + dev_err(c2c_pmu->dev, "Failed to parse ACPI device UID\n"); + return ret; + } + + c2c_pmu->socket = socket; + return 0; +} + +static int nv_c2c_pmu_init_id(struct nv_c2c_pmu *c2c_pmu) +{ + char *name; + + name = devm_kasprintf(c2c_pmu->dev, GFP_KERNEL, c2c_pmu->data->name_fmt, + c2c_pmu->socket); + if (!name) + return -ENOMEM; + + c2c_pmu->name = name; + + c2c_pmu->identifier = acpi_device_hid(c2c_pmu->acpi_dev); + + return 0; +} + +static int nv_c2c_pmu_init_filter(struct nv_c2c_pmu *c2c_pmu) +{ + u32 cpu_en = 0; + struct device *dev = c2c_pmu->dev; + const struct nv_c2c_pmu_data *data = c2c_pmu->data; + + if (data->c2c_type == C2C_TYPE_NVDLINK) { + c2c_pmu->peer_type = C2C_PEER_TYPE_CXLMEM; + + c2c_pmu->peer_insts[0][0] = (1UL << data->nr_inst) - 1; + + c2c_pmu->nr_peer = C2C_NR_PEER_CXLMEM; + c2c_pmu->filter_default = (1 << c2c_pmu->nr_peer) - 1; + + c2c_pmu->attr_groups = nv_c2c_pmu_cxlmem_attr_groups; + + return 0; + } + + if (device_property_read_u32(dev, "cpu_en_mask", &cpu_en)) + dev_dbg(dev, "no cpu_en_mask property\n"); + + if (cpu_en) { + c2c_pmu->peer_type = C2C_PEER_TYPE_CPU; + + /* Fill peer_insts bitmap with instances connected to peer CPU. */ + bitmap_from_arr32(c2c_pmu->peer_insts[0], &cpu_en, data->nr_inst); + + c2c_pmu->nr_peer = 1; + c2c_pmu->attr_groups = nv_c2c_pmu_cpu_attr_groups; + } else { + u32 i; + const char *props[C2C_NR_PEER_MAX] = { + "gpu0_en_mask", "gpu1_en_mask" + }; + + for (i = 0; i < C2C_NR_PEER_MAX; i++) { + u32 gpu_en = 0; + + if (device_property_read_u32(dev, props[i], &gpu_en)) + dev_dbg(dev, "no %s property\n", props[i]); + + if (gpu_en) { + /* Fill peer_insts bitmap with instances connected to peer GPU. */ + bitmap_from_arr32(c2c_pmu->peer_insts[i], &gpu_en, + data->nr_inst); + + c2c_pmu->nr_peer++; + } + } + + if (c2c_pmu->nr_peer == 0) { + dev_err(dev, "No GPU is enabled\n"); + return -EINVAL; + } + + c2c_pmu->peer_type = C2C_PEER_TYPE_GPU; + c2c_pmu->attr_groups = nv_c2c_pmu_gpu_attr_groups; + } + + c2c_pmu->filter_default = (1 << c2c_pmu->nr_peer) - 1; + + return 0; +} + +static void *nv_c2c_pmu_init_pmu(struct platform_device *pdev) +{ + int ret; + struct nv_c2c_pmu *c2c_pmu; + struct acpi_device *acpi_dev; + struct device *dev = &pdev->dev; + + acpi_dev = ACPI_COMPANION(dev); + if (!acpi_dev) + return ERR_PTR(-ENODEV); + + c2c_pmu = devm_kzalloc(dev, sizeof(*c2c_pmu), GFP_KERNEL); + if (!c2c_pmu) + return ERR_PTR(-ENOMEM); + + c2c_pmu->dev = dev; + c2c_pmu->acpi_dev = acpi_dev; + c2c_pmu->data = (const struct nv_c2c_pmu_data *)device_get_match_data(dev); + if (!c2c_pmu->data) + return ERR_PTR(-EINVAL); + + platform_set_drvdata(pdev, c2c_pmu); + + ret = nv_c2c_pmu_init_socket(c2c_pmu); + if (ret) + return ERR_PTR(ret); + + ret = nv_c2c_pmu_init_id(c2c_pmu); + if (ret) + return ERR_PTR(ret); + + ret = nv_c2c_pmu_init_filter(c2c_pmu); + if (ret) + return ERR_PTR(ret); + + return c2c_pmu; +} + +static int nv_c2c_pmu_init_mmio(struct nv_c2c_pmu *c2c_pmu) +{ + int i; + struct device *dev = c2c_pmu->dev; + struct platform_device *pdev = to_platform_device(dev); + const struct nv_c2c_pmu_data *data = c2c_pmu->data; + + /* Map the address of all the instances. */ + for (i = 0; i < data->nr_inst; i++) { + c2c_pmu->base[i] = devm_platform_ioremap_resource(pdev, i); + if (IS_ERR(c2c_pmu->base[i])) { + dev_err(dev, "Failed map address for instance %d\n", i); + return PTR_ERR(c2c_pmu->base[i]); + } + } + + /* Map broadcast address. */ + c2c_pmu->base_broadcast = devm_platform_ioremap_resource(pdev, + data->nr_inst); + if (IS_ERR(c2c_pmu->base_broadcast)) { + dev_err(dev, "Failed map broadcast address\n"); + return PTR_ERR(c2c_pmu->base_broadcast); + } + + return 0; +} + +static int nv_c2c_pmu_register_pmu(struct nv_c2c_pmu *c2c_pmu) +{ + int ret; + + ret = cpuhp_state_add_instance(nv_c2c_pmu_cpuhp_state, + &c2c_pmu->cpuhp_node); + if (ret) { + dev_err(c2c_pmu->dev, "Error %d registering hotplug\n", ret); + return ret; + } + + c2c_pmu->pmu = (struct pmu) { + .parent = c2c_pmu->dev, + .task_ctx_nr = perf_invalid_context, + .pmu_enable = nv_c2c_pmu_enable, + .pmu_disable = nv_c2c_pmu_disable, + .event_init = nv_c2c_pmu_event_init, + .add = nv_c2c_pmu_add, + .del = nv_c2c_pmu_del, + .start = nv_c2c_pmu_start, + .stop = nv_c2c_pmu_stop, + .read = nv_c2c_pmu_read, + .attr_groups = c2c_pmu->attr_groups, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE | + PERF_PMU_CAP_NO_INTERRUPT, + }; + + ret = perf_pmu_register(&c2c_pmu->pmu, c2c_pmu->name, -1); + if (ret) { + dev_err(c2c_pmu->dev, "Failed to register C2C PMU: %d\n", ret); + cpuhp_state_remove_instance(nv_c2c_pmu_cpuhp_state, + &c2c_pmu->cpuhp_node); + return ret; + } + + return 0; +} + +static int nv_c2c_pmu_probe(struct platform_device *pdev) +{ + int ret; + struct nv_c2c_pmu *c2c_pmu; + + c2c_pmu = nv_c2c_pmu_init_pmu(pdev); + if (IS_ERR(c2c_pmu)) + return PTR_ERR(c2c_pmu); + + ret = nv_c2c_pmu_init_mmio(c2c_pmu); + if (ret) + return ret; + + ret = nv_c2c_pmu_get_cpus(c2c_pmu); + if (ret) + return ret; + + ret = nv_c2c_pmu_register_pmu(c2c_pmu); + if (ret) + return ret; + + dev_dbg(c2c_pmu->dev, "Registered %s PMU\n", c2c_pmu->name); + + return 0; +} + +static void nv_c2c_pmu_device_remove(struct platform_device *pdev) +{ + struct nv_c2c_pmu *c2c_pmu = platform_get_drvdata(pdev); + + perf_pmu_unregister(&c2c_pmu->pmu); + cpuhp_state_remove_instance(nv_c2c_pmu_cpuhp_state, &c2c_pmu->cpuhp_node); +} + +static const struct acpi_device_id nv_c2c_pmu_acpi_match[] = { + { "NVDA2023", (kernel_ulong_t)&nv_c2c_pmu_data[C2C_TYPE_NVLINK] }, + { "NVDA2022", (kernel_ulong_t)&nv_c2c_pmu_data[C2C_TYPE_NVCLINK] }, + { "NVDA2020", (kernel_ulong_t)&nv_c2c_pmu_data[C2C_TYPE_NVDLINK] }, + { } +}; +MODULE_DEVICE_TABLE(acpi, nv_c2c_pmu_acpi_match); + +static struct platform_driver nv_c2c_pmu_driver = { + .driver = { + .name = "nvidia-t410-c2c-pmu", + .acpi_match_table = nv_c2c_pmu_acpi_match, + .suppress_bind_attrs = true, + }, + .probe = nv_c2c_pmu_probe, + .remove = nv_c2c_pmu_device_remove, +}; + +static int __init nv_c2c_pmu_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "perf/nvidia/c2c:online", + nv_c2c_pmu_online_cpu, + nv_c2c_pmu_cpu_teardown); + if (ret < 0) + return ret; + + nv_c2c_pmu_cpuhp_state = ret; + return platform_driver_register(&nv_c2c_pmu_driver); +} + +static void __exit nv_c2c_pmu_exit(void) +{ + platform_driver_unregister(&nv_c2c_pmu_driver); + cpuhp_remove_multi_state(nv_c2c_pmu_cpuhp_state); +} + +module_init(nv_c2c_pmu_init); +module_exit(nv_c2c_pmu_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("NVIDIA Tegra410 C2C PMU driver"); +MODULE_AUTHOR("Besar Wicaksono "); From b7d9d2e3a8ab535ff3bc8e9156a917bb8934f856 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 23 Mar 2026 16:39:16 +0000 Subject: [PATCH 031/109] arm64: mm: __ptep_set_access_flags must hint correct TTL It has been reported that since commit 752a0d1d483e9 ("arm64: mm: Provide level hint for flush_tlb_page()"), the arm64 check_hugetlb_options selftest has been locking up while running "Check child hugetlb memory with private mapping, sync error mode and mmap memory". This is due to hugetlb (and THP) helpers casting their PMD/PUD entries to PTE and calling __ptep_set_access_flags(), which issues a __flush_tlb_page(). Now that this is hinted for level 3, in this case, the TLB entry does not get evicted and we end up in a spurious fault loop. Fix this by creating a __ptep_set_access_flags_anysz() function which takes the pgsize of the entry. It can then add the appropriate hint. The "_anysz" approach is the established pattern for problems of this class. Reported-by: Aishwarya TCV Fixes: 752a0d1d483e ("arm64: mm: Provide level hint for flush_tlb_page()") Signed-off-by: Ryan Roberts Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 19 ++++++++++++++----- arch/arm64/mm/fault.c | 30 +++++++++++++++++++++++++----- arch/arm64/mm/hugetlbpage.c | 6 +++--- 3 files changed, 42 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index b1a96a8f2b17..53e39d2182f5 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1248,9 +1248,18 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) return pte_pmd(pte_modify(pmd_pte(pmd), newprot)); } -extern int __ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, - pte_t entry, int dirty); +extern int __ptep_set_access_flags_anysz(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty, + unsigned long pgsize); + +static inline int __ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + return __ptep_set_access_flags_anysz(vma, address, ptep, entry, dirty, + PAGE_SIZE); +} #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS @@ -1258,8 +1267,8 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { - return __ptep_set_access_flags(vma, address, (pte_t *)pmdp, - pmd_pte(entry), dirty); + return __ptep_set_access_flags_anysz(vma, address, (pte_t *)pmdp, + pmd_pte(entry), dirty, PMD_SIZE); } #endif diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index f91aa686f142..920a8b244d59 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -204,12 +204,13 @@ static void show_pte(unsigned long addr) * * Returns whether or not the PTE actually changed. */ -int __ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, - pte_t entry, int dirty) +int __ptep_set_access_flags_anysz(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty, unsigned long pgsize) { pteval_t old_pteval, pteval; pte_t pte = __ptep_get(ptep); + int level; if (pte_same(pte, entry)) return 0; @@ -238,8 +239,27 @@ int __ptep_set_access_flags(struct vm_area_struct *vma, * may still cause page faults and be invalidated via * flush_tlb_fix_spurious_fault(). */ - if (dirty) - __flush_tlb_page(vma, address, TLBF_NOBROADCAST); + if (dirty) { + switch (pgsize) { + case PAGE_SIZE: + level = 3; + break; + case PMD_SIZE: + level = 2; + break; +#ifndef __PAGETABLE_PMD_FOLDED + case PUD_SIZE: + level = 1; + break; +#endif + default: + level = TLBI_TTL_UNKNOWN; + WARN_ON(1); + } + + __flush_tlb_range(vma, address, address + pgsize, pgsize, level, + TLBF_NOWALKCACHE | TLBF_NOBROADCAST); + } return 1; } diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 0b7ccd0cbb9e..30772a909aea 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -427,11 +427,11 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, pte_t orig_pte; VM_WARN_ON(!pte_present(pte)); + ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), &pgsize); if (!pte_cont(pte)) - return __ptep_set_access_flags(vma, addr, ptep, pte, dirty); - - ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), &pgsize); + return __ptep_set_access_flags_anysz(vma, addr, ptep, pte, + dirty, pgsize); if (!__cont_access_flags_changed(ptep, pte, ncontig)) return 0; From d49802b6617b96f55d4b61fed81f4cc43858ed3f Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Thu, 26 Mar 2026 17:08:56 +0800 Subject: [PATCH 032/109] perf/arm-cmn: Fix incorrect error check for devm_ioremap() Check devm_ioremap() return value for NULL instead of ERR_PTR and return -ENOMEM on failure. devm_ioremap() never returns ERR_PTR, using IS_ERR() skips the error path and may cause a NULL pointer dereference. Fixes: 5394396ff548 ("perf/arm-cmn: Stop claiming entire iomem region") Signed-off-by: Chen Ni Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 1ac91cda6780..9fe00d0f4deb 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -2573,8 +2573,8 @@ static int arm_cmn_probe(struct platform_device *pdev) /* Map the whole region now, claim the DTCs once we've found them */ cmn->base = devm_ioremap(cmn->dev, cfg->start, resource_size(cfg)); - if (IS_ERR(cmn->base)) - return PTR_ERR(cmn->base); + if (!cmn->base) + return -ENOMEM; rootnode = arm_cmn_get_root(cmn, cfg); if (rootnode < 0) From 47f06ebbe8dad695002e5d9a2ab436411f88e985 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 25 Mar 2026 19:19:26 -0700 Subject: [PATCH 033/109] perf/arm-cmn: Fix resource_size_t printk specifier in arm_cmn_init_dtc() When building for 32-bit ARM, there is a warning when using the %llx specifier to print a resource_size_t variable: drivers/perf/arm-cmn.c: In function 'arm_cmn_init_dtc': drivers/perf/arm-cmn.c:2149:73: error: format '%llx' expects argument of type 'long long unsigned int', but argument 4 has type 'resource_size_t' {aka 'unsigned int'} [-Werror=format=] 2149 | "Failed to request DTC region 0x%llx\n", base); | ~~~^ ~~~~ | | | | | resource_size_t {aka unsigned int} | long long unsigned int | %x Use the %pa specifier to handle the possible sizes of phys_addr_t properly. This requires passing the variable by reference. Fixes: 5394396ff548 ("perf/arm-cmn: Stop claiming entire iomem region") Signed-off-by: Nathan Chancellor Reviewed-by: Robin murphy Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 9fe00d0f4deb..f5305c8fdca4 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -2146,7 +2146,7 @@ static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int id size = cmn->part == PART_CMN600 ? SZ_16K : SZ_64K; if (!devm_request_mem_region(cmn->dev, base, size, dev_name(cmn->dev))) return dev_err_probe(cmn->dev, -EBUSY, - "Failed to request DTC region 0x%llx\n", base); + "Failed to request DTC region 0x%pa\n", &base); writel_relaxed(CMN_DT_DTC_CTL_DT_EN, dtc->base + CMN_DT_DTC_CTL); writel_relaxed(CMN_DT_PMCR_PMU_EN | CMN_DT_PMCR_OVFL_INTR_EN, CMN_DT_PMCR(dtc)); From 7181f718cb0fd47b37d13aad8744cbd6da9f1cbe Mon Sep 17 00:00:00 2001 From: Yeoreum Yun Date: Sat, 14 Mar 2026 17:51:26 +0000 Subject: [PATCH 034/109] arm64: cpufeature: Add FEAT_LSUI Since Armv9.6, FEAT_LSUI introduces atomic instructions that allow privileged code to access user memory without clearing the PSTATE.PAN bit. Add CPU feature detection for FEAT_LSUI. Signed-off-by: Yeoreum Yun [catalin.marinas@arm.com: Remove commit log references to SW_PAN] Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpucaps.h | 2 ++ arch/arm64/kernel/cpufeature.c | 10 ++++++++++ arch/arm64/tools/cpucaps | 1 + 3 files changed, 13 insertions(+) diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 177c691914f8..6e3da333442e 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -71,6 +71,8 @@ cpucap_is_possible(const unsigned int cap) return true; case ARM64_HAS_PMUV3: return IS_ENABLED(CONFIG_HW_PERF_EVENTS); + case ARM64_HAS_LSUI: + return IS_ENABLED(CONFIG_ARM64_LSUI); } return true; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index c31f8e17732a..5074ff32176f 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -281,6 +281,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { static const struct arm64_ftr_bits ftr_id_aa64isar3[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FPRCVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_LSUI_SHIFT, 4, ID_AA64ISAR3_EL1_LSUI_NI), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_LSFE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FAMINMAX_SHIFT, 4, 0), ARM64_FTR_END, @@ -3169,6 +3170,15 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_enable_ls64_v, ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, LS64, LS64_V) }, +#ifdef CONFIG_ARM64_LSUI + { + .desc = "Unprivileged Load Store Instructions (LSUI)", + .capability = ARM64_HAS_LSUI, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64ISAR3_EL1, LSUI, IMP) + }, +#endif {}, }; diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 7261553b644b..b7286d977788 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -48,6 +48,7 @@ HAS_LPA2 HAS_LSE_ATOMICS HAS_LS64 HAS_LS64_V +HAS_LSUI HAS_MOPS HAS_NESTED_VIRT HAS_BBML2_NOABORT From f6bff18d05ed40e5b06bfe9f9e5c2c7fe247164d Mon Sep 17 00:00:00 2001 From: Yeoreum Yun Date: Sat, 14 Mar 2026 17:51:27 +0000 Subject: [PATCH 035/109] KVM: arm64: Expose FEAT_LSUI to guests Expose the FEAT_LSUI ID register field to guests. Signed-off-by: Yeoreum Yun Acked-by: Marc Zyngier Reviewed-by: Catalin Marinas Signed-off-by: Catalin Marinas --- arch/arm64/kvm/sys_regs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1b4cacb6e918..484f98eaf6d4 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1805,7 +1805,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, break; case SYS_ID_AA64ISAR3_EL1: val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_LSFE | - ID_AA64ISAR3_EL1_FAMINMAX; + ID_AA64ISAR3_EL1_FAMINMAX | ID_AA64ISAR3_EL1_LSUI; break; case SYS_ID_AA64MMFR2_EL1: val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; @@ -3252,6 +3252,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64ISAR2_EL1_GPA3)), ID_WRITABLE(ID_AA64ISAR3_EL1, (ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_LSFE | + ID_AA64ISAR3_EL1_LSUI | ID_AA64ISAR3_EL1_FAMINMAX)), ID_UNALLOCATED(6,4), ID_UNALLOCATED(6,5), From 42550d7d8aa66632c75f22095cb210dd431e8d71 Mon Sep 17 00:00:00 2001 From: Yeoreum Yun Date: Sat, 14 Mar 2026 17:51:28 +0000 Subject: [PATCH 036/109] KVM: arm64: kselftest: set_id_regs: Add test for FEAT_LSUI Add test coverage for FEAT_LSUI. Signed-off-by: Yeoreum Yun Reviewed-by: Mark Brown Signed-off-by: Catalin Marinas --- tools/testing/selftests/kvm/arm64/set_id_regs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c index 73de5be58bab..fa3478a6c914 100644 --- a/tools/testing/selftests/kvm/arm64/set_id_regs.c +++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c @@ -124,6 +124,7 @@ static const struct reg_ftr_bits ftr_id_aa64isar2_el1[] = { static const struct reg_ftr_bits ftr_id_aa64isar3_el1[] = { REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, FPRCVT, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, LSUI, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, LSFE, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, FAMINMAX, 0), REG_FTR_END, From eaa3babcceaaf191228d6ea897f677ed7549d0e2 Mon Sep 17 00:00:00 2001 From: Yeoreum Yun Date: Sat, 14 Mar 2026 17:51:29 +0000 Subject: [PATCH 037/109] arm64: futex: Refactor futex atomic operation Refactor the futex atomic operations using ll/sc instructions in preparation for FEAT_LSUI support. In addition, use named operands for the inline asm. No functional change. Signed-off-by: Yeoreum Yun [catalin.marinas@arm.com: remove unnecessary stringify.h include] Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/futex.h | 155 +++++++++++++++++++++------------ 1 file changed, 97 insertions(+), 58 deletions(-) diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index bc06691d2062..321242e40952 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -12,68 +12,133 @@ #define FUTEX_MAX_LOOPS 128 /* What's the largest number you can think of? */ -#define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \ -do { \ +#define LLSC_FUTEX_ATOMIC_OP(op, insn) \ +static __always_inline int \ +__llsc_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval) \ +{ \ unsigned int loops = FUTEX_MAX_LOOPS; \ + int ret, oldval, newval; \ \ uaccess_enable_privileged(); \ - asm volatile( \ -" prfm pstl1strm, %2\n" \ -"1: ldxr %w1, %2\n" \ + asm volatile("// __llsc_futex_atomic_" #op "\n" \ +" prfm pstl1strm, %[uaddr]\n" \ +"1: ldxr %w[oldval], %[uaddr]\n" \ insn "\n" \ -"2: stlxr %w0, %w3, %2\n" \ -" cbz %w0, 3f\n" \ -" sub %w4, %w4, %w0\n" \ -" cbnz %w4, 1b\n" \ -" mov %w0, %w6\n" \ +"2: stlxr %w[ret], %w[newval], %[uaddr]\n" \ +" cbz %w[ret], 3f\n" \ +" sub %w[loops], %w[loops], %w[ret]\n" \ +" cbnz %w[loops], 1b\n" \ +" mov %w[ret], %w[err]\n" \ "3:\n" \ " dmb ish\n" \ - _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0) \ - _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w0) \ - : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp), \ - "+r" (loops) \ - : "r" (oparg), "Ir" (-EAGAIN) \ + _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret]) \ + _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret]) \ + : [ret] "=&r" (ret), [oldval] "=&r" (oldval), \ + [uaddr] "+Q" (*uaddr), [newval] "=&r" (newval), \ + [loops] "+r" (loops) \ + : [oparg] "r" (oparg), [err] "Ir" (-EAGAIN) \ : "memory"); \ uaccess_disable_privileged(); \ -} while (0) + \ + if (!ret) \ + *oval = oldval; \ + \ + return ret; \ +} + +LLSC_FUTEX_ATOMIC_OP(add, "add %w[newval], %w[oldval], %w[oparg]") +LLSC_FUTEX_ATOMIC_OP(or, "orr %w[newval], %w[oldval], %w[oparg]") +LLSC_FUTEX_ATOMIC_OP(and, "and %w[newval], %w[oldval], %w[oparg]") +LLSC_FUTEX_ATOMIC_OP(eor, "eor %w[newval], %w[oldval], %w[oparg]") +LLSC_FUTEX_ATOMIC_OP(set, "mov %w[newval], %w[oparg]") + +static __always_inline int +__llsc_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) +{ + int ret = 0; + unsigned int loops = FUTEX_MAX_LOOPS; + u32 val, tmp; + + uaccess_enable_privileged(); + asm volatile("//__llsc_futex_cmpxchg\n" +" prfm pstl1strm, %[uaddr]\n" +"1: ldxr %w[curval], %[uaddr]\n" +" eor %w[tmp], %w[curval], %w[oldval]\n" +" cbnz %w[tmp], 4f\n" +"2: stlxr %w[tmp], %w[newval], %[uaddr]\n" +" cbz %w[tmp], 3f\n" +" sub %w[loops], %w[loops], %w[tmp]\n" +" cbnz %w[loops], 1b\n" +" mov %w[ret], %w[err]\n" +"3:\n" +" dmb ish\n" +"4:\n" + _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w[ret]) + _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w[ret]) + : [ret] "+r" (ret), [curval] "=&r" (val), + [uaddr] "+Q" (*uaddr), [tmp] "=&r" (tmp), + [loops] "+r" (loops) + : [oldval] "r" (oldval), [newval] "r" (newval), + [err] "Ir" (-EAGAIN) + : "memory"); + uaccess_disable_privileged(); + + if (!ret) + *oval = val; + + return ret; +} + +#define FUTEX_ATOMIC_OP(op) \ +static __always_inline int \ +__futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval) \ +{ \ + return __llsc_futex_atomic_##op(oparg, uaddr, oval); \ +} + +FUTEX_ATOMIC_OP(add) +FUTEX_ATOMIC_OP(or) +FUTEX_ATOMIC_OP(and) +FUTEX_ATOMIC_OP(eor) +FUTEX_ATOMIC_OP(set) + +static __always_inline int +__futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) +{ + return __llsc_futex_cmpxchg(uaddr, oldval, newval, oval); +} static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *_uaddr) { - int oldval = 0, ret, tmp; - u32 __user *uaddr = __uaccess_mask_ptr(_uaddr); + int ret; + u32 __user *uaddr; if (!access_ok(_uaddr, sizeof(u32))) return -EFAULT; + uaddr = __uaccess_mask_ptr(_uaddr); + switch (op) { case FUTEX_OP_SET: - __futex_atomic_op("mov %w3, %w5", - ret, oldval, uaddr, tmp, oparg); + ret = __futex_atomic_set(oparg, uaddr, oval); break; case FUTEX_OP_ADD: - __futex_atomic_op("add %w3, %w1, %w5", - ret, oldval, uaddr, tmp, oparg); + ret = __futex_atomic_add(oparg, uaddr, oval); break; case FUTEX_OP_OR: - __futex_atomic_op("orr %w3, %w1, %w5", - ret, oldval, uaddr, tmp, oparg); + ret = __futex_atomic_or(oparg, uaddr, oval); break; case FUTEX_OP_ANDN: - __futex_atomic_op("and %w3, %w1, %w5", - ret, oldval, uaddr, tmp, ~oparg); + ret = __futex_atomic_and(~oparg, uaddr, oval); break; case FUTEX_OP_XOR: - __futex_atomic_op("eor %w3, %w1, %w5", - ret, oldval, uaddr, tmp, oparg); + ret = __futex_atomic_eor(oparg, uaddr, oval); break; default: ret = -ENOSYS; } - if (!ret) - *oval = oldval; - return ret; } @@ -81,40 +146,14 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr, u32 oldval, u32 newval) { - int ret = 0; - unsigned int loops = FUTEX_MAX_LOOPS; - u32 val, tmp; u32 __user *uaddr; if (!access_ok(_uaddr, sizeof(u32))) return -EFAULT; uaddr = __uaccess_mask_ptr(_uaddr); - uaccess_enable_privileged(); - asm volatile("// futex_atomic_cmpxchg_inatomic\n" -" prfm pstl1strm, %2\n" -"1: ldxr %w1, %2\n" -" sub %w3, %w1, %w5\n" -" cbnz %w3, 4f\n" -"2: stlxr %w3, %w6, %2\n" -" cbz %w3, 3f\n" -" sub %w4, %w4, %w3\n" -" cbnz %w4, 1b\n" -" mov %w0, %w7\n" -"3:\n" -" dmb ish\n" -"4:\n" - _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0) - _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0) - : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops) - : "r" (oldval), "r" (newval), "Ir" (-EAGAIN) - : "memory"); - uaccess_disable_privileged(); - if (!ret) - *uval = val; - - return ret; + return __futex_cmpxchg(uaddr, oldval, newval, uval); } #endif /* __ASM_FUTEX_H */ From 44adf2bf40efe56c68c1563779ab2047eb0a57ea Mon Sep 17 00:00:00 2001 From: Yeoreum Yun Date: Sat, 14 Mar 2026 17:51:30 +0000 Subject: [PATCH 038/109] arm64: futex: Support futex with FEAT_LSUI Current futex atomic operations are implemented using LL/SC instructions while temporarily clearing PSTATE.PAN and setting PSTATE.TCO (if KASAN_HW_TAGS is enabled). With Armv9.6, FEAT_LSUI provides atomic instructions for user memory access in the kernel without the need for PSTATE bits toggling. Use the FEAT_LSUI instructions to implement the futex atomic operations. Note that some futex operations do not have a matching LSUI instruction, (eor or word-sized cmpxchg). For such cases, use cas{al}t to implement the operation. Signed-off-by: Yeoreum Yun [catalin.marinas@arm.com: add comment on -EAGAIN in __lsui_futex_cmpxchg()] Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/futex.h | 160 ++++++++++++++++++++++++++++++++- arch/arm64/include/asm/lsui.h | 27 ++++++ 2 files changed, 185 insertions(+), 2 deletions(-) create mode 100644 arch/arm64/include/asm/lsui.h diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 321242e40952..d1d2ff9d323a 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -9,6 +9,7 @@ #include #include +#include #define FUTEX_MAX_LOOPS 128 /* What's the largest number you can think of? */ @@ -89,11 +90,166 @@ __llsc_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) return ret; } +#ifdef CONFIG_ARM64_LSUI + +/* + * Wrap LSUI instructions with uaccess_ttbr0_enable()/disable(), as + * PAN toggling is not required. + */ + +#define LSUI_FUTEX_ATOMIC_OP(op, asm_op) \ +static __always_inline int \ +__lsui_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval) \ +{ \ + int ret = 0; \ + int oldval; \ + \ + uaccess_ttbr0_enable(); \ + \ + asm volatile("// __lsui_futex_atomic_" #op "\n" \ + __LSUI_PREAMBLE \ +"1: " #asm_op "al %w[oparg], %w[oldval], %[uaddr]\n" \ +"2:\n" \ + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret]) \ + : [ret] "+r" (ret), [uaddr] "+Q" (*uaddr), \ + [oldval] "=r" (oldval) \ + : [oparg] "r" (oparg) \ + : "memory"); \ + \ + uaccess_ttbr0_disable(); \ + \ + if (!ret) \ + *oval = oldval; \ + return ret; \ +} + +LSUI_FUTEX_ATOMIC_OP(add, ldtadd) +LSUI_FUTEX_ATOMIC_OP(or, ldtset) +LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr) +LSUI_FUTEX_ATOMIC_OP(set, swpt) + +static __always_inline int +__lsui_cmpxchg64(u64 __user *uaddr, u64 *oldval, u64 newval) +{ + int ret = 0; + + uaccess_ttbr0_enable(); + + asm volatile("// __lsui_cmpxchg64\n" + __LSUI_PREAMBLE +"1: casalt %[oldval], %[newval], %[uaddr]\n" +"2:\n" + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret]) + : [ret] "+r" (ret), [uaddr] "+Q" (*uaddr), + [oldval] "+r" (*oldval) + : [newval] "r" (newval) + : "memory"); + + uaccess_ttbr0_disable(); + + return ret; +} + +static __always_inline int +__lsui_cmpxchg32(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) +{ + u64 __user *uaddr64; + bool futex_pos, other_pos; + u32 other, orig_other; + union { + u32 futex[2]; + u64 raw; + } oval64, orig64, nval64; + + uaddr64 = (u64 __user *)PTR_ALIGN_DOWN(uaddr, sizeof(u64)); + futex_pos = !IS_ALIGNED((unsigned long)uaddr, sizeof(u64)); + other_pos = !futex_pos; + + oval64.futex[futex_pos] = oldval; + if (get_user(oval64.futex[other_pos], (u32 __user *)uaddr64 + other_pos)) + return -EFAULT; + + orig64.raw = oval64.raw; + + nval64.futex[futex_pos] = newval; + nval64.futex[other_pos] = oval64.futex[other_pos]; + + if (__lsui_cmpxchg64(uaddr64, &oval64.raw, nval64.raw)) + return -EFAULT; + + oldval = oval64.futex[futex_pos]; + other = oval64.futex[other_pos]; + orig_other = orig64.futex[other_pos]; + + if (other != orig_other) + return -EAGAIN; + + *oval = oldval; + + return 0; +} + +static __always_inline int +__lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval) +{ + /* + * Undo the bitwise negation applied to the oparg passed from + * arch_futex_atomic_op_inuser() with FUTEX_OP_ANDN. + */ + return __lsui_futex_atomic_andnot(~oparg, uaddr, oval); +} + +static __always_inline int +__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval) +{ + u32 oldval, newval, val; + int ret, i; + + if (get_user(oldval, uaddr)) + return -EFAULT; + + /* + * there are no ldteor/stteor instructions... + */ + for (i = 0; i < FUTEX_MAX_LOOPS; i++) { + newval = oldval ^ oparg; + + ret = __lsui_cmpxchg32(uaddr, oldval, newval, &val); + switch (ret) { + case -EFAULT: + return ret; + case -EAGAIN: + continue; + } + + if (val == oldval) { + *oval = val; + return 0; + } + + oldval = val; + } + + return -EAGAIN; +} + +static __always_inline int +__lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) +{ + /* + * Callers of futex_atomic_cmpxchg_inatomic() already retry on + * -EAGAIN, no need for another loop of max retries. + */ + return __lsui_cmpxchg32(uaddr, oldval, newval, oval); +} +#endif /* CONFIG_ARM64_LSUI */ + + #define FUTEX_ATOMIC_OP(op) \ static __always_inline int \ __futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval) \ { \ - return __llsc_futex_atomic_##op(oparg, uaddr, oval); \ + return __lsui_llsc_body(futex_atomic_##op, oparg, uaddr, oval); \ } FUTEX_ATOMIC_OP(add) @@ -105,7 +261,7 @@ FUTEX_ATOMIC_OP(set) static __always_inline int __futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) { - return __llsc_futex_cmpxchg(uaddr, oldval, newval, oval); + return __lsui_llsc_body(futex_cmpxchg, uaddr, oldval, newval, oval); } static inline int diff --git a/arch/arm64/include/asm/lsui.h b/arch/arm64/include/asm/lsui.h new file mode 100644 index 000000000000..8f0d81953eb6 --- /dev/null +++ b/arch/arm64/include/asm/lsui.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_LSUI_H +#define __ASM_LSUI_H + +#include +#include +#include +#include +#include + +#define __LSUI_PREAMBLE ".arch_extension lsui\n" + +#ifdef CONFIG_ARM64_LSUI + +#define __lsui_llsc_body(op, ...) \ +({ \ + alternative_has_cap_unlikely(ARM64_HAS_LSUI) ? \ + __lsui_##op(__VA_ARGS__) : __llsc_##op(__VA_ARGS__); \ +}) + +#else /* CONFIG_ARM64_LSUI */ + +#define __lsui_llsc_body(op, ...) __llsc_##op(__VA_ARGS__) + +#endif /* CONFIG_ARM64_LSUI */ + +#endif /* __ASM_LSUI_H */ From 16dbe77a5be2defe5797cc33da87e33f72ef150c Mon Sep 17 00:00:00 2001 From: Yeoreum Yun Date: Sat, 14 Mar 2026 17:51:32 +0000 Subject: [PATCH 039/109] KVM: arm64: Use CAST instruction for swapping guest descriptor Use the CAST instruction to swap the guest descriptor when FEAT_LSUI is enabled, avoiding the need to clear the PAN bit. FEAT_LSUI is introduced in Armv9.6, where FEAT_PAN is mandatory. However, this assumption may not always hold: - Some CPUs may advertise FEAT_LSUI but lack FEAT_PAN. - Virtualization or ID register overrides may expose invalid feature combinations. Therefore, instead of disabling FEAT_LSUI when FEAT_PAN is absent, wrap LSUI instructions with uaccess_ttbr0_enable()/disable() when ARM64_SW_TTBR0_PAN is enabled. Signed-off-by: Yeoreum Yun Reviewed-by: Marc Zyngier Signed-off-by: Catalin Marinas --- arch/arm64/kvm/at.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 6588ea251ed7..1adf88a57328 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -9,6 +9,7 @@ #include #include #include +#include static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw) { @@ -1681,6 +1682,35 @@ int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level) } } +static int __lsui_swap_desc(u64 __user *ptep, u64 old, u64 new) +{ + u64 tmp = old; + int ret = 0; + + /* + * Wrap LSUI instructions with uaccess_ttbr0_enable()/disable(), + * as PAN toggling is not required. + */ + uaccess_ttbr0_enable(); + + asm volatile(__LSUI_PREAMBLE + "1: cast %[old], %[new], %[addr]\n" + "2:\n" + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret]) + : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret) + : [new] "r" (new) + : "memory"); + + uaccess_ttbr0_disable(); + + if (ret) + return ret; + if (tmp != old) + return -EAGAIN; + + return ret; +} + static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new) { u64 tmp = old; @@ -1756,7 +1786,9 @@ int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new) return -EPERM; ptep = (u64 __user *)hva + offset; - if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS)) + if (cpus_have_final_cap(ARM64_HAS_LSUI)) + r = __lsui_swap_desc(ptep, old, new); + else if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS)) r = __lse_swap_desc(ptep, old, new); else r = __llsc_swap_desc(ptep, old, new); From 377609ae8b6ac2ef522c9dcecfa20a7123116dbf Mon Sep 17 00:00:00 2001 From: Yeoreum Yun Date: Sat, 14 Mar 2026 17:51:33 +0000 Subject: [PATCH 040/109] arm64: Kconfig: Add support for LSUI Since Armv9.6, FEAT_LSUI supplies the load/store instructions for previleged level to access to access user memory without clearing PSTATE.PAN bit. Add Kconfig option entry for FEAT_LSUI. Signed-off-by: Yeoreum Yun Reviewed-by: Catalin Marinas Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 38dba5f7e4d2..890a1bedbf4a 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2215,6 +2215,26 @@ config ARM64_GCS endmenu # "ARMv9.4 architectural features" +config AS_HAS_LSUI + def_bool $(as-instr,.arch_extension lsui) + help + Supported by LLVM 20+ and binutils 2.45+. + +menu "ARMv9.6 architectural features" + +config ARM64_LSUI + bool "Support Unprivileged Load Store Instructions (LSUI)" + default y + depends on AS_HAS_LSUI && !CPU_BIG_ENDIAN + help + The Unprivileged Load Store Instructions (LSUI) provides + variants load/store instructions that access user-space memory + from the kernel without clearing PSTATE.PAN bit. + + This feature is supported by LLVM 20+ and binutils 2.45+. + +endmenu # "ARMv9.6 architectural feature" + config ARM64_SVE bool "ARM Scalable Vector Extension support" default y From f91e913355f49c878fc77f995fd71b7800352bd2 Mon Sep 17 00:00:00 2001 From: Zeng Heng Date: Fri, 13 Mar 2026 14:45:38 +0000 Subject: [PATCH 041/109] arm_mpam: Ensure in_reset_state is false after applying configuration The per-RIS flag, in_reset_state, indicates whether or not the MSC registers are in reset state, and allows avoiding resetting when they are already in reset state. However, when mpam_apply_config() updates the configuration it doesn't update the in_reset_state flag and so even after the configuration update in_reset_state can be true and mpam_reset_ris() will skip the actual register restoration on subsequent resets. Once resctrl has a MPAM backend it will use resctrl_arch_reset_all_ctrls() to reset the MSC configuration on unmount and, if the in_reset_state flag is bogusly true, fail to reset the MSC configuration. The resulting non-reset MSC configuration can lead to persistent performance restrictions even after resctrl is unmounted. Fix by clearing in_reset_state to false immediately after successful configuration application, ensuring that the next reset operation properly restores MSC register defaults. Fixes: 09b89d2a72f3 ("arm_mpam: Allow configuration to be applied and restored during cpu online") Signed-off-by: Zeng Heng Acked-by: Ben Horgan [Horgan: rewrite commit message to not be specific to resctrl unmount] Signed-off-by: Ben Horgan Reviewed-by: Gavin Shan Reviewed-by: Jonathan Cameron Reviewed-by: James Morse Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Jesse Chick Signed-off-by: James Morse --- drivers/resctrl/mpam_devices.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 1eebc2602187..0fd6590a9b5c 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2692,6 +2692,7 @@ int mpam_apply_config(struct mpam_component *comp, u16 partid, srcu_read_lock_held(&mpam_srcu)) { arg.ris = ris; mpam_touch_msc(msc, __write_config, &arg); + ris->in_reset_state = false; } mutex_unlock(&msc->cfg_lock); } From a1cb6577f575ba5ec2583caf4f791a86754dbf69 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:45:39 +0000 Subject: [PATCH 042/109] arm_mpam: Reset when feature configuration bit unset To indicate that the configuration, of the controls used by resctrl, in a RIS need resetting to driver defaults the reset flags in mpam_config are set. However, these flags are only ever set temporarily at RIS scope in mpam_reset_ris() and hence mpam_cpu_online() will never reset these controls to default. As the hardware reset is unknown this leads to unknown configuration when the control values haven't been configured away from the defaults. Use the policy that an unset feature configuration bit means reset. In this way the mpam_config in the component can encode that it should be in reset state and mpam_reprogram_msc() will reset controls as needed. Fixes: 09b89d2a72f3 ("arm_mpam: Allow configuration to be applied and restored during cpu online") Signed-off-by: Ben Horgan Reviewed-by: Gavin Shan Reviewed-by: James Morse Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Jesse Chick [ morse: Removed unused reset flags from config structure ] Signed-off-by: James Morse --- drivers/resctrl/mpam_devices.c | 40 ++++++++++----------------------- drivers/resctrl/mpam_internal.h | 4 ---- 2 files changed, 12 insertions(+), 32 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 0fd6590a9b5c..ff861291bd4e 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1364,17 +1364,15 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, __mpam_intpart_sel(ris->ris_idx, partid, msc); } - if (mpam_has_feature(mpam_feat_cpor_part, rprops) && - mpam_has_feature(mpam_feat_cpor_part, cfg)) { - if (cfg->reset_cpbm) - mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd); - else + if (mpam_has_feature(mpam_feat_cpor_part, rprops)) { + if (mpam_has_feature(mpam_feat_cpor_part, cfg)) mpam_write_partsel_reg(msc, CPBM, cfg->cpbm); + else + mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd); } - if (mpam_has_feature(mpam_feat_mbw_part, rprops) && - mpam_has_feature(mpam_feat_mbw_part, cfg)) { - if (cfg->reset_mbw_pbm) + if (mpam_has_feature(mpam_feat_mbw_part, rprops)) { + if (mpam_has_feature(mpam_feat_mbw_part, cfg)) mpam_reset_msc_bitmap(msc, MPAMCFG_MBW_PBM, rprops->mbw_pbm_bits); else mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm); @@ -1384,16 +1382,14 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, mpam_has_feature(mpam_feat_mbw_min, cfg)) mpam_write_partsel_reg(msc, MBW_MIN, 0); - if (mpam_has_feature(mpam_feat_mbw_max, rprops) && - mpam_has_feature(mpam_feat_mbw_max, cfg)) { - if (cfg->reset_mbw_max) - mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX); - else + if (mpam_has_feature(mpam_feat_mbw_max, rprops)) { + if (mpam_has_feature(mpam_feat_mbw_max, cfg)) mpam_write_partsel_reg(msc, MBW_MAX, cfg->mbw_max); + else + mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX); } - if (mpam_has_feature(mpam_feat_mbw_prop, rprops) && - mpam_has_feature(mpam_feat_mbw_prop, cfg)) + if (mpam_has_feature(mpam_feat_mbw_prop, rprops)) mpam_write_partsel_reg(msc, MBW_PROP, 0); if (mpam_has_feature(mpam_feat_cmax_cmax, rprops)) @@ -1491,16 +1487,6 @@ static int mpam_save_mbwu_state(void *arg) return 0; } -static void mpam_init_reset_cfg(struct mpam_config *reset_cfg) -{ - *reset_cfg = (struct mpam_config) { - .reset_cpbm = true, - .reset_mbw_pbm = true, - .reset_mbw_max = true, - }; - bitmap_fill(reset_cfg->features, MPAM_FEATURE_LAST); -} - /* * Called via smp_call_on_cpu() to prevent migration, while still being * pre-emptible. Caller must hold mpam_srcu. @@ -1508,14 +1494,12 @@ static void mpam_init_reset_cfg(struct mpam_config *reset_cfg) static int mpam_reset_ris(void *arg) { u16 partid, partid_max; - struct mpam_config reset_cfg; + struct mpam_config reset_cfg = {}; struct mpam_msc_ris *ris = arg; if (ris->in_reset_state) return 0; - mpam_init_reset_cfg(&reset_cfg); - spin_lock(&partid_max_lock); partid_max = mpam_partid_max; spin_unlock(&partid_max_lock); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index e8971842b124..7af762c98efc 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -266,10 +266,6 @@ struct mpam_config { u32 mbw_pbm; u16 mbw_max; - bool reset_cpbm; - bool reset_mbw_pbm; - bool reset_mbw_max; - struct mpam_garbage garbage; }; From 29fa1be82b83f87e603ed4c21fe86c6e05fd0282 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:45:40 +0000 Subject: [PATCH 043/109] arm64/sysreg: Add MPAMSM_EL1 register The MPAMSM_EL1 register determines the MPAM configuration for an SMCU. Add the register definition. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Acked-by: Catalin Marinas Signed-off-by: Ben Horgan Signed-off-by: James Morse --- arch/arm64/tools/sysreg | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 9d1c21108057..1287cb1de6f3 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -5172,6 +5172,14 @@ Field 31:16 PARTID_D Field 15:0 PARTID_I EndSysreg +Sysreg MPAMSM_EL1 3 0 10 5 3 +Res0 63:48 +Field 47:40 PMG_D +Res0 39:32 +Field 31:16 PARTID_D +Res0 15:0 +EndSysreg + Sysreg ISR_EL1 3 0 12 1 0 Res0 63:11 Field 10 IS From eda1cd1f9d29b382a07d757cf8b29f9ee636355f Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:45:41 +0000 Subject: [PATCH 044/109] KVM: arm64: Preserve host MPAM configuration when changing traps When KVM enables or disables MPAM traps to EL2 it clears all other bits in MPAM2_EL2. Notably, it clears the partition ids (PARTIDs) and performance monitoring groups (PMGs). Avoid changing these bits in anticipation of adding support for MPAM in the kernel. Otherwise, on a VHE system with the host running at EL2 where MPAM2_EL2 and MPAM1_EL1 access the same register, any attempt to use MPAM to monitor or partition resources for kernel space would be foiled by running a KVM guest. Additionally, MPAM2_EL2.EnMPAMSM is always set to 0 which causes MPAMSM_EL1 to always trap. Keep EnMPAMSM set to 1 when not in a guest so that the kernel can use MPAMSM_EL1. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Acked-by: Marc Zyngier Signed-off-by: Ben Horgan Signed-off-by: James Morse --- arch/arm64/kvm/hyp/include/hyp/switch.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 2597e8bda867..0b50ddd530f3 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -267,7 +267,8 @@ static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) { - u64 r = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1; + u64 clr = MPAM2_EL2_EnMPAMSM; + u64 set = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1; if (!system_supports_mpam()) return; @@ -277,18 +278,21 @@ static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) write_sysreg_s(MPAMHCR_EL2_TRAP_MPAMIDR_EL1, SYS_MPAMHCR_EL2); } else { /* From v1.1 TIDR can trap MPAMIDR, set it unconditionally */ - r |= MPAM2_EL2_TIDR; + set |= MPAM2_EL2_TIDR; } - write_sysreg_s(r, SYS_MPAM2_EL2); + sysreg_clear_set_s(SYS_MPAM2_EL2, clr, set); } static inline void __deactivate_traps_mpam(void) { + u64 clr = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1 | MPAM2_EL2_TIDR; + u64 set = MPAM2_EL2_EnMPAMSM; + if (!system_supports_mpam()) return; - write_sysreg_s(0, SYS_MPAM2_EL2); + sysreg_clear_set_s(SYS_MPAM2_EL2, clr, set); if (system_supports_mpam_hcr()) write_sysreg_s(MPAMHCR_HOST_FLAGS, SYS_MPAMHCR_EL2); From 2e7c684bdb50cfaf98da80ebaab4a961fdcd1aa2 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:45:42 +0000 Subject: [PATCH 045/109] KVM: arm64: Make MPAMSM_EL1 accesses UNDEF The MPAMSM_EL1 register controls the MPAM labeling for an SMCU, Streaming Mode Compute Unit. As there is no MPAM support in KVM, make sure MPAMSM_EL1 accesses trigger an UNDEF. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Acked-by: Marc Zyngier Signed-off-by: Ben Horgan Signed-off-by: James Morse --- arch/arm64/kvm/sys_regs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1b4cacb6e918..0edd655934a9 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -3376,6 +3376,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_MPAM1_EL1), undef_access }, { SYS_DESC(SYS_MPAM0_EL1), undef_access }, + { SYS_DESC(SYS_MPAMSM_EL1), undef_access }, + { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 }, { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 }, From 8e06d04ff1cf764066c62e5677bfb0b0c1d1fbbc Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:43 +0000 Subject: [PATCH 046/109] arm64: mpam: Context switch the MPAM registers MPAM allows traffic in the SoC to be labeled by the OS, these labels are used to apply policy in caches and bandwidth regulators, and to monitor traffic in the SoC. The label is made up of a PARTID and PMG value. The x86 equivalent calls these CLOSID and RMID, but they don't map precisely. MPAM has two CPU system registers that is used to hold the PARTID and PMG values that traffic generated at each exception level will use. These can be set per-task by the resctrl file system. (resctrl is the defacto interface for controlling this stuff). Add a helper to switch this. struct task_struct's separate CLOSID and RMID fields are insufficient to implement resctrl using MPAM, as resctrl can change the PARTID (CLOSID) and PMG (sort of like the RMID) separately. On x86, the rmid is an independent number, so a race that writes a mismatched closid and rmid into hardware is benign. On arm64, the pmg bits extend the partid. (i.e. partid-5 has a pmg-0 that is not the same as partid-6's pmg-0). In this case, mismatching the values will 'dirty' a pmg value that resctrl believes is clean, and is not tracking with its 'limbo' code. To avoid this, the partid and pmg are always read and written as a pair. This requires a new u64 field. In struct task_struct there are two u32, rmid and closid for the x86 case, but as we can't use them here do something else. Add this new field, mpam_partid_pmg, to struct thread_info to avoid adding more architecture specific code to struct task_struct. Always use READ_ONCE()/WRITE_ONCE() when accessing this field. Resctrl allows a per-cpu 'default' value to be set, this overrides the values when scheduling a task in the default control-group, which has PARTID 0. The way 'code data prioritisation' gets emulated means the register value for the default group needs to be a variable. The current system register value is kept in a per-cpu variable to avoid writing to the system register if the value isn't going to change. Writes to this register may reset the hardware state for regulating bandwidth. Finally, there is no reason to context switch these registers unless there is a driver changing the values in struct task_struct. Hide the whole thing behind a static key. This also allows the driver to disable MPAM in response to errors reported by hardware. Move the existing static key to belong to the arch code, as in the future the MPAM driver may become a loadable module. All this should depend on whether there is an MPAM driver, hide it behind CONFIG_ARM64_MPAM. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick CC: Amit Singh Tomar Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- arch/arm64/Kconfig | 2 + arch/arm64/include/asm/mpam.h | 67 ++++++++++++++++++++++++++++ arch/arm64/include/asm/thread_info.h | 3 ++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/mpam.c | 13 ++++++ arch/arm64/kernel/process.c | 7 +++ drivers/resctrl/mpam_devices.c | 2 - drivers/resctrl/mpam_internal.h | 4 +- 8 files changed, 95 insertions(+), 4 deletions(-) create mode 100644 arch/arm64/include/asm/mpam.h create mode 100644 arch/arm64/kernel/mpam.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 38dba5f7e4d2..ecaaca13a969 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2039,6 +2039,8 @@ config ARM64_MPAM MPAM is exposed to user-space via the resctrl pseudo filesystem. + This option enables the extra context switch code. + endmenu # "ARMv8.4 architectural features" menu "ARMv8.5 architectural features" diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h new file mode 100644 index 000000000000..0747e0526927 --- /dev/null +++ b/arch/arm64/include/asm/mpam.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2025 Arm Ltd. */ + +#ifndef __ASM__MPAM_H +#define __ASM__MPAM_H + +#include +#include +#include + +#include + +DECLARE_STATIC_KEY_FALSE(mpam_enabled); +DECLARE_PER_CPU(u64, arm64_mpam_default); +DECLARE_PER_CPU(u64, arm64_mpam_current); + +/* + * The value of the MPAM0_EL1 sysreg when a task is in resctrl's default group. + * This is used by the context switch code to use the resctrl CPU property + * instead. The value is modified when CDP is enabled/disabled by mounting + * the resctrl filesystem. + */ +extern u64 arm64_mpam_global_default; + +/* + * The resctrl filesystem writes to the partid/pmg values for threads and CPUs, + * which may race with reads in mpam_thread_switch(). Ensure only one of the old + * or new values are used. Particular care should be taken with the pmg field as + * mpam_thread_switch() may read a partid and pmg that don't match, causing this + * value to be stored with cache allocations, despite being considered 'free' by + * resctrl. + */ +#ifdef CONFIG_ARM64_MPAM +static inline u64 mpam_get_regval(struct task_struct *tsk) +{ + return READ_ONCE(task_thread_info(tsk)->mpam_partid_pmg); +} + +static inline void mpam_thread_switch(struct task_struct *tsk) +{ + u64 oldregval; + int cpu = smp_processor_id(); + u64 regval = mpam_get_regval(tsk); + + if (!static_branch_likely(&mpam_enabled)) + return; + + if (regval == READ_ONCE(arm64_mpam_global_default)) + regval = READ_ONCE(per_cpu(arm64_mpam_default, cpu)); + + oldregval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + if (oldregval == regval) + return; + + write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + isb(); + + /* Synchronising the EL0 write is left until the ERET to EL0 */ + write_sysreg_s(regval, SYS_MPAM0_EL1); + + WRITE_ONCE(per_cpu(arm64_mpam_current, cpu), regval); +} +#else +static inline void mpam_thread_switch(struct task_struct *tsk) {} +#endif /* CONFIG_ARM64_MPAM */ + +#endif /* __ASM__MPAM_H */ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 7942478e4065..5d7fe3e153c8 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -41,6 +41,9 @@ struct thread_info { #ifdef CONFIG_SHADOW_CALL_STACK void *scs_base; void *scs_sp; +#endif +#ifdef CONFIG_ARM64_MPAM + u64 mpam_partid_pmg; #endif u32 cpu; }; diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 76f32e424065..15979f366519 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -67,6 +67,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o +obj-$(CONFIG_ARM64_MPAM) += mpam.o obj-$(CONFIG_ARM64_MTE) += mte.o obj-y += vdso-wrap.o obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c new file mode 100644 index 000000000000..9866d2ca0faa --- /dev/null +++ b/arch/arm64/kernel/mpam.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025 Arm Ltd. */ + +#include + +#include +#include + +DEFINE_STATIC_KEY_FALSE(mpam_enabled); +DEFINE_PER_CPU(u64, arm64_mpam_default); +DEFINE_PER_CPU(u64, arm64_mpam_current); + +u64 arm64_mpam_global_default; diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 489554931231..47698955fa1e 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -738,6 +739,12 @@ struct task_struct *__switch_to(struct task_struct *prev, if (prev->thread.sctlr_user != next->thread.sctlr_user) update_sctlr_el1(next->thread.sctlr_user); + /* + * MPAM thread switch happens after the DSB to ensure prev's accesses + * use prev's MPAM settings. + */ + mpam_thread_switch(next); + /* the actual thread switch */ last = cpu_switch_to(prev, next); diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index ff861291bd4e..d50461d6ff3f 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -29,8 +29,6 @@ #include "mpam_internal.h" -DEFINE_STATIC_KEY_FALSE(mpam_enabled); /* This moves to arch code */ - /* * mpam_list_lock protects the SRCU lists when writing. Once the * mpam_enabled key is enabled these lists are read-only, diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 7af762c98efc..a13fb9880ced 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -16,12 +16,12 @@ #include #include +#include + #define MPAM_MSC_MAX_NUM_RIS 16 struct platform_device; -DECLARE_STATIC_KEY_FALSE(mpam_enabled); - #ifdef CONFIG_MPAM_KUNIT_TEST #define PACKED_FOR_KUNIT __packed #else From 87b78a5d70e83d4dbe31e1afda2be736a3330b31 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:44 +0000 Subject: [PATCH 047/109] arm64: mpam: Re-initialise MPAM regs when CPU comes online Now that the MPAM system registers are expected to have values that change, reprogram them based on the previous value when a CPU is brought online. Previously MPAM's 'default PARTID' of 0 was always used for MPAM in kernel-space as this is the PARTID that hardware guarantees to reset. Because there are a limited number of PARTID, this value is exposed to user-space, meaning resctrl changes to the resctrl default group would also affect kernel threads. Instead, use the task's PARTID value for kernel work on behalf of user-space too. The default of 0 is kept for both user-space and kernel-space when MPAM is not enabled. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- arch/arm64/kernel/cpufeature.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index c31f8e17732a..c3f900f81653 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include #include @@ -2492,13 +2493,17 @@ test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope) static void cpu_enable_mpam(const struct arm64_cpu_capabilities *entry) { - /* - * Access by the kernel (at EL1) should use the reserved PARTID - * which is configured unrestricted. This avoids priority-inversion - * where latency sensitive tasks have to wait for a task that has - * been throttled to release the lock. - */ - write_sysreg_s(0, SYS_MPAM1_EL1); + int cpu = smp_processor_id(); + u64 regval = 0; + + if (IS_ENABLED(CONFIG_ARM64_MPAM) && static_branch_likely(&mpam_enabled)) + regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + + write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + isb(); + + /* Synchronising the EL0 write is left until the ERET to EL0 */ + write_sysreg_s(regval, SYS_MPAM0_EL1); } static bool From c544f00a473239835d22e7109b403314d8b85974 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:45:45 +0000 Subject: [PATCH 048/109] arm64: mpam: Drop the CONFIG_EXPERT restriction In anticipation of MPAM being useful remove the CONFIG_EXPERT restriction. This was done to prevent the driver being enabled before the user-space interface was wired up. Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: James Morse Acked-by: Catalin Marinas Signed-off-by: Ben Horgan [ morse: Added second paragraph ] Signed-off-by: James Morse --- arch/arm64/Kconfig | 2 +- drivers/resctrl/Kconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ecaaca13a969..3170c67464fb 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2016,7 +2016,7 @@ config ARM64_TLB_RANGE config ARM64_MPAM bool "Enable support for MPAM" - select ARM64_MPAM_DRIVER if EXPERT # does nothing yet + select ARM64_MPAM_DRIVER select ACPI_MPAM if ACPI help Memory System Resource Partitioning and Monitoring (MPAM) is an diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig index c808e0470394..c34e059c6e41 100644 --- a/drivers/resctrl/Kconfig +++ b/drivers/resctrl/Kconfig @@ -1,6 +1,6 @@ menuconfig ARM64_MPAM_DRIVER bool "MPAM driver" - depends on ARM64 && ARM64_MPAM && EXPERT + depends on ARM64 && ARM64_MPAM help Memory System Resource Partitioning and Monitoring (MPAM) driver for System IP, e.g. caches and memory controllers. From 831a7f16728c5ceef04ab99a699c3d9e519dc4b8 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:46 +0000 Subject: [PATCH 049/109] arm64: mpam: Advertise the CPUs MPAM limits to the driver Requesters need to populate the MPAM fields for any traffic they send on the interconnect. For the CPUs these values are taken from the corresponding MPAMy_ELx register. Each requester may have a limit on the largest PARTID or PMG value that can be used. The MPAM driver has to determine the system-wide minimum supported PARTID and PMG values. To do this, the driver needs to be told what each requestor's limit is. CPUs are special, but this infrastructure is also needed for the SMMU and GIC ITS. Call the helper to tell the MPAM driver what the CPUs can do. The return value can be ignored by the arch code as it runs well before the MPAM driver starts probing. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan [ morse: requestor->requester as argued by ispell ] Signed-off-by: James Morse --- arch/arm64/kernel/mpam.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c index 9866d2ca0faa..e6feff2324ac 100644 --- a/arch/arm64/kernel/mpam.c +++ b/arch/arm64/kernel/mpam.c @@ -3,6 +3,7 @@ #include +#include #include #include @@ -11,3 +12,14 @@ DEFINE_PER_CPU(u64, arm64_mpam_default); DEFINE_PER_CPU(u64, arm64_mpam_current); u64 arm64_mpam_global_default; + +static int __init arm64_mpam_register_cpus(void) +{ + u64 mpamidr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + u16 partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, mpamidr); + u8 pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, mpamidr); + + return mpam_register_requestor(partid_max, pmg_max); +} +/* Must occur before mpam_msc_driver_init() from subsys_initcall() */ +arch_initcall(arm64_mpam_register_cpus) From 735dad999905dfd246be1994bb8d203063aeb0d6 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:47 +0000 Subject: [PATCH 050/109] arm64: mpam: Add cpu_pm notifier to restore MPAM sysregs The MPAM system registers will be lost if the CPU is reset during PSCI's CPU_SUSPEND. Add a PM notifier to restore them. mpam_thread_switch(current) can't be used as this won't make any changes if the in-memory copy says the register already has the correct value. In reality the system register is UNKNOWN out of reset. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- arch/arm64/kernel/mpam.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c index e6feff2324ac..48ec0ffd5999 100644 --- a/arch/arm64/kernel/mpam.c +++ b/arch/arm64/kernel/mpam.c @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -13,12 +14,44 @@ DEFINE_PER_CPU(u64, arm64_mpam_current); u64 arm64_mpam_global_default; +static int mpam_pm_notifier(struct notifier_block *self, + unsigned long cmd, void *v) +{ + u64 regval; + int cpu = smp_processor_id(); + + switch (cmd) { + case CPU_PM_EXIT: + /* + * Don't use mpam_thread_switch() as the system register + * value has changed under our feet. + */ + regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + isb(); + + write_sysreg_s(regval, SYS_MPAM0_EL1); + + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block mpam_pm_nb = { + .notifier_call = mpam_pm_notifier, +}; + static int __init arm64_mpam_register_cpus(void) { u64 mpamidr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); u16 partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, mpamidr); u8 pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, mpamidr); + if (!system_supports_mpam()) + return 0; + + cpu_pm_register_notifier(&mpam_pm_nb); return mpam_register_requestor(partid_max, pmg_max); } /* Must occur before mpam_msc_driver_init() from subsys_initcall() */ From 37fe0f984d9ca60e8d95fc9a85d37f4300159625 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:45:48 +0000 Subject: [PATCH 051/109] arm64: mpam: Initialise and context switch the MPAMSM_EL1 register The MPAMSM_EL1 sets the MPAM labels, PMG and PARTID, for loads and stores generated by a shared SMCU. Disable the traps so the kernel can use it and set it to the same configuration as the per-EL cpu MPAM configuration. If an SMCU is not shared with other cpus then it is implementation defined whether the configuration from MPAMSM_EL1 is used or that from the appropriate MPAMy_ELx. As we set the same, PMG_D and PARTID_D, configuration for MPAM0_EL1, MPAM1_EL1 and MPAMSM_EL1 the resulting configuration is the same regardless. The range of valid configurations for the PARTID and PMG in MPAMSM_EL1 is not currently specified in Arm Architectural Reference Manual but the architect has confirmed that it is intended to be the same as that for the cpu configuration in the MPAMy_ELx registers. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Reviewed-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: James Morse --- arch/arm64/include/asm/el2_setup.h | 3 ++- arch/arm64/include/asm/mpam.h | 2 ++ arch/arm64/kernel/cpufeature.c | 2 ++ arch/arm64/kernel/mpam.c | 4 ++++ 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 85f4c1615472..4d15071a4f3f 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -513,7 +513,8 @@ check_override id_aa64pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT, .Linit_mpam_\@, .Lskip_mpam_\@, x1, x2 .Linit_mpam_\@: - msr_s SYS_MPAM2_EL2, xzr // use the default partition + mov x0, #MPAM2_EL2_EnMPAMSM_MASK + msr_s SYS_MPAM2_EL2, x0 // use the default partition, // and disable lower traps mrs_s x0, SYS_MPAMIDR_EL1 tbz x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@ // skip if no MPAMHCR reg diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h index 0747e0526927..6bccbfdccb87 100644 --- a/arch/arm64/include/asm/mpam.h +++ b/arch/arm64/include/asm/mpam.h @@ -53,6 +53,8 @@ static inline void mpam_thread_switch(struct task_struct *tsk) return; write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + if (system_supports_sme()) + write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), SYS_MPAMSM_EL1); isb(); /* Synchronising the EL0 write is left until the ERET to EL0 */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index c3f900f81653..4f34e7a76f64 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2500,6 +2500,8 @@ cpu_enable_mpam(const struct arm64_cpu_capabilities *entry) regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + if (cpus_have_cap(ARM64_SME)) + write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), SYS_MPAMSM_EL1); isb(); /* Synchronising the EL0 write is left until the ERET to EL0 */ diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c index 48ec0ffd5999..3a490de4fa12 100644 --- a/arch/arm64/kernel/mpam.c +++ b/arch/arm64/kernel/mpam.c @@ -28,6 +28,10 @@ static int mpam_pm_notifier(struct notifier_block *self, */ regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + if (system_supports_sme()) { + write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), + SYS_MPAMSM_EL1); + } isb(); write_sysreg_s(regval, SYS_MPAM0_EL1); From 2cf9ca3fae38b7894e7f1435cec92f9a679b42f9 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:49 +0000 Subject: [PATCH 052/109] arm64: mpam: Add helpers to change a task or cpu's MPAM PARTID/PMG values Care must be taken when modifying the PARTID and PMG of a task in any per-task structure as writing these values may race with the task being scheduled in, and reading the modified values. Add helpers to set the task properties, and the CPU default value. These use WRITE_ONCE() that pairs with the READ_ONCE() in mpam_get_regval() to avoid causing torn values. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Cc: Dave Martin Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- arch/arm64/include/asm/mpam.h | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h index 6bccbfdccb87..05aa71200f61 100644 --- a/arch/arm64/include/asm/mpam.h +++ b/arch/arm64/include/asm/mpam.h @@ -4,6 +4,7 @@ #ifndef __ASM__MPAM_H #define __ASM__MPAM_H +#include #include #include #include @@ -22,6 +23,23 @@ DECLARE_PER_CPU(u64, arm64_mpam_current); */ extern u64 arm64_mpam_global_default; +#ifdef CONFIG_ARM64_MPAM +static inline u64 __mpam_regval(u16 partid_d, u16 partid_i, u8 pmg_d, u8 pmg_i) +{ + return FIELD_PREP(MPAM0_EL1_PARTID_D, partid_d) | + FIELD_PREP(MPAM0_EL1_PARTID_I, partid_i) | + FIELD_PREP(MPAM0_EL1_PMG_D, pmg_d) | + FIELD_PREP(MPAM0_EL1_PMG_I, pmg_i); +} + +static inline void mpam_set_cpu_defaults(int cpu, u16 partid_d, u16 partid_i, + u8 pmg_d, u8 pmg_i) +{ + u64 default_val = __mpam_regval(partid_d, partid_i, pmg_d, pmg_i); + + WRITE_ONCE(per_cpu(arm64_mpam_default, cpu), default_val); +} + /* * The resctrl filesystem writes to the partid/pmg values for threads and CPUs, * which may race with reads in mpam_thread_switch(). Ensure only one of the old @@ -30,12 +48,20 @@ extern u64 arm64_mpam_global_default; * value to be stored with cache allocations, despite being considered 'free' by * resctrl. */ -#ifdef CONFIG_ARM64_MPAM static inline u64 mpam_get_regval(struct task_struct *tsk) { return READ_ONCE(task_thread_info(tsk)->mpam_partid_pmg); } +static inline void mpam_set_task_partid_pmg(struct task_struct *tsk, + u16 partid_d, u16 partid_i, + u8 pmg_d, u8 pmg_i) +{ + u64 regval = __mpam_regval(partid_d, partid_i, pmg_d, pmg_i); + + WRITE_ONCE(task_thread_info(tsk)->mpam_partid_pmg, regval); +} + static inline void mpam_thread_switch(struct task_struct *tsk) { u64 oldregval; From 67faed4ccb4f4b8055f50e9cc5670f1fc09ea287 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:50 +0000 Subject: [PATCH 053/109] KVM: arm64: Force guest EL1 to use user-space's partid configuration While we trap the guest's attempts to read/write the MPAM control registers, the hardware continues to use them. Guest-EL0 uses KVM's user-space's configuration, as the value is left in the register, and guest-EL1 uses either the host kernel's configuration, or in the case of VHE, the UNKNOWN reset value of MPAM1_EL1. We want to force the guest-EL1 to use KVM's user-space's MPAM configuration. On nVHE rely on MPAM0_EL1 and MPAM1_EL1 always being programmed the same and on VHE copy MPAM0_EL1 into the guest's MPAM1_EL1. There is no need to restore as this is out of context once TGE is set. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Acked-by: Marc Zyngier Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index b254d442e54e..be685b63e8cf 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -183,6 +183,21 @@ void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt) } NOKPROBE_SYMBOL(sysreg_restore_guest_state_vhe); +/* + * The _EL0 value was written by the host's context switch and belongs to the + * VMM. Copy this into the guest's _EL1 register. + */ +static inline void __mpam_guest_load(void) +{ + u64 mask = MPAM0_EL1_PARTID_D | MPAM0_EL1_PARTID_I | MPAM0_EL1_PMG_D | MPAM0_EL1_PMG_I; + + if (system_supports_mpam()) { + u64 val = (read_sysreg_s(SYS_MPAM0_EL1) & mask) | MPAM1_EL1_MPAMEN; + + write_sysreg_el1(val, SYS_MPAM1); + } +} + /** * __vcpu_load_switch_sysregs - Load guest system registers to the physical CPU * @@ -222,6 +237,7 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) */ __sysreg32_restore_state(vcpu); __sysreg_restore_user_state(guest_ctxt); + __mpam_guest_load(); if (unlikely(is_hyp_ctxt(vcpu))) { __sysreg_restore_vel2_state(vcpu); From 09e61daf8e96b9bdb04dd112bdecf9382fd3f919 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:51 +0000 Subject: [PATCH 054/109] arm_mpam: resctrl: Add boilerplate cpuhp and domain allocation resctrl has its own data structures to describe its resources. We can't use these directly as we play tricks with the 'MBA' resource, picking the MPAM controls or monitors that best apply. We may export the same component as both L3 and MBA. Add mpam_resctrl_res[] as the array of class->resctrl mappings we are exporting, and add the cpuhp hooks that allocated and free the resctrl domain structures. Only the mpam control feature are considered here and monitor support will be added later. While we're here, plumb in a few other obvious things. CONFIG_ARM_CPU_RESCTRL is used to allow this code to be built even though it can't yet be linked against resctrl. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- drivers/resctrl/Makefile | 1 + drivers/resctrl/mpam_devices.c | 12 ++ drivers/resctrl/mpam_internal.h | 21 +++ drivers/resctrl/mpam_resctrl.c | 324 ++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 3 + 5 files changed, 361 insertions(+) create mode 100644 drivers/resctrl/mpam_resctrl.c diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile index 898199dcf80d..40beaf999582 100644 --- a/drivers/resctrl/Makefile +++ b/drivers/resctrl/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_ARM64_MPAM_DRIVER) += mpam.o mpam-y += mpam_devices.o +mpam-$(CONFIG_ARM_CPU_RESCTRL) += mpam_resctrl.o ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG) += -DDEBUG diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index d50461d6ff3f..0e525539b7e2 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1612,6 +1612,9 @@ static int mpam_cpu_online(unsigned int cpu) mpam_reprogram_msc(msc); } + if (mpam_is_enabled()) + return mpam_resctrl_online_cpu(cpu); + return 0; } @@ -1655,6 +1658,9 @@ static int mpam_cpu_offline(unsigned int cpu) { struct mpam_msc *msc; + if (mpam_is_enabled()) + mpam_resctrl_offline_cpu(cpu); + guard(srcu)(&mpam_srcu); list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, srcu_read_lock_held(&mpam_srcu)) { @@ -2500,6 +2506,12 @@ static void mpam_enable_once(void) mutex_unlock(&mpam_list_lock); cpus_read_unlock(); + if (!err) { + err = mpam_resctrl_setup(); + if (err) + pr_err("Failed to initialise resctrl: %d\n", err); + } + if (err) { mpam_disable_reason = "Failed to enable."; schedule_work(&mpam_broken_work); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index a13fb9880ced..43c8e0f5f7ac 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -333,6 +334,16 @@ struct mpam_msc_ris { struct mpam_garbage garbage; }; +struct mpam_resctrl_dom { + struct mpam_component *ctrl_comp; + struct rdt_ctrl_domain resctrl_ctrl_dom; +}; + +struct mpam_resctrl_res { + struct mpam_class *class; + struct rdt_resource resctrl_res; +}; + static inline int mpam_alloc_csu_mon(struct mpam_class *class) { struct mpam_props *cprops = &class->props; @@ -387,6 +398,16 @@ void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx); int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); +#ifdef CONFIG_RESCTRL_FS +int mpam_resctrl_setup(void); +int mpam_resctrl_online_cpu(unsigned int cpu); +void mpam_resctrl_offline_cpu(unsigned int cpu); +#else +static inline int mpam_resctrl_setup(void) { return 0; } +static inline int mpam_resctrl_online_cpu(unsigned int cpu) { return 0; } +static inline void mpam_resctrl_offline_cpu(unsigned int cpu) { } +#endif /* CONFIG_RESCTRL_FS */ + /* * MPAM MSCs have the following register layout. See: * Arm Memory System Resource Partitioning and Monitoring (MPAM) System diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c new file mode 100644 index 000000000000..9a3070970414 --- /dev/null +++ b/drivers/resctrl/mpam_resctrl.c @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. + +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "mpam_internal.h" + +/* + * The classes we've picked to map to resctrl resources, wrapped + * in with their resctrl structure. + * Class pointer may be NULL. + */ +static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; + +#define for_each_mpam_resctrl_control(res, rid) \ + for (rid = 0, res = &mpam_resctrl_controls[rid]; \ + rid < RDT_NUM_RESOURCES; \ + rid++, res = &mpam_resctrl_controls[rid]) + +/* The lock for modifying resctrl's domain lists from cpuhp callbacks. */ +static DEFINE_MUTEX(domain_list_lock); + +bool resctrl_arch_alloc_capable(void) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + for_each_mpam_resctrl_control(res, rid) { + if (res->resctrl_res.alloc_capable) + return true; + } + + return false; +} + +/* + * MSC may raise an error interrupt if it sees an out or range partid/pmg, + * and go on to truncate the value. Regardless of what the hardware supports, + * only the system wide safe value is safe to use. + */ +u32 resctrl_arch_get_num_closid(struct rdt_resource *ignored) +{ + return mpam_partid_max + 1; +} + +struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) +{ + if (l >= RDT_NUM_RESOURCES) + return NULL; + + return &mpam_resctrl_controls[l].resctrl_res; +} + +static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) +{ + /* TODO: initialise the resctrl resources */ + + return 0; +} + +static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) +{ + struct mpam_class *class = comp->class; + + if (class->type == MPAM_CLASS_CACHE) + return comp->comp_id; + + /* TODO: repaint domain ids to match the L3 domain ids */ + /* Otherwise, expose the ID used by the firmware table code. */ + return comp->comp_id; +} + +static void mpam_resctrl_domain_hdr_init(int cpu, struct mpam_component *comp, + enum resctrl_res_level rid, + struct rdt_domain_hdr *hdr) +{ + lockdep_assert_cpus_held(); + + INIT_LIST_HEAD(&hdr->list); + hdr->id = mpam_resctrl_pick_domain_id(cpu, comp); + hdr->rid = rid; + cpumask_set_cpu(cpu, &hdr->cpu_mask); +} + +static void mpam_resctrl_online_domain_hdr(unsigned int cpu, + struct rdt_domain_hdr *hdr) +{ + lockdep_assert_cpus_held(); + + cpumask_set_cpu(cpu, &hdr->cpu_mask); +} + +/** + * mpam_resctrl_offline_domain_hdr() - Update the domain header to remove a CPU. + * @cpu: The CPU to remove from the domain. + * @hdr: The domain's header. + * + * Removes @cpu from the header mask. If this was the last CPU in the domain, + * the domain header is removed from its parent list and true is returned, + * indicating the parent structure can be freed. + * If there are other CPUs in the domain, returns false. + */ +static bool mpam_resctrl_offline_domain_hdr(unsigned int cpu, + struct rdt_domain_hdr *hdr) +{ + lockdep_assert_held(&domain_list_lock); + + cpumask_clear_cpu(cpu, &hdr->cpu_mask); + if (cpumask_empty(&hdr->cpu_mask)) { + list_del_rcu(&hdr->list); + synchronize_rcu(); + return true; + } + + return false; +} + +static void mpam_resctrl_domain_insert(struct list_head *list, + struct rdt_domain_hdr *new) +{ + struct rdt_domain_hdr *err; + struct list_head *pos = NULL; + + lockdep_assert_held(&domain_list_lock); + + err = resctrl_find_domain(list, new->id, &pos); + if (WARN_ON_ONCE(err)) + return; + + list_add_tail_rcu(&new->list, pos); +} + +static struct mpam_resctrl_dom * +mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) +{ + int err; + struct mpam_resctrl_dom *dom; + struct rdt_ctrl_domain *ctrl_d; + struct mpam_class *class = res->class; + struct mpam_component *comp_iter, *ctrl_comp; + struct rdt_resource *r = &res->resctrl_res; + + lockdep_assert_held(&domain_list_lock); + + ctrl_comp = NULL; + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp_iter, &class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_test_cpu(cpu, &comp_iter->affinity)) { + ctrl_comp = comp_iter; + break; + } + } + + /* class has no component for this CPU */ + if (WARN_ON_ONCE(!ctrl_comp)) + return ERR_PTR(-EINVAL); + + dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, cpu_to_node(cpu)); + if (!dom) + return ERR_PTR(-ENOMEM); + + if (r->alloc_capable) { + dom->ctrl_comp = ctrl_comp; + + ctrl_d = &dom->resctrl_ctrl_dom; + mpam_resctrl_domain_hdr_init(cpu, ctrl_comp, r->rid, &ctrl_d->hdr); + ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN; + err = resctrl_online_ctrl_domain(r, ctrl_d); + if (err) + goto free_domain; + + mpam_resctrl_domain_insert(&r->ctrl_domains, &ctrl_d->hdr); + } else { + pr_debug("Skipped control domain online - no controls\n"); + } + return dom; + +free_domain: + kfree(dom); + dom = ERR_PTR(err); + + return dom; +} + +static struct mpam_resctrl_dom * +mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) +{ + struct mpam_resctrl_dom *dom; + struct rdt_resource *r = &res->resctrl_res; + + lockdep_assert_cpus_held(); + + list_for_each_entry_rcu(dom, &r->ctrl_domains, resctrl_ctrl_dom.hdr.list) { + if (cpumask_test_cpu(cpu, &dom->ctrl_comp->affinity)) + return dom; + } + + return NULL; +} + +int mpam_resctrl_online_cpu(unsigned int cpu) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + guard(mutex)(&domain_list_lock); + for_each_mpam_resctrl_control(res, rid) { + struct mpam_resctrl_dom *dom; + struct rdt_resource *r = &res->resctrl_res; + + if (!res->class) + continue; // dummy_resource; + + dom = mpam_resctrl_get_domain_from_cpu(cpu, res); + if (!dom) { + dom = mpam_resctrl_alloc_domain(cpu, res); + if (IS_ERR(dom)) + return PTR_ERR(dom); + } else { + if (r->alloc_capable) { + struct rdt_ctrl_domain *ctrl_d = &dom->resctrl_ctrl_dom; + + mpam_resctrl_online_domain_hdr(cpu, &ctrl_d->hdr); + } + } + } + + resctrl_online_cpu(cpu); + + return 0; +} + +void mpam_resctrl_offline_cpu(unsigned int cpu) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + resctrl_offline_cpu(cpu); + + guard(mutex)(&domain_list_lock); + for_each_mpam_resctrl_control(res, rid) { + struct mpam_resctrl_dom *dom; + struct rdt_ctrl_domain *ctrl_d; + bool ctrl_dom_empty; + struct rdt_resource *r = &res->resctrl_res; + + if (!res->class) + continue; // dummy resource + + dom = mpam_resctrl_get_domain_from_cpu(cpu, res); + if (WARN_ON_ONCE(!dom)) + continue; + + if (r->alloc_capable) { + ctrl_d = &dom->resctrl_ctrl_dom; + ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); + if (ctrl_dom_empty) + resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); + } else { + ctrl_dom_empty = true; + } + + if (ctrl_dom_empty) + kfree(dom); + } +} + +int mpam_resctrl_setup(void) +{ + int err = 0; + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + cpus_read_lock(); + for_each_mpam_resctrl_control(res, rid) { + INIT_LIST_HEAD_RCU(&res->resctrl_res.ctrl_domains); + res->resctrl_res.rid = rid; + } + + /* TODO: pick MPAM classes to map to resctrl resources */ + + /* Initialise the resctrl structures from the classes */ + for_each_mpam_resctrl_control(res, rid) { + if (!res->class) + continue; // dummy resource + + err = mpam_resctrl_control_init(res); + if (err) { + pr_debug("Failed to initialise rid %u\n", rid); + break; + } + } + cpus_read_unlock(); + + if (err) { + pr_debug("Internal error %d - resctrl not supported\n", err); + return err; + } + + if (!resctrl_arch_alloc_capable()) { + pr_debug("No alloc(%u) found - resctrl not supported\n", + resctrl_arch_alloc_capable()); + return -EOPNOTSUPP; + } + + /* TODO: call resctrl_init() */ + + return 0; +} diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 7f00c5285a32..2c7d1413a401 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -49,6 +49,9 @@ static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, } #endif +bool resctrl_arch_alloc_capable(void); +bool resctrl_arch_mon_capable(void); + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. From 52a4edb16121d07734e4e392767d26d286f08c35 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:52 +0000 Subject: [PATCH 055/109] arm_mpam: resctrl: Pick the caches we will use as resctrl resources Systems with MPAM support may have a variety of control types at any point of their system layout. We can only expose certain types of control, and only if they exist at particular locations. Start with the well-known caches. These have to be depth 2 or 3 and support MPAM's cache portion bitmap controls, with a number of portions fewer than resctrl's limit. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- drivers/resctrl/mpam_resctrl.c | 91 +++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 9a3070970414..65bb670dc3fb 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -65,9 +65,95 @@ struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) return &mpam_resctrl_controls[l].resctrl_res; } +static bool cache_has_usable_cpor(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_cpor_part, cprops)) + return false; + + /* resctrl uses u32 for all bitmap configurations */ + return class->props.cpbm_wd <= 32; +} + +/* Test whether we can export MPAM_CLASS_CACHE:{2,3}? */ +static void mpam_resctrl_pick_caches(void) +{ + struct mpam_class *class; + struct mpam_resctrl_res *res; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) { + if (class->type != MPAM_CLASS_CACHE) { + pr_debug("class %u is not a cache\n", class->level); + continue; + } + + if (class->level != 2 && class->level != 3) { + pr_debug("class %u is not L2 or L3\n", class->level); + continue; + } + + if (!cache_has_usable_cpor(class)) { + pr_debug("class %u cache misses CPOR\n", class->level); + continue; + } + + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) { + pr_debug("class %u has missing CPUs, mask %*pb != %*pb\n", class->level, + cpumask_pr_args(&class->affinity), + cpumask_pr_args(cpu_possible_mask)); + continue; + } + + if (class->level == 2) + res = &mpam_resctrl_controls[RDT_RESOURCE_L2]; + else + res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + res->class = class; + } +} + static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) { - /* TODO: initialise the resctrl resources */ + struct mpam_class *class = res->class; + struct rdt_resource *r = &res->resctrl_res; + + switch (r->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + r->schema_fmt = RESCTRL_SCHEMA_BITMAP; + r->cache.arch_has_sparse_bitmasks = true; + + r->cache.cbm_len = class->props.cpbm_wd; + /* mpam_devices will reject empty bitmaps */ + r->cache.min_cbm_bits = 1; + + if (r->rid == RDT_RESOURCE_L2) { + r->name = "L2"; + r->ctrl_scope = RESCTRL_L2_CACHE; + r->cdp_capable = true; + } else { + r->name = "L3"; + r->ctrl_scope = RESCTRL_L3_CACHE; + r->cdp_capable = true; + } + + /* + * Which bits are shared with other ...things... Unknown + * devices use partid-0 which uses all the bitmap fields. Until + * we have configured the SMMU and GIC not to do this 'all the + * bits' is the correct answer here. + */ + r->cache.shareable_bits = resctrl_get_default_ctrl(r); + r->alloc_capable = true; + break; + default: + return -EINVAL; + } return 0; } @@ -292,7 +378,8 @@ int mpam_resctrl_setup(void) res->resctrl_res.rid = rid; } - /* TODO: pick MPAM classes to map to resctrl resources */ + /* Find some classes to use for controls */ + mpam_resctrl_pick_caches(); /* Initialise the resctrl structures from the classes */ for_each_mpam_resctrl_control(res, rid) { From 370d166d878d0c0aa06568d67387a1151a200501 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:53 +0000 Subject: [PATCH 056/109] arm_mpam: resctrl: Implement resctrl_arch_reset_all_ctrls() We already have a helper for resetting an mpam class and component. Hook it up to resctrl_arch_reset_all_ctrls() and the domain offline path. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Shaopeng Tan Reviewed-by: Zeng Heng Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- drivers/resctrl/mpam_devices.c | 2 +- drivers/resctrl/mpam_internal.h | 3 +++ drivers/resctrl/mpam_resctrl.c | 13 +++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 0e525539b7e2..0e5e24ef60fe 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2551,7 +2551,7 @@ static void mpam_reset_component_locked(struct mpam_component *comp) } } -static void mpam_reset_class_locked(struct mpam_class *class) +void mpam_reset_class_locked(struct mpam_class *class) { struct mpam_component *comp; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 43c8e0f5f7ac..f063a741aaba 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -388,6 +388,9 @@ extern u8 mpam_pmg_max; void mpam_enable(struct work_struct *work); void mpam_disable(struct work_struct *work); +/* Reset all the RIS in a class under cpus_read_lock() */ +void mpam_reset_class_locked(struct mpam_class *class); + int mpam_apply_config(struct mpam_component *comp, u16 partid, struct mpam_config *cfg); diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 65bb670dc3fb..b2217d11561d 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -170,6 +170,19 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) return comp->comp_id; } +void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) +{ + struct mpam_resctrl_res *res; + + lockdep_assert_cpus_held(); + + if (!mpam_is_enabled()) + return; + + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + mpam_reset_class_locked(res->class); +} + static void mpam_resctrl_domain_hdr_init(int cpu, struct mpam_component *comp, enum resctrl_res_level rid, struct rdt_domain_hdr *hdr) From 02cc661687886563a0e08ecee51c5ef7d1737237 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:54 +0000 Subject: [PATCH 057/109] arm_mpam: resctrl: Add resctrl_arch_get_config() Implement resctrl_arch_get_config() by testing the live configuration for a CPOR bitmap. For any other configuration type return the default. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- drivers/resctrl/mpam_resctrl.c | 43 ++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index b2217d11561d..3af57b6f2c1b 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -170,6 +170,49 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) return comp->comp_id; } +u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, + u32 closid, enum resctrl_conf_type type) +{ + u32 partid; + struct mpam_config *cfg; + struct mpam_props *cprops; + struct mpam_resctrl_res *res; + struct mpam_resctrl_dom *dom; + enum mpam_device_features configured_by; + + lockdep_assert_cpus_held(); + + if (!mpam_is_enabled()) + return resctrl_get_default_ctrl(r); + + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); + cprops = &res->class->props; + + partid = resctrl_get_config_index(closid, type); + cfg = &dom->ctrl_comp->cfg[partid]; + + switch (r->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + configured_by = mpam_feat_cpor_part; + break; + default: + return resctrl_get_default_ctrl(r); + } + + if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r) || + !mpam_has_feature(configured_by, cfg)) + return resctrl_get_default_ctrl(r); + + switch (configured_by) { + case mpam_feat_cpor_part: + return cfg->cpbm; + default: + return resctrl_get_default_ctrl(r); + } +} + void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) { struct mpam_resctrl_res *res; From 9cd2b522be2cc64fab179d75537d2e8df38d26a6 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:55 +0000 Subject: [PATCH 058/109] arm_mpam: resctrl: Implement helpers to update configuration resctrl has two helpers for updating the configuration. resctrl_arch_update_one() updates a single value, and is used by the software-controller to apply feedback to the bandwidth controls, it has to be called on one of the CPUs in the resctrl:domain. resctrl_arch_update_domains() copies multiple staged configurations, it can be called from anywhere. Both helpers should update any changes to the underlying hardware. Implement resctrl_arch_update_domains() to use resctrl_arch_update_one(). Neither need to be called on a specific CPU as the mpam driver will send IPIs as needed. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- drivers/resctrl/mpam_resctrl.c | 70 ++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 3af57b6f2c1b..ea60777934ff 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -213,6 +213,76 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, } } +int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, + u32 closid, enum resctrl_conf_type t, u32 cfg_val) +{ + u32 partid; + struct mpam_config cfg; + struct mpam_props *cprops; + struct mpam_resctrl_res *res; + struct mpam_resctrl_dom *dom; + + lockdep_assert_cpus_held(); + lockdep_assert_irqs_enabled(); + + /* + * No need to check the CPU as mpam_apply_config() doesn't care, and + * resctrl_arch_update_domains() relies on this. + */ + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); + cprops = &res->class->props; + + partid = resctrl_get_config_index(closid, t); + if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r)) { + pr_debug("Not alloc capable or computed PARTID out of range\n"); + return -EINVAL; + } + + /* + * Copy the current config to avoid clearing other resources when the + * same component is exposed multiple times through resctrl. + */ + cfg = dom->ctrl_comp->cfg[partid]; + + switch (r->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + cfg.cpbm = cfg_val; + mpam_set_feature(mpam_feat_cpor_part, &cfg); + break; + default: + return -EINVAL; + } + + return mpam_apply_config(dom->ctrl_comp, partid, &cfg); +} + +int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) +{ + int err; + struct rdt_ctrl_domain *d; + + lockdep_assert_cpus_held(); + lockdep_assert_irqs_enabled(); + + list_for_each_entry_rcu(d, &r->ctrl_domains, hdr.list) { + for (enum resctrl_conf_type t = 0; t < CDP_NUM_TYPES; t++) { + struct resctrl_staged_config *cfg = &d->staged_config[t]; + + if (!cfg->have_new_ctrl) + continue; + + err = resctrl_arch_update_one(r, d, closid, t, + cfg->new_ctrl); + if (err) + return err; + } + } + + return 0; +} + void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) { struct mpam_resctrl_res *res; From 9d2e1a99fae58ce992f147bdf83b5d9089f70b27 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:56 +0000 Subject: [PATCH 059/109] arm_mpam: resctrl: Add plumbing against arm64 task and cpu hooks arm64 provides helpers for changing a task's and a cpu's mpam partid/pmg values. These are used to back a number of resctrl_arch_ functions. Connect them up. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- drivers/resctrl/mpam_resctrl.c | 58 ++++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 5 +++ 2 files changed, 63 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index ea60777934ff..9cde5b7e644c 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,8 @@ static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; /* The lock for modifying resctrl's domain lists from cpuhp callbacks. */ static DEFINE_MUTEX(domain_list_lock); +static bool cdp_enabled; + bool resctrl_arch_alloc_capable(void) { struct mpam_resctrl_res *res; @@ -57,6 +60,61 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *ignored) return mpam_partid_max + 1; } +void resctrl_arch_sched_in(struct task_struct *tsk) +{ + lockdep_assert_preemption_disabled(); + + mpam_thread_switch(tsk); +} + +void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid) +{ + WARN_ON_ONCE(closid > U16_MAX); + WARN_ON_ONCE(rmid > U8_MAX); + + if (!cdp_enabled) { + mpam_set_cpu_defaults(cpu, closid, closid, rmid, rmid); + } else { + /* + * When CDP is enabled, resctrl halves the closid range and we + * use odd/even partid for one closid. + */ + u32 partid_d = resctrl_get_config_index(closid, CDP_DATA); + u32 partid_i = resctrl_get_config_index(closid, CDP_CODE); + + mpam_set_cpu_defaults(cpu, partid_d, partid_i, rmid, rmid); + } +} + +void resctrl_arch_sync_cpu_closid_rmid(void *info) +{ + struct resctrl_cpu_defaults *r = info; + + lockdep_assert_preemption_disabled(); + + if (r) { + resctrl_arch_set_cpu_default_closid_rmid(smp_processor_id(), + r->closid, r->rmid); + } + + resctrl_arch_sched_in(current); +} + +void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid) +{ + WARN_ON_ONCE(closid > U16_MAX); + WARN_ON_ONCE(rmid > U8_MAX); + + if (!cdp_enabled) { + mpam_set_task_partid_pmg(tsk, closid, closid, rmid, rmid); + } else { + u32 partid_d = resctrl_get_config_index(closid, CDP_DATA); + u32 partid_i = resctrl_get_config_index(closid, CDP_CODE); + + mpam_set_task_partid_pmg(tsk, partid_d, partid_i, rmid, rmid); + } +} + struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) { if (l >= RDT_NUM_RESOURCES) diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 2c7d1413a401..5a78299ec464 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -52,6 +52,11 @@ static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, bool resctrl_arch_alloc_capable(void); bool resctrl_arch_mon_capable(void); +void resctrl_arch_set_cpu_default_closid(int cpu, u32 closid); +void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid); +void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid); +void resctrl_arch_sched_in(struct task_struct *tsk); + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. From 6789fb99282c0a8e8e84701b7edf456f4a9e71e2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:57 +0000 Subject: [PATCH 060/109] arm_mpam: resctrl: Add CDP emulation Intel RDT's CDP feature allows the cache to use a different control value depending on whether the accesses was for instruction fetch or a data access. MPAM's equivalent feature is the other way up: the CPU assigns a different partid label to traffic depending on whether it was instruction fetch or a data access, which causes the cache to use a different control value based solely on the partid. MPAM can emulate CDP, with the side effect that the alternative partid is seen by all MSC, it can't be enabled per-MSC. Add the resctrl hooks to turn this on or off. Add the helpers that match a closid against a task, which need to be aware that the value written to hardware is not the same as the one resctrl is using. Update the 'arm64_mpam_global_default' variable the arch code uses during context switch to know when the per-cpu value should be used instead. Also, update these per-cpu values and sync the resulting mpam partid/pmg configuration to hardware. resctrl can enable CDP for L2 caches, L3 caches or both. When it is enabled by one and not the other MPAM globally enabled CDP but hides the effect on the other cache resource. This hiding is possible as CPOR is the only supported cache control and that uses a resource bitmap; two partids with the same bitmap act as one. Awkwardly, the MB controls don't implement CDP and CDP can't be hidden as the memory bandwidth control is a maximum per partid which can't be modelled with more partids. If the total maximum is used for both the data and instruction partids then then the maximum may be exceeded and if it is split in two then the one using more bandwidth will hit a lower limit. Hence, hide the MB controls completely if CDP is enabled for any resource. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Cc: Dave Martin Cc: Amit Singh Tomar Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- arch/arm64/include/asm/mpam.h | 1 + drivers/resctrl/mpam_internal.h | 1 + drivers/resctrl/mpam_resctrl.c | 122 ++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 2 + 4 files changed, 126 insertions(+) diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h index 05aa71200f61..70d396e7b6da 100644 --- a/arch/arm64/include/asm/mpam.h +++ b/arch/arm64/include/asm/mpam.h @@ -4,6 +4,7 @@ #ifndef __ASM__MPAM_H #define __ASM__MPAM_H +#include #include #include #include diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index f063a741aaba..2751eeaba302 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -342,6 +342,7 @@ struct mpam_resctrl_dom { struct mpam_resctrl_res { struct mpam_class *class; struct rdt_resource resctrl_res; + bool cdp_enabled; }; static inline int mpam_alloc_csu_mon(struct mpam_class *class) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 9cde5b7e644c..2111542f485e 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -35,6 +35,10 @@ static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; /* The lock for modifying resctrl's domain lists from cpuhp callbacks. */ static DEFINE_MUTEX(domain_list_lock); +/* + * MPAM emulates CDP by setting different PARTID in the I/D fields of MPAM0_EL1. + * This applies globally to all traffic the CPU generates. + */ static bool cdp_enabled; bool resctrl_arch_alloc_capable(void) @@ -50,6 +54,74 @@ bool resctrl_arch_alloc_capable(void) return false; } +bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid) +{ + return mpam_resctrl_controls[rid].cdp_enabled; +} + +/** + * resctrl_reset_task_closids() - Reset the PARTID/PMG values for all tasks. + * + * At boot, all existing tasks use partid zero for D and I. + * To enable/disable CDP emulation, all these tasks need relabelling. + */ +static void resctrl_reset_task_closids(void) +{ + struct task_struct *p, *t; + + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { + resctrl_arch_set_closid_rmid(t, RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID); + } + read_unlock(&tasklist_lock); +} + +int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) +{ + u32 partid_i = RESCTRL_RESERVED_CLOSID, partid_d = RESCTRL_RESERVED_CLOSID; + int cpu; + + /* + * resctrl_arch_set_cdp_enabled() is only called with enable set to + * false on error and unmount. + */ + cdp_enabled = enable; + mpam_resctrl_controls[rid].cdp_enabled = enable; + + /* The mbw_max feature can't hide cdp as it's a per-partid maximum. */ + if (cdp_enabled && !mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled) + mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = false; + + if (mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled && + mpam_resctrl_controls[RDT_RESOURCE_MBA].class) + mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = true; + + if (enable) { + if (mpam_partid_max < 1) + return -EINVAL; + + partid_d = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_DATA); + partid_i = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_CODE); + } + + mpam_set_task_partid_pmg(current, partid_d, partid_i, 0, 0); + WRITE_ONCE(arm64_mpam_global_default, mpam_get_regval(current)); + + resctrl_reset_task_closids(); + + for_each_possible_cpu(cpu) + mpam_set_cpu_defaults(cpu, partid_d, partid_i, 0, 0); + on_each_cpu(resctrl_arch_sync_cpu_closid_rmid, NULL, 1); + + return 0; +} + +static bool mpam_resctrl_hide_cdp(enum resctrl_res_level rid) +{ + return cdp_enabled && !resctrl_arch_get_cdp_enabled(rid); +} + /* * MSC may raise an error interrupt if it sees an out or range partid/pmg, * and go on to truncate the value. Regardless of what the hardware supports, @@ -115,6 +187,30 @@ void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid) } } +bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid) +{ + u64 regval = mpam_get_regval(tsk); + u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval); + + if (cdp_enabled) + tsk_closid >>= 1; + + return tsk_closid == closid; +} + +/* The task's pmg is not unique, the partid must be considered too */ +bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid) +{ + u64 regval = mpam_get_regval(tsk); + u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval); + u32 tsk_rmid = FIELD_GET(MPAM0_EL1_PMG_D, regval); + + if (cdp_enabled) + tsk_closid >>= 1; + + return (tsk_closid == closid) && (tsk_rmid == rmid); +} + struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) { if (l >= RDT_NUM_RESOURCES) @@ -247,6 +343,14 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); cprops = &res->class->props; + /* + * When CDP is enabled, but the resource doesn't support it, + * the control is cloned across both partids. + * Pick one at random to read: + */ + if (mpam_resctrl_hide_cdp(r->rid)) + type = CDP_DATA; + partid = resctrl_get_config_index(closid, type); cfg = &dom->ctrl_comp->cfg[partid]; @@ -274,6 +378,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val) { + int err; u32 partid; struct mpam_config cfg; struct mpam_props *cprops; @@ -291,6 +396,9 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); cprops = &res->class->props; + if (mpam_resctrl_hide_cdp(r->rid)) + t = CDP_DATA; + partid = resctrl_get_config_index(closid, t); if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r)) { pr_debug("Not alloc capable or computed PARTID out of range\n"); @@ -313,6 +421,20 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, return -EINVAL; } + /* + * When CDP is enabled, but the resource doesn't support it, we need to + * apply the same configuration to the other partid. + */ + if (mpam_resctrl_hide_cdp(r->rid)) { + partid = resctrl_get_config_index(closid, CDP_CODE); + err = mpam_apply_config(dom->ctrl_comp, partid, &cfg); + if (err) + return err; + + partid = resctrl_get_config_index(closid, CDP_DATA); + return mpam_apply_config(dom->ctrl_comp, partid, &cfg); + } + return mpam_apply_config(dom->ctrl_comp, partid, &cfg); } diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 5a78299ec464..d329b1dc148b 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -56,6 +56,8 @@ void resctrl_arch_set_cpu_default_closid(int cpu, u32 closid); void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid); void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid); void resctrl_arch_sched_in(struct task_struct *tsk); +bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid); +bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid); /** * mpam_register_requestor() - Register a requestor with the MPAM driver From 01a0021f6c39557037bfc41ede7230a0696677ff Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:45:58 +0000 Subject: [PATCH 061/109] arm_mpam: resctrl: Hide CDP emulation behind CONFIG_EXPERT When CDP is not enabled, the 'rmid_entry's in the limbo list, rmid_busy_llc, map directly to a (PARTID,PMG) pair and when CDP is enabled the mapping is to two different pairs. As the limbo list is reused between mounts and CDP disabled on unmount this can lead to stale mapping and the limbo handler will then make monitor reads with potentially out of range PARTID. This may then cause an MPAM error interrupt and the driver will disable MPAM. No problems are expected if you just mount the resctrl file system once with CDP enabled and never unmount it. Hide CDP emulation behind CONFIG_EXPERT to protect the unwary. Signed-off-by: Ben Horgan Reviewed-by: Gavin Shan Reviewed-by: Zeng Heng Reviewed-by: James Morse Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Jesse Chick Signed-off-by: James Morse --- drivers/resctrl/mpam_resctrl.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 2111542f485e..2331e6ddb814 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -82,6 +82,18 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) u32 partid_i = RESCTRL_RESERVED_CLOSID, partid_d = RESCTRL_RESERVED_CLOSID; int cpu; + if (!IS_ENABLED(CONFIG_EXPERT) && enable) { + /* + * If the resctrl fs is mounted more than once, sequentially, + * then CDP can lead to the use of out of range PARTIDs. + */ + pr_warn("CDP not supported\n"); + return -EOPNOTSUPP; + } + + if (enable) + pr_warn("CDP is an expert feature and may cause MPAM to malfunction.\n"); + /* * resctrl_arch_set_cdp_enabled() is only called with enable set to * false on error and unmount. From 80d147d293130ee3c8a395cbbea1813e26ab9a1b Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 13 Mar 2026 14:45:59 +0000 Subject: [PATCH 062/109] arm_mpam: resctrl: Convert to/from MPAMs fixed-point formats MPAM uses a fixed-point formats for some hardware controls. Resctrl provides the bandwidth controls as a percentage. Add helpers to convert between these. Ensure bwa_wd is at most 16 to make it clear higher values have no meaning. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Signed-off-by: Dave Martin Signed-off-by: Ben Horgan Signed-off-by: James Morse --- drivers/resctrl/mpam_devices.c | 7 +++++ drivers/resctrl/mpam_resctrl.c | 51 ++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 0e5e24ef60fe..0c97f7708722 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -713,6 +713,13 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) mpam_set_feature(mpam_feat_mbw_part, props); props->bwa_wd = FIELD_GET(MPAMF_MBW_IDR_BWA_WD, mbw_features); + + /* + * The BWA_WD field can represent 0-63, but the control fields it + * describes have a maximum of 16 bits. + */ + props->bwa_wd = min(props->bwa_wd, 16); + if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MAX, mbw_features)) mpam_set_feature(mpam_feat_mbw_max, props); diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 2331e6ddb814..240a06df2f07 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -242,6 +243,56 @@ static bool cache_has_usable_cpor(struct mpam_class *class) return class->props.cpbm_wd <= 32; } +/* + * Each fixed-point hardware value architecturally represents a range + * of values: the full range 0% - 100% is split contiguously into + * (1 << cprops->bwa_wd) equal bands. + * + * Although the bwa_bwd fields have 6 bits the maximum valid value is 16 + * as it reports the width of fields that are at most 16 bits. When + * fewer than 16 bits are valid the least significant bits are + * ignored. The implied binary point is kept between bits 15 and 16 and + * so the valid bits are leftmost. + * + * See ARM IHI0099B.a "MPAM system component specification", Section 9.3, + * "The fixed-point fractional format" for more information. + * + * Find the nearest percentage value to the upper bound of the selected band: + */ +static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops) +{ + u32 val = mbw_max; + + val >>= 16 - cprops->bwa_wd; + val += 1; + val *= MAX_MBA_BW; + val = DIV_ROUND_CLOSEST(val, 1 << cprops->bwa_wd); + + return val; +} + +/* + * Find the band whose upper bound is closest to the specified percentage. + * + * A round-to-nearest policy is followed here as a balanced compromise + * between unexpected under-commit of the resource (where the total of + * a set of resource allocations after conversion is less than the + * expected total, due to rounding of the individual converted + * percentages) and over-commit (where the total of the converted + * allocations is greater than expected). + */ +static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) +{ + u32 val = pc; + + val <<= cprops->bwa_wd; + val = DIV_ROUND_CLOSEST(val, MAX_MBA_BW); + val = max(val, 1) - 1; + val <<= 16 - cprops->bwa_wd; + + return val; +} + /* Test whether we can export MPAM_CLASS_CACHE:{2,3}? */ static void mpam_resctrl_pick_caches(void) { From 3e9b35823aabcb85cc039960256426e50f1fd601 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:46:00 +0000 Subject: [PATCH 063/109] arm_mpam: resctrl: Add rmid index helpers Because MPAM's pmg aren't identical to RDT's rmid, resctrl handles some data structures by index. This allows x86 to map indexes to RMID, and MPAM to map them to partid-and-pmg. Add the helpers to do this. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Suggested-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: James Morse --- drivers/resctrl/mpam_resctrl.c | 16 ++++++++++++++++ include/linux/arm_mpam.h | 3 +++ 2 files changed, 19 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 240a06df2f07..370830ab1119 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -145,6 +145,22 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *ignored) return mpam_partid_max + 1; } +u32 resctrl_arch_system_num_rmid_idx(void) +{ + return (mpam_pmg_max + 1) * (mpam_partid_max + 1); +} + +u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid) +{ + return closid * (mpam_pmg_max + 1) + rmid; +} + +void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid) +{ + *closid = idx / (mpam_pmg_max + 1); + *rmid = idx % (mpam_pmg_max + 1); +} + void resctrl_arch_sched_in(struct task_struct *tsk) { lockdep_assert_preemption_disabled(); diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index d329b1dc148b..7d23c90f077d 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -58,6 +58,9 @@ void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid); void resctrl_arch_sched_in(struct task_struct *tsk); bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid); bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid); +u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid); +void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid); +u32 resctrl_arch_system_num_rmid_idx(void); /** * mpam_register_requestor() - Register a requestor with the MPAM driver From 1c1e2968a860c5af9fca67f1c0e88aab83ace0b3 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:46:01 +0000 Subject: [PATCH 064/109] arm_mpam: resctrl: Wait for cacheinfo to be ready In order to calculate the rmid realloc threshold the size of the cache needs to be known. Cache domains will also be named after the cache id. So that this information can be extracted from cacheinfo we need to wait for it to be ready. The cacheinfo information is populated in device_initcall() so we wait for that. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- drivers/resctrl/mpam_resctrl.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 370830ab1119..bf91cff05daf 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -42,6 +43,13 @@ static DEFINE_MUTEX(domain_list_lock); */ static bool cdp_enabled; +/* + * We use cacheinfo to discover the size of the caches and their id. cacheinfo + * populates this from a device_initcall(). mpam_resctrl_setup() must wait. + */ +static bool cacheinfo_ready; +static DECLARE_WAIT_QUEUE_HEAD(wait_cacheinfo_ready); + bool resctrl_arch_alloc_capable(void) { struct mpam_resctrl_res *res; @@ -757,6 +765,8 @@ int mpam_resctrl_setup(void) struct mpam_resctrl_res *res; enum resctrl_res_level rid; + wait_event(wait_cacheinfo_ready, cacheinfo_ready); + cpus_read_lock(); for_each_mpam_resctrl_control(res, rid) { INIT_LIST_HEAD_RCU(&res->resctrl_res.ctrl_domains); @@ -794,3 +804,12 @@ int mpam_resctrl_setup(void) return 0; } + +static int __init __cacheinfo_ready(void) +{ + cacheinfo_ready = true; + wake_up(&wait_cacheinfo_ready); + + return 0; +} +device_initcall_sync(__cacheinfo_ready); From 36528c7681b8093f5f9270d2af7c4326d771f181 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:02 +0000 Subject: [PATCH 065/109] arm_mpam: resctrl: Add support for 'MB' resource resctrl supports 'MB', as a percentage throttling of traffic from the L3. This is the control that mba_sc uses, so ideally the class chosen should be as close as possible to the counters used for mbm_total. If there is a single L3, it's the last cache, and the topology of the memory matches then the traffic at the memory controller will be equivalent to that at egress of the L3. If these conditions are met allow the memory class to back MB. MB's percentage control should be backed either with the fixed point fraction MBW_MAX or bandwidth portion bitmaps. The bandwidth portion bitmaps is not used as its tricky to pick which bits to use to avoid contention, and may be possible to expose this as something other than a percentage in the future. Tested-by: Shaopeng Tan Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Gavin Shan Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Dave Martin Signed-off-by: Dave Martin Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- drivers/resctrl/mpam_resctrl.c | 281 ++++++++++++++++++++++++++++++++- 1 file changed, 280 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index bf91cff05daf..60d111f7abfd 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -267,6 +267,33 @@ static bool cache_has_usable_cpor(struct mpam_class *class) return class->props.cpbm_wd <= 32; } +static bool mba_class_use_mbw_max(struct mpam_props *cprops) +{ + return (mpam_has_feature(mpam_feat_mbw_max, cprops) && + cprops->bwa_wd); +} + +static bool class_has_usable_mba(struct mpam_props *cprops) +{ + return mba_class_use_mbw_max(cprops); +} + +/* + * Calculate the worst-case percentage change from each implemented step + * in the control. + */ +static u32 get_mba_granularity(struct mpam_props *cprops) +{ + if (!mba_class_use_mbw_max(cprops)) + return 0; + + /* + * bwa_wd is the number of bits implemented in the 0.xxx + * fixed point fraction. 1 bit is 50%, 2 is 25% etc. + */ + return DIV_ROUND_UP(MAX_MBA_BW, 1 << cprops->bwa_wd); +} + /* * Each fixed-point hardware value architecturally represents a range * of values: the full range 0% - 100% is split contiguously into @@ -317,6 +344,160 @@ static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) return val; } +static u32 get_mba_min(struct mpam_props *cprops) +{ + if (!mba_class_use_mbw_max(cprops)) { + WARN_ON_ONCE(1); + return 0; + } + + return mbw_max_to_percent(0, cprops); +} + +/* Find the L3 cache that has affinity with this CPU */ +static int find_l3_equivalent_bitmask(int cpu, cpumask_var_t tmp_cpumask) +{ + u32 cache_id = get_cpu_cacheinfo_id(cpu, 3); + + lockdep_assert_cpus_held(); + + return mpam_get_cpumask_from_cache_id(cache_id, 3, tmp_cpumask); +} + +/* + * topology_matches_l3() - Is the provided class the same shape as L3 + * @victim: The class we'd like to pretend is L3. + * + * resctrl expects all the world's a Xeon, and all counters are on the + * L3. We allow some mapping counters on other classes. This requires + * that the CPU->domain mapping is the same kind of shape. + * + * Using cacheinfo directly would make this work even if resctrl can't + * use the L3 - but cacheinfo can't tell us anything about offline CPUs. + * Using the L3 resctrl domain list also depends on CPUs being online. + * Using the mpam_class we picked for L3 so we can use its domain list + * assumes that there are MPAM controls on the L3. + * Instead, this path eventually uses the mpam_get_cpumask_from_cache_id() + * helper which can tell us about offline CPUs ... but getting the cache_id + * to start with relies on at least one CPU per L3 cache being online at + * boot. + * + * Walk the victim component list and compare the affinity mask with the + * corresponding L3. The topology matches if each victim:component's affinity + * mask is the same as the CPU's corresponding L3's. These lists/masks are + * computed from firmware tables so don't change at runtime. + */ +static bool topology_matches_l3(struct mpam_class *victim) +{ + int cpu, err; + struct mpam_component *victim_iter; + + lockdep_assert_cpus_held(); + + cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL; + if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) + return false; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(victim_iter, &victim->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_empty(&victim_iter->affinity)) { + pr_debug("class %u has CPU-less component %u - can't match L3!\n", + victim->level, victim_iter->comp_id); + return false; + } + + cpu = cpumask_any_and(&victim_iter->affinity, cpu_online_mask); + if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) + return false; + + cpumask_clear(tmp_cpumask); + err = find_l3_equivalent_bitmask(cpu, tmp_cpumask); + if (err) { + pr_debug("Failed to find L3's equivalent component to class %u component %u\n", + victim->level, victim_iter->comp_id); + return false; + } + + /* Any differing bits in the affinity mask? */ + if (!cpumask_equal(tmp_cpumask, &victim_iter->affinity)) { + pr_debug("class %u component %u has Mismatched CPU mask with L3 equivalent\n" + "L3:%*pbl != victim:%*pbl\n", + victim->level, victim_iter->comp_id, + cpumask_pr_args(tmp_cpumask), + cpumask_pr_args(&victim_iter->affinity)); + + return false; + } + } + + return true; +} + +/* + * Test if the traffic for a class matches that at egress from the L3. For + * MSC at memory controllers this is only possible if there is a single L3 + * as otherwise the counters at the memory can include bandwidth from the + * non-local L3. + */ +static bool traffic_matches_l3(struct mpam_class *class) +{ + int err, cpu; + + lockdep_assert_cpus_held(); + + if (class->type == MPAM_CLASS_CACHE && class->level == 3) + return true; + + if (class->type == MPAM_CLASS_CACHE && class->level != 3) { + pr_debug("class %u is a different cache from L3\n", class->level); + return false; + } + + if (class->type != MPAM_CLASS_MEMORY) { + pr_debug("class %u is neither of type cache or memory\n", class->level); + return false; + } + + cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL; + if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) { + pr_debug("cpumask allocation failed\n"); + return false; + } + + cpu = cpumask_any_and(&class->affinity, cpu_online_mask); + err = find_l3_equivalent_bitmask(cpu, tmp_cpumask); + if (err) { + pr_debug("Failed to find L3 downstream to cpu %d\n", cpu); + return false; + } + + if (!cpumask_equal(tmp_cpumask, cpu_possible_mask)) { + pr_debug("There is more than one L3\n"); + return false; + } + + /* Be strict; the traffic might stop in the intermediate cache. */ + if (get_cpu_cacheinfo_id(cpu, 4) != -1) { + pr_debug("L3 isn't the last level of cache\n"); + return false; + } + + if (num_possible_nodes() > 1) { + pr_debug("There is more than one numa node\n"); + return false; + } + +#ifdef CONFIG_HMEM_REPORTING + if (node_devices[cpu_to_node(cpu)]->cache_dev) { + pr_debug("There is a memory side cache\n"); + return false; + } +#endif + + return true; +} + /* Test whether we can export MPAM_CLASS_CACHE:{2,3}? */ static void mpam_resctrl_pick_caches(void) { @@ -358,9 +539,68 @@ static void mpam_resctrl_pick_caches(void) } } +static void mpam_resctrl_pick_mba(void) +{ + struct mpam_class *class, *candidate_class = NULL; + struct mpam_resctrl_res *res; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) { + struct mpam_props *cprops = &class->props; + + if (class->level != 3 && class->type == MPAM_CLASS_CACHE) { + pr_debug("class %u is a cache but not the L3\n", class->level); + continue; + } + + if (!class_has_usable_mba(cprops)) { + pr_debug("class %u has no bandwidth control\n", + class->level); + continue; + } + + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) { + pr_debug("class %u has missing CPUs\n", class->level); + continue; + } + + if (!topology_matches_l3(class)) { + pr_debug("class %u topology doesn't match L3\n", + class->level); + continue; + } + + if (!traffic_matches_l3(class)) { + pr_debug("class %u traffic doesn't match L3 egress\n", + class->level); + continue; + } + + /* + * Pick a resource to be MBA that as close as possible to + * the L3. mbm_total counts the bandwidth leaving the L3 + * cache and MBA should correspond as closely as possible + * for proper operation of mba_sc. + */ + if (!candidate_class || class->level < candidate_class->level) + candidate_class = class; + } + + if (candidate_class) { + pr_debug("selected class %u to back MBA\n", + candidate_class->level); + res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; + res->class = candidate_class; + } +} + static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) { struct mpam_class *class = res->class; + struct mpam_props *cprops = &class->props; struct rdt_resource *r = &res->resctrl_res; switch (r->rid) { @@ -392,6 +632,19 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) r->cache.shareable_bits = resctrl_get_default_ctrl(r); r->alloc_capable = true; break; + case RDT_RESOURCE_MBA: + r->schema_fmt = RESCTRL_SCHEMA_RANGE; + r->ctrl_scope = RESCTRL_L3_CACHE; + + r->membw.delay_linear = true; + r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; + r->membw.min_bw = get_mba_min(cprops); + r->membw.max_bw = MAX_MBA_BW; + r->membw.bw_gran = get_mba_granularity(cprops); + + r->name = "MB"; + r->alloc_capable = true; + break; default: return -EINVAL; } @@ -406,7 +659,17 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) if (class->type == MPAM_CLASS_CACHE) return comp->comp_id; - /* TODO: repaint domain ids to match the L3 domain ids */ + if (topology_matches_l3(class)) { + /* Use the corresponding L3 component ID as the domain ID */ + int id = get_cpu_cacheinfo_id(cpu, 3); + + /* Implies topology_matches_l3() made a mistake */ + if (WARN_ON_ONCE(id == -1)) + return comp->comp_id; + + return id; + } + /* Otherwise, expose the ID used by the firmware table code. */ return comp->comp_id; } @@ -446,6 +709,12 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, case RDT_RESOURCE_L3: configured_by = mpam_feat_cpor_part; break; + case RDT_RESOURCE_MBA: + if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { + configured_by = mpam_feat_mbw_max; + break; + } + fallthrough; default: return resctrl_get_default_ctrl(r); } @@ -457,6 +726,8 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, switch (configured_by) { case mpam_feat_cpor_part: return cfg->cpbm; + case mpam_feat_mbw_max: + return mbw_max_to_percent(cfg->mbw_max, cprops); default: return resctrl_get_default_ctrl(r); } @@ -504,6 +775,13 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, cfg.cpbm = cfg_val; mpam_set_feature(mpam_feat_cpor_part, &cfg); break; + case RDT_RESOURCE_MBA: + if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { + cfg.mbw_max = percent_to_mbw_max(cfg_val, cprops); + mpam_set_feature(mpam_feat_mbw_max, &cfg); + break; + } + fallthrough; default: return -EINVAL; } @@ -775,6 +1053,7 @@ int mpam_resctrl_setup(void) /* Find some classes to use for controls */ mpam_resctrl_pick_caches(); + mpam_resctrl_pick_mba(); /* Initialise the resctrl structures from the classes */ for_each_mpam_resctrl_control(res, rid) { From 5dc8f73eaa5dfccb229b9a25c797720e6379f8e0 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 13 Mar 2026 14:46:03 +0000 Subject: [PATCH 066/109] arm_mpam: resctrl: Add kunit test for control format conversions resctrl specifies the format of the control schemes, and these don't match the hardware. Some of the conversions are a bit hairy - add some kunit tests. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Signed-off-by: Dave Martin [morse: squashed enough of Dave's fixes in here that it's his patch now!] Signed-off-by: Ben Horgan Signed-off-by: James Morse --- drivers/resctrl/mpam_resctrl.c | 4 + drivers/resctrl/test_mpam_resctrl.c | 315 ++++++++++++++++++++++++++++ 2 files changed, 319 insertions(+) create mode 100644 drivers/resctrl/test_mpam_resctrl.c diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 60d111f7abfd..f8d4666fbaa8 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1092,3 +1092,7 @@ static int __init __cacheinfo_ready(void) return 0; } device_initcall_sync(__cacheinfo_ready); + +#ifdef CONFIG_MPAM_KUNIT_TEST +#include "test_mpam_resctrl.c" +#endif diff --git a/drivers/resctrl/test_mpam_resctrl.c b/drivers/resctrl/test_mpam_resctrl.c new file mode 100644 index 000000000000..b93d6ad87e43 --- /dev/null +++ b/drivers/resctrl/test_mpam_resctrl.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. +/* This file is intended to be included into mpam_resctrl.c */ + +#include +#include +#include +#include +#include + +struct percent_value_case { + u8 pc; + u8 width; + u16 value; +}; + +/* + * Mysterious inscriptions taken from the union of ARM DDI 0598D.b, + * "Arm Architecture Reference Manual Supplement - Memory System + * Resource Partitioning and Monitoring (MPAM), for A-profile + * architecture", Section 9.8, "About the fixed-point fractional + * format" (exact percentage entries only) and ARM IHI0099B.a + * "MPAM system component specification", Section 9.3, + * "The fixed-point fractional format": + */ +static const struct percent_value_case percent_value_cases[] = { + /* Architectural cases: */ + { 1, 8, 1 }, { 1, 12, 0x27 }, { 1, 16, 0x28e }, + { 25, 8, 0x3f }, { 25, 12, 0x3ff }, { 25, 16, 0x3fff }, + { 33, 8, 0x53 }, { 33, 12, 0x546 }, { 33, 16, 0x5479 }, + { 35, 8, 0x58 }, { 35, 12, 0x598 }, { 35, 16, 0x5998 }, + { 45, 8, 0x72 }, { 45, 12, 0x732 }, { 45, 16, 0x7332 }, + { 50, 8, 0x7f }, { 50, 12, 0x7ff }, { 50, 16, 0x7fff }, + { 52, 8, 0x84 }, { 52, 12, 0x850 }, { 52, 16, 0x851d }, + { 55, 8, 0x8b }, { 55, 12, 0x8cb }, { 55, 16, 0x8ccb }, + { 58, 8, 0x93 }, { 58, 12, 0x946 }, { 58, 16, 0x9479 }, + { 75, 8, 0xbf }, { 75, 12, 0xbff }, { 75, 16, 0xbfff }, + { 80, 8, 0xcb }, { 80, 12, 0xccb }, { 80, 16, 0xcccb }, + { 88, 8, 0xe0 }, { 88, 12, 0xe13 }, { 88, 16, 0xe146 }, + { 95, 8, 0xf2 }, { 95, 12, 0xf32 }, { 95, 16, 0xf332 }, + { 100, 8, 0xff }, { 100, 12, 0xfff }, { 100, 16, 0xffff }, +}; + +static void test_percent_value_desc(const struct percent_value_case *param, + char *desc) +{ + snprintf(desc, KUNIT_PARAM_DESC_SIZE, + "pc=%d, width=%d, value=0x%.*x\n", + param->pc, param->width, + DIV_ROUND_UP(param->width, 4), param->value); +} + +KUNIT_ARRAY_PARAM(test_percent_value, percent_value_cases, + test_percent_value_desc); + +struct percent_value_test_info { + u32 pc; /* result of value-to-percent conversion */ + u32 value; /* result of percent-to-value conversion */ + u32 max_value; /* maximum raw value allowed by test params */ + unsigned int shift; /* promotes raw testcase value to 16 bits */ +}; + +/* + * Convert a reference percentage to a fixed-point MAX value and + * vice-versa, based on param (not test->param_value!) + */ +static void __prepare_percent_value_test(struct kunit *test, + struct percent_value_test_info *res, + const struct percent_value_case *param) +{ + struct mpam_props fake_props = { }; + + /* Reject bogus test parameters that would break the tests: */ + KUNIT_ASSERT_GE(test, param->width, 1); + KUNIT_ASSERT_LE(test, param->width, 16); + KUNIT_ASSERT_LT(test, param->value, 1 << param->width); + + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + fake_props.bwa_wd = param->width; + + res->shift = 16 - param->width; + res->max_value = GENMASK_U32(param->width - 1, 0); + res->value = percent_to_mbw_max(param->pc, &fake_props); + res->pc = mbw_max_to_percent(param->value << res->shift, &fake_props); +} + +static void test_get_mba_granularity(struct kunit *test) +{ + int ret; + struct mpam_props fake_props = { }; + + /* Use MBW_MAX */ + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + + fake_props.bwa_wd = 0; + KUNIT_EXPECT_FALSE(test, mba_class_use_mbw_max(&fake_props)); + + fake_props.bwa_wd = 1; + KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props)); + + /* Architectural maximum: */ + fake_props.bwa_wd = 16; + KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props)); + + /* No usable control... */ + fake_props.bwa_wd = 0; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 0); + + fake_props.bwa_wd = 1; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 50); /* DIV_ROUND_UP(100, 1 << 1)% = 50% */ + + fake_props.bwa_wd = 2; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 25); /* DIV_ROUND_UP(100, 1 << 2)% = 25% */ + + fake_props.bwa_wd = 3; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 13); /* DIV_ROUND_UP(100, 1 << 3)% = 13% */ + + fake_props.bwa_wd = 6; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 2); /* DIV_ROUND_UP(100, 1 << 6)% = 2% */ + + fake_props.bwa_wd = 7; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 7)% = 1% */ + + /* Granularity saturates at 1% */ + fake_props.bwa_wd = 16; /* architectural maximum */ + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 16)% = 1% */ +} + +static void test_mbw_max_to_percent(struct kunit *test) +{ + const struct percent_value_case *param = test->param_value; + struct percent_value_test_info res; + + /* + * Since the reference values in percent_value_cases[] all + * correspond to exact percentages, round-to-nearest will + * always give the exact percentage back when the MPAM max + * value has precision of 0.5% or finer. (Always true for the + * reference data, since they all specify 8 bits or more of + * precision. + * + * So, keep it simple and demand an exact match: + */ + __prepare_percent_value_test(test, &res, param); + KUNIT_EXPECT_EQ(test, res.pc, param->pc); +} + +static void test_percent_to_mbw_max(struct kunit *test) +{ + const struct percent_value_case *param = test->param_value; + struct percent_value_test_info res; + + __prepare_percent_value_test(test, &res, param); + + KUNIT_EXPECT_GE(test, res.value, param->value << res.shift); + KUNIT_EXPECT_LE(test, res.value, (param->value + 1) << res.shift); + KUNIT_EXPECT_LE(test, res.value, res.max_value << res.shift); + + /* No flexibility allowed for 0% and 100%! */ + + if (param->pc == 0) + KUNIT_EXPECT_EQ(test, res.value, 0); + + if (param->pc == 100) + KUNIT_EXPECT_EQ(test, res.value, res.max_value << res.shift); +} + +static const void *test_all_bwa_wd_gen_params(struct kunit *test, const void *prev, + char *desc) +{ + uintptr_t param = (uintptr_t)prev; + + if (param > 15) + return NULL; + + param++; + + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "wd=%u\n", (unsigned int)param); + + return (void *)param; +} + +static unsigned int test_get_bwa_wd(struct kunit *test) +{ + uintptr_t param = (uintptr_t)test->param_value; + + KUNIT_ASSERT_GE(test, param, 1); + KUNIT_ASSERT_LE(test, param, 16); + + return param; +} + +static void test_mbw_max_to_percent_limits(struct kunit *test) +{ + struct mpam_props fake_props = {0}; + u32 max_value; + + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + fake_props.bwa_wd = test_get_bwa_wd(test); + max_value = GENMASK(15, 16 - fake_props.bwa_wd); + + KUNIT_EXPECT_EQ(test, mbw_max_to_percent(max_value, &fake_props), + MAX_MBA_BW); + KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props), + get_mba_min(&fake_props)); + + /* + * Rounding policy dependent 0% sanity-check: + * With round-to-nearest, the minimum mbw_max value really + * should map to 0% if there are at least 200 steps. + * (100 steps may be enough for some other rounding policies.) + */ + if (fake_props.bwa_wd >= 8) + KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props), 0); + + if (fake_props.bwa_wd < 8 && + mbw_max_to_percent(0, &fake_props) == 0) + kunit_warn(test, "wd=%d: Testsuite/driver Rounding policy mismatch?", + fake_props.bwa_wd); +} + +/* + * Check that converting a percentage to mbw_max and back again (or, as + * appropriate, vice-versa) always restores the original value: + */ +static void test_percent_max_roundtrip_stability(struct kunit *test) +{ + struct mpam_props fake_props = {0}; + unsigned int shift; + u32 pc, max, pc2, max2; + + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + fake_props.bwa_wd = test_get_bwa_wd(test); + shift = 16 - fake_props.bwa_wd; + + /* + * Converting a valid value from the coarser scale to the finer + * scale and back again must yield the original value: + */ + if (fake_props.bwa_wd >= 7) { + /* More than 100 steps: only test exact pc values: */ + for (pc = get_mba_min(&fake_props); pc <= MAX_MBA_BW; pc++) { + max = percent_to_mbw_max(pc, &fake_props); + pc2 = mbw_max_to_percent(max, &fake_props); + KUNIT_EXPECT_EQ(test, pc2, pc); + } + } else { + /* Fewer than 100 steps: only test exact mbw_max values: */ + for (max = 0; max < 1 << 16; max += 1 << shift) { + pc = mbw_max_to_percent(max, &fake_props); + max2 = percent_to_mbw_max(pc, &fake_props); + KUNIT_EXPECT_EQ(test, max2, max); + } + } +} + +static void test_percent_to_max_rounding(struct kunit *test) +{ + const struct percent_value_case *param = test->param_value; + unsigned int num_rounded_up = 0, total = 0; + struct percent_value_test_info res; + + for (param = percent_value_cases, total = 0; + param < &percent_value_cases[ARRAY_SIZE(percent_value_cases)]; + param++, total++) { + __prepare_percent_value_test(test, &res, param); + if (res.value > param->value << res.shift) + num_rounded_up++; + } + + /* + * The MPAM driver applies a round-to-nearest policy, whereas a + * round-down policy seems to have been applied in the + * reference table from which the test vectors were selected. + * + * For a large and well-distributed suite of test vectors, + * about half should be rounded up and half down compared with + * the reference table. The actual test vectors are few in + * number and probably not very well distributed however, so + * tolerate a round-up rate of between 1/4 and 3/4 before + * crying foul: + */ + + kunit_info(test, "Round-up rate: %u%% (%u/%u)\n", + DIV_ROUND_CLOSEST(num_rounded_up * 100, total), + num_rounded_up, total); + + KUNIT_EXPECT_GE(test, 4 * num_rounded_up, 1 * total); + KUNIT_EXPECT_LE(test, 4 * num_rounded_up, 3 * total); +} + +static struct kunit_case mpam_resctrl_test_cases[] = { + KUNIT_CASE(test_get_mba_granularity), + KUNIT_CASE_PARAM(test_mbw_max_to_percent, test_percent_value_gen_params), + KUNIT_CASE_PARAM(test_percent_to_mbw_max, test_percent_value_gen_params), + KUNIT_CASE_PARAM(test_mbw_max_to_percent_limits, test_all_bwa_wd_gen_params), + KUNIT_CASE(test_percent_to_max_rounding), + KUNIT_CASE_PARAM(test_percent_max_roundtrip_stability, + test_all_bwa_wd_gen_params), + {} +}; + +static struct kunit_suite mpam_resctrl_test_suite = { + .name = "mpam_resctrl_test_suite", + .test_cases = mpam_resctrl_test_cases, +}; + +kunit_test_suites(&mpam_resctrl_test_suite); From 264c285999fce128fc52743bce582468b26e9f65 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:46:04 +0000 Subject: [PATCH 067/109] arm_mpam: resctrl: Add monitor initialisation and domain boilerplate Add the boilerplate that tells resctrl about the mpam monitors that are available. resctrl expects all (non-telemetry) monitors to be on the L3 and so advertise them there and invent an L3 resctrl resource if required. The L3 cache itself has to exist as the cache ids are used as the domain ids. Bring the resctrl monitor domains online and offline based on the cpus they contain. Support for specific monitor types is left to later. Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Jonathan Cameron Signed-off-by: Ben Horgan Reviewed-by: Gavin Shan Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Jesse Chick Signed-off-by: James Morse --- drivers/resctrl/mpam_internal.h | 15 +++ drivers/resctrl/mpam_resctrl.c | 231 ++++++++++++++++++++++++++++++-- 2 files changed, 235 insertions(+), 11 deletions(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 2751eeaba302..301cf5c151bd 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -336,7 +336,16 @@ struct mpam_msc_ris { struct mpam_resctrl_dom { struct mpam_component *ctrl_comp; + + /* + * There is no single mon_comp because different events may be backed + * by different class/components. mon_comp is indexed by the event + * number. + */ + struct mpam_component *mon_comp[QOS_NUM_EVENTS]; + struct rdt_ctrl_domain resctrl_ctrl_dom; + struct rdt_l3_mon_domain resctrl_mon_dom; }; struct mpam_resctrl_res { @@ -345,6 +354,12 @@ struct mpam_resctrl_res { bool cdp_enabled; }; +struct mpam_resctrl_mon { + struct mpam_class *class; + + /* per-class data that resctrl needs will live here */ +}; + static inline int mpam_alloc_csu_mon(struct mpam_class *class) { struct mpam_props *cprops = &class->props; diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index f8d4666fbaa8..e03d0f400993 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -34,6 +34,23 @@ static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; rid < RDT_NUM_RESOURCES; \ rid++, res = &mpam_resctrl_controls[rid]) +/* + * The classes we've picked to map to resctrl events. + * Resctrl believes all the worlds a Xeon, and these are all on the L3. This + * array lets us find the actual class backing the event counters. e.g. + * the only memory bandwidth counters may be on the memory controller, but to + * make use of them, we pretend they are on L3. Restrict the events considered + * to those supported by MPAM. + * Class pointer may be NULL. + */ +#define MPAM_MAX_EVENT QOS_L3_MBM_TOTAL_EVENT_ID +static struct mpam_resctrl_mon mpam_resctrl_counters[MPAM_MAX_EVENT + 1]; + +#define for_each_mpam_resctrl_mon(mon, eventid) \ + for (eventid = QOS_FIRST_EVENT, mon = &mpam_resctrl_counters[eventid]; \ + eventid <= MPAM_MAX_EVENT; \ + eventid++, mon = &mpam_resctrl_counters[eventid]) + /* The lock for modifying resctrl's domain lists from cpuhp callbacks. */ static DEFINE_MUTEX(domain_list_lock); @@ -63,6 +80,15 @@ bool resctrl_arch_alloc_capable(void) return false; } +bool resctrl_arch_mon_capable(void) +{ + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct rdt_resource *l3 = &res->resctrl_res; + + /* All monitors are presented as being on the L3 cache */ + return l3->mon_capable; +} + bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid) { return mpam_resctrl_controls[rid].cdp_enabled; @@ -89,6 +115,8 @@ static void resctrl_reset_task_closids(void) int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) { u32 partid_i = RESCTRL_RESERVED_CLOSID, partid_d = RESCTRL_RESERVED_CLOSID; + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct rdt_resource *l3 = &res->resctrl_res; int cpu; if (!IS_ENABLED(CONFIG_EXPERT) && enable) { @@ -110,6 +138,11 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) cdp_enabled = enable; mpam_resctrl_controls[rid].cdp_enabled = enable; + if (enable) + l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx() / 2; + else + l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx(); + /* The mbw_max feature can't hide cdp as it's a per-partid maximum. */ if (cdp_enabled && !mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled) mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = false; @@ -674,6 +707,56 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) return comp->comp_id; } +static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, + enum resctrl_event_id type) +{ + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct rdt_resource *l3 = &res->resctrl_res; + + lockdep_assert_cpus_held(); + + /* + * There also needs to be an L3 cache present. + * The check just requires any online CPU and it can't go offline as we + * hold the cpu lock. + */ + if (get_cpu_cacheinfo_id(raw_smp_processor_id(), 3) == -1) + return 0; + + /* + * If there are no MPAM resources on L3, force it into existence. + * topology_matches_l3() already ensures this looks like the L3. + * The domain-ids will be fixed up by mpam_resctrl_domain_hdr_init(). + */ + if (!res->class) { + pr_warn_once("Faking L3 MSC to enable counters.\n"); + res->class = mpam_resctrl_counters[type].class; + } + + /* + * Called multiple times!, once per event type that has a + * monitoring class. + * Setting name is necessary on monitor only platforms. + */ + l3->name = "L3"; + l3->mon_scope = RESCTRL_L3_CACHE; + + /* + * num-rmid is the upper bound for the number of monitoring groups that + * can exist simultaneously, including the default monitoring group for + * each control group. Hence, advertise the whole rmid_idx space even + * though each control group has its own pmg/rmid space. Unfortunately, + * this does mean userspace needs to know the architecture to correctly + * interpret this value. + */ + l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx(); + + if (resctrl_enable_mon_event(type, false, 0, NULL)) + l3->mon_capable = true; + + return 0; +} + u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type) { @@ -901,11 +984,26 @@ static void mpam_resctrl_domain_insert(struct list_head *list, list_add_tail_rcu(&new->list, pos); } +static struct mpam_component *find_component(struct mpam_class *class, int cpu) +{ + struct mpam_component *comp; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp, &class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_test_cpu(cpu, &comp->affinity)) + return comp; + } + + return NULL; +} + static struct mpam_resctrl_dom * mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) { int err; struct mpam_resctrl_dom *dom; + struct rdt_l3_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; struct mpam_class *class = res->class; struct mpam_component *comp_iter, *ctrl_comp; @@ -945,8 +1043,56 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) } else { pr_debug("Skipped control domain online - no controls\n"); } + + if (r->mon_capable) { + struct mpam_component *any_mon_comp; + struct mpam_resctrl_mon *mon; + enum resctrl_event_id eventid; + + /* + * Even if the monitor domain is backed by a different + * component, the L3 component IDs need to be used... only + * there may be no ctrl_comp for the L3. + * Search each event's class list for a component with + * overlapping CPUs and set up the dom->mon_comp array. + */ + + for_each_mpam_resctrl_mon(mon, eventid) { + struct mpam_component *mon_comp; + + if (!mon->class) + continue; // dummy resource + + mon_comp = find_component(mon->class, cpu); + dom->mon_comp[eventid] = mon_comp; + if (mon_comp) + any_mon_comp = mon_comp; + } + if (!any_mon_comp) { + WARN_ON_ONCE(0); + err = -EFAULT; + goto offline_ctrl_domain; + } + + mon_d = &dom->resctrl_mon_dom; + mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, r->rid, &mon_d->hdr); + mon_d->hdr.type = RESCTRL_MON_DOMAIN; + err = resctrl_online_mon_domain(r, &mon_d->hdr); + if (err) + goto offline_ctrl_domain; + + mpam_resctrl_domain_insert(&r->mon_domains, &mon_d->hdr); + } else { + pr_debug("Skipped monitor domain online - no monitors\n"); + } + return dom; +offline_ctrl_domain: + if (r->alloc_capable) { + mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); + resctrl_offline_ctrl_domain(r, ctrl_d); + } free_domain: kfree(dom); dom = ERR_PTR(err); @@ -954,6 +1100,35 @@ free_domain: return dom; } +/* + * We know all the monitors are associated with the L3, even if there are no + * controls and therefore no control component. Find the cache-id for the CPU + * and use that to search for existing resctrl domains. + * This relies on mpam_resctrl_pick_domain_id() using the L3 cache-id + * for anything that is not a cache. + */ +static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) +{ + int cache_id; + struct mpam_resctrl_dom *dom; + struct mpam_resctrl_res *l3 = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + + lockdep_assert_cpus_held(); + + if (!l3->class) + return NULL; + cache_id = get_cpu_cacheinfo_id(cpu, 3); + if (cache_id < 0) + return NULL; + + list_for_each_entry_rcu(dom, &l3->resctrl_res.mon_domains, resctrl_mon_dom.hdr.list) { + if (dom->resctrl_mon_dom.hdr.id == cache_id) + return dom; + } + + return NULL; +} + static struct mpam_resctrl_dom * mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) { @@ -967,7 +1142,11 @@ mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) return dom; } - return NULL; + if (r->rid != RDT_RESOURCE_L3) + return NULL; + + /* Search the mon domain list too - needed on monitor only platforms. */ + return mpam_resctrl_get_mon_domain_from_cpu(cpu); } int mpam_resctrl_online_cpu(unsigned int cpu) @@ -994,6 +1173,11 @@ int mpam_resctrl_online_cpu(unsigned int cpu) mpam_resctrl_online_domain_hdr(cpu, &ctrl_d->hdr); } + if (r->mon_capable) { + struct rdt_l3_mon_domain *mon_d = &dom->resctrl_mon_dom; + + mpam_resctrl_online_domain_hdr(cpu, &mon_d->hdr); + } } } @@ -1012,8 +1196,9 @@ void mpam_resctrl_offline_cpu(unsigned int cpu) guard(mutex)(&domain_list_lock); for_each_mpam_resctrl_control(res, rid) { struct mpam_resctrl_dom *dom; + struct rdt_l3_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; - bool ctrl_dom_empty; + bool ctrl_dom_empty, mon_dom_empty; struct rdt_resource *r = &res->resctrl_res; if (!res->class) @@ -1032,7 +1217,16 @@ void mpam_resctrl_offline_cpu(unsigned int cpu) ctrl_dom_empty = true; } - if (ctrl_dom_empty) + if (r->mon_capable) { + mon_d = &dom->resctrl_mon_dom; + mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); + if (mon_dom_empty) + resctrl_offline_mon_domain(&res->resctrl_res, &mon_d->hdr); + } else { + mon_dom_empty = true; + } + + if (ctrl_dom_empty && mon_dom_empty) kfree(dom); } } @@ -1042,12 +1236,15 @@ int mpam_resctrl_setup(void) int err = 0; struct mpam_resctrl_res *res; enum resctrl_res_level rid; + struct mpam_resctrl_mon *mon; + enum resctrl_event_id eventid; wait_event(wait_cacheinfo_ready, cacheinfo_ready); cpus_read_lock(); for_each_mpam_resctrl_control(res, rid) { INIT_LIST_HEAD_RCU(&res->resctrl_res.ctrl_domains); + INIT_LIST_HEAD_RCU(&res->resctrl_res.mon_domains); res->resctrl_res.rid = rid; } @@ -1063,25 +1260,37 @@ int mpam_resctrl_setup(void) err = mpam_resctrl_control_init(res); if (err) { pr_debug("Failed to initialise rid %u\n", rid); - break; + goto internal_error; } } - cpus_read_unlock(); - if (err) { - pr_debug("Internal error %d - resctrl not supported\n", err); - return err; + for_each_mpam_resctrl_mon(mon, eventid) { + if (!mon->class) + continue; // dummy resource + + err = mpam_resctrl_monitor_init(mon, eventid); + if (err) { + pr_debug("Failed to initialise event %u\n", eventid); + goto internal_error; + } } - if (!resctrl_arch_alloc_capable()) { - pr_debug("No alloc(%u) found - resctrl not supported\n", - resctrl_arch_alloc_capable()); + cpus_read_unlock(); + + if (!resctrl_arch_alloc_capable() && !resctrl_arch_mon_capable()) { + pr_debug("No alloc(%u) or monitor(%u) found - resctrl not supported\n", + resctrl_arch_alloc_capable(), resctrl_arch_mon_capable()); return -EOPNOTSUPP; } /* TODO: call resctrl_init() */ return 0; + +internal_error: + cpus_read_unlock(); + pr_debug("Internal error %d - resctrl not supported\n", err); + return err; } static int __init __cacheinfo_ready(void) From 1458c4f053355f88cc5d190ca02243d2c60fa010 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:05 +0000 Subject: [PATCH 068/109] arm_mpam: resctrl: Add support for csu counters resctrl exposes a counter via a file named llc_occupancy. This isn't really a counter as its value goes up and down, this is a snapshot of the cache storage usage monitor. Add some picking code which will only find an L3. The resctrl counter file is called llc_occupancy but we don't check it is the last one as it is already identified as L3. Tested-by: Shaopeng Tan Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Gavin Shan Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Dave Martin Signed-off-by: Dave Martin Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- drivers/resctrl/mpam_resctrl.c | 83 ++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index e03d0f400993..07bb20a01b38 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -311,6 +311,28 @@ static bool class_has_usable_mba(struct mpam_props *cprops) return mba_class_use_mbw_max(cprops); } +static bool cache_has_usable_csu(struct mpam_class *class) +{ + struct mpam_props *cprops; + + if (!class) + return false; + + cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) + return false; + + /* + * CSU counters settle on the value, so we can get away with + * having only one. + */ + if (!cprops->num_csu_mon) + return false; + + return true; +} + /* * Calculate the worst-case percentage change from each implemented step * in the control. @@ -630,6 +652,64 @@ static void mpam_resctrl_pick_mba(void) } } +static void counter_update_class(enum resctrl_event_id evt_id, + struct mpam_class *class) +{ + struct mpam_class *existing_class = mpam_resctrl_counters[evt_id].class; + + if (existing_class) { + if (class->level == 3) { + pr_debug("Existing class is L3 - L3 wins\n"); + return; + } + + if (existing_class->level < class->level) { + pr_debug("Existing class is closer to L3, %u versus %u - closer is better\n", + existing_class->level, class->level); + return; + } + } + + mpam_resctrl_counters[evt_id].class = class; +} + +static void mpam_resctrl_pick_counters(void) +{ + struct mpam_class *class; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) { + /* The name of the resource is L3... */ + if (class->type == MPAM_CLASS_CACHE && class->level != 3) { + pr_debug("class %u is a cache but not the L3", class->level); + continue; + } + + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) { + pr_debug("class %u does not cover all CPUs", + class->level); + continue; + } + + if (cache_has_usable_csu(class)) { + pr_debug("class %u has usable CSU", + class->level); + + /* CSU counters only make sense on a cache. */ + switch (class->type) { + case MPAM_CLASS_CACHE: + counter_update_class(QOS_L3_OCCUP_EVENT_ID, class); + break; + default: + break; + } + } + } +} + static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) { struct mpam_class *class = res->class; @@ -1264,6 +1344,9 @@ int mpam_resctrl_setup(void) } } + /* Find some classes to use for monitors */ + mpam_resctrl_pick_counters(); + for_each_mpam_resctrl_mon(mon, eventid) { if (!mon->class) continue; // dummy resource From 2a3c79c61539779a09928893518c8286d7774b54 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:06 +0000 Subject: [PATCH 069/109] arm_mpam: resctrl: Allow resctrl to allocate monitors When resctrl wants to read a domain's 'QOS_L3_OCCUP', it needs to allocate a monitor on the corresponding resource. Monitors are allocated by class instead of component. Add helpers to allocate a CSU monitor. These helper return an out of range value for MBM counters. Allocating a montitor context is expected to block until hardware resources become available. This only makes sense for QOS_L3_OCCUP as unallocated MBM counters are losing data. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- drivers/resctrl/mpam_internal.h | 14 ++++++- drivers/resctrl/mpam_resctrl.c | 67 +++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 5 +++ 3 files changed, 85 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 301cf5c151bd..85b2b9926360 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -29,6 +29,14 @@ struct platform_device; #define PACKED_FOR_KUNIT #endif +/* + * This 'mon' values must not alias an actual monitor, so must be larger than + * U16_MAX, but not be confused with an errno value, so smaller than + * (u32)-SZ_4K. + * USE_PRE_ALLOCATED is used to avoid confusion with an actual monitor. + */ +#define USE_PRE_ALLOCATED (U16_MAX + 1) + static inline bool mpam_is_enabled(void) { return static_branch_likely(&mpam_enabled); @@ -216,7 +224,11 @@ enum mon_filter_options { }; struct mon_cfg { - u16 mon; + /* + * mon must be large enough to hold out of range values like + * USE_PRE_ALLOCATED + */ + u32 mon; u8 pmg; bool match_pmg; bool csu_exclude_clean; diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 07bb20a01b38..9682ffb15184 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -22,6 +22,8 @@ #include "mpam_internal.h" +DECLARE_WAIT_QUEUE_HEAD(resctrl_mon_ctx_waiters); + /* * The classes we've picked to map to resctrl resources, wrapped * in with their resctrl structure. @@ -289,6 +291,71 @@ struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) return &mpam_resctrl_controls[l].resctrl_res; } +static int resctrl_arch_mon_ctx_alloc_no_wait(enum resctrl_event_id evtid) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + + if (!mon->class) + return -EINVAL; + + switch (evtid) { + case QOS_L3_OCCUP_EVENT_ID: + /* With CDP, one monitor gets used for both code/data reads */ + return mpam_alloc_csu_mon(mon->class); + case QOS_L3_MBM_LOCAL_EVENT_ID: + case QOS_L3_MBM_TOTAL_EVENT_ID: + return USE_PRE_ALLOCATED; + default: + return -EOPNOTSUPP; + } +} + +void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, + enum resctrl_event_id evtid) +{ + DEFINE_WAIT(wait); + int *ret; + + ret = kmalloc_obj(*ret); + if (!ret) + return ERR_PTR(-ENOMEM); + + do { + prepare_to_wait(&resctrl_mon_ctx_waiters, &wait, + TASK_INTERRUPTIBLE); + *ret = resctrl_arch_mon_ctx_alloc_no_wait(evtid); + if (*ret == -ENOSPC) + schedule(); + } while (*ret == -ENOSPC && !signal_pending(current)); + finish_wait(&resctrl_mon_ctx_waiters, &wait); + + return ret; +} + +static void resctrl_arch_mon_ctx_free_no_wait(enum resctrl_event_id evtid, + u32 mon_idx) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + + if (!mon->class) + return; + + if (evtid == QOS_L3_OCCUP_EVENT_ID) + mpam_free_csu_mon(mon->class, mon_idx); + + wake_up(&resctrl_mon_ctx_waiters); +} + +void resctrl_arch_mon_ctx_free(struct rdt_resource *r, + enum resctrl_event_id evtid, void *arch_mon_ctx) +{ + u32 mon_idx = *(u32 *)arch_mon_ctx; + + kfree(arch_mon_ctx); + + resctrl_arch_mon_ctx_free_no_wait(evtid, mon_idx); +} + static bool cache_has_usable_cpor(struct mpam_class *class) { struct mpam_props *cprops = &class->props; diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 7d23c90f077d..e1461e32af75 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -5,6 +5,7 @@ #define __LINUX_ARM_MPAM_H #include +#include #include struct mpam_msc; @@ -62,6 +63,10 @@ u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid); void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid); u32 resctrl_arch_system_num_rmid_idx(void); +struct rdt_resource; +void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); +void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. From fb56b29932ca276df268806ad52ed80f40f99a6e Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:07 +0000 Subject: [PATCH 070/109] arm_mpam: resctrl: Add resctrl_arch_rmid_read() resctrl uses resctrl_arch_rmid_read() to read counters. CDP emulation means the counter may need reading in three different ways. The helpers behind the resctrl_arch_ functions will be re-used for the ABMC equivalent functions. Add the rounding helper for checking monitor values while we're here. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Jesse Chick Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- drivers/resctrl/mpam_resctrl.c | 82 ++++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 5 +++ 2 files changed, 87 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 9682ffb15184..9a15ddd340f7 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -356,6 +356,88 @@ void resctrl_arch_mon_ctx_free(struct rdt_resource *r, resctrl_arch_mon_ctx_free_no_wait(evtid, mon_idx); } +static int __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + enum mpam_device_features mon_type, + int mon_idx, + enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, u64 *val) +{ + struct mon_cfg cfg; + + if (!mpam_is_enabled()) + return -EINVAL; + + /* Shift closid to account for CDP */ + closid = resctrl_get_config_index(closid, cdp_type); + + if (irqs_disabled()) { + /* Check if we can access this domain without an IPI */ + return -EIO; + } + + cfg = (struct mon_cfg) { + .mon = mon_idx, + .match_pmg = true, + .partid = closid, + .pmg = rmid, + }; + + return mpam_msmon_read(mon_comp, &cfg, mon_type, val); +} + +static int read_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + enum mpam_device_features mon_type, + int mon_idx, u32 closid, u32 rmid, u64 *val) +{ + if (cdp_enabled) { + u64 code_val = 0, data_val = 0; + int err; + + err = __read_mon(mon, mon_comp, mon_type, mon_idx, + CDP_CODE, closid, rmid, &code_val); + if (err) + return err; + + err = __read_mon(mon, mon_comp, mon_type, mon_idx, + CDP_DATA, closid, rmid, &data_val); + if (err) + return err; + + *val += code_val + data_val; + return 0; + } + + return __read_mon(mon, mon_comp, mon_type, mon_idx, + CDP_NONE, closid, rmid, val); +} + +/* MBWU when not in ABMC mode (not supported), and CSU counters. */ +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, + u32 closid, u32 rmid, enum resctrl_event_id eventid, + void *arch_priv, u64 *val, void *arch_mon_ctx) +{ + struct mpam_resctrl_dom *l3_dom; + struct mpam_component *mon_comp; + u32 mon_idx = *(u32 *)arch_mon_ctx; + enum mpam_device_features mon_type; + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; + + resctrl_arch_rmid_read_context_check(); + + if (eventid >= QOS_NUM_EVENTS || !mon->class) + return -EINVAL; + + l3_dom = container_of(hdr, struct mpam_resctrl_dom, resctrl_mon_dom.hdr); + mon_comp = l3_dom->mon_comp[eventid]; + + if (eventid != QOS_L3_OCCUP_EVENT_ID) + return -EINVAL; + + mon_type = mpam_feat_msmon_csu; + + return read_mon_cdp_safe(mon, mon_comp, mon_type, mon_idx, + closid, rmid, val); +} + static bool cache_has_usable_cpor(struct mpam_class *class) { struct mpam_props *cprops = &class->props; diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index e1461e32af75..86d5e326d2bd 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -67,6 +67,11 @@ struct rdt_resource; void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); +static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) +{ + return val; +} + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. From 49b04e401825431529e866470d8d2dcd8e9ef058 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:08 +0000 Subject: [PATCH 071/109] arm_mpam: resctrl: Update the rmid reallocation limit resctrl's limbo code needs to be told when the data left in a cache is small enough for the partid+pmg value to be re-allocated. x86 uses the cache size divided by the number of rmid users the cache may have. Do the same, but for the smallest cache, and with the number of partid-and-pmg users. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- drivers/resctrl/mpam_resctrl.c | 39 ++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 9a15ddd340f7..f82fff3519df 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -438,6 +438,42 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, closid, rmid, val); } +/* + * The rmid realloc threshold should be for the smallest cache exposed to + * resctrl. + */ +static int update_rmid_limits(struct mpam_class *class) +{ + u32 num_unique_pmg = resctrl_arch_system_num_rmid_idx(); + struct mpam_props *cprops = &class->props; + struct cacheinfo *ci; + + lockdep_assert_cpus_held(); + + if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) + return 0; + + /* + * Assume cache levels are the same size for all CPUs... + * The check just requires any online CPU and it can't go offline as we + * hold the cpu lock. + */ + ci = get_cpu_cacheinfo_level(raw_smp_processor_id(), class->level); + if (!ci || ci->size == 0) { + pr_debug("Could not read cache size for class %u\n", + class->level); + return -EINVAL; + } + + if (!resctrl_rmid_realloc_limit || + ci->size < resctrl_rmid_realloc_limit) { + resctrl_rmid_realloc_limit = ci->size; + resctrl_rmid_realloc_threshold = ci->size / num_unique_pmg; + } + + return 0; +} + static bool cache_has_usable_cpor(struct mpam_class *class) { struct mpam_props *cprops = &class->props; @@ -850,6 +886,9 @@ static void mpam_resctrl_pick_counters(void) /* CSU counters only make sense on a cache. */ switch (class->type) { case MPAM_CLASS_CACHE: + if (update_rmid_limits(class)) + break; + counter_update_class(QOS_L3_OCCUP_EVENT_ID, class); break; default: From efc775eadce2c6e0921c21d9c29a7b6686022281 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:09 +0000 Subject: [PATCH 072/109] arm_mpam: resctrl: Add empty definitions for assorted resctrl functions A few resctrl features and hooks need to be provided, but aren't needed or supported on MPAM platforms. resctrl has individual hooks to separately enable and disable the closid/partid and rmid/pmg context switching code. For MPAM this is all the same thing, as the value in struct task_struct is used to cache the value that should be written to hardware. arm64's context switching code is enabled once MPAM is usable, but doesn't touch the hardware unless the value has changed. For now event configuration is not supported, and can be turned off by returning 'false' from resctrl_arch_is_evt_configurable(). The new io_alloc feature is not supported either, always return false from the enable helper to indicate and fail the enable. Add this, and empty definitions for the other hooks. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- drivers/resctrl/mpam_resctrl.c | 65 ++++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 9 +++++ 2 files changed, 74 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index f82fff3519df..777ecdc2d0f8 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -91,6 +91,71 @@ bool resctrl_arch_mon_capable(void) return l3->mon_capable; } +bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) +{ + return false; +} + +void resctrl_arch_mon_event_config_read(void *info) +{ +} + +void resctrl_arch_mon_event_config_write(void *info) +{ +} + +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) +{ +} + +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 closid, u32 rmid, enum resctrl_event_id eventid) +{ +} + +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid) +{ +} + +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ +} + +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val) +{ + return -EOPNOTSUPP; +} + +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) +{ + return false; +} + +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) +{ + return -EINVAL; +} + +int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable) +{ + return -EOPNOTSUPP; +} + +bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r) +{ + return false; +} + +void resctrl_arch_pre_mount(void) +{ +} + bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid) { return mpam_resctrl_controls[rid].cdp_enabled; diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 86d5e326d2bd..f92a36187a52 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -67,6 +67,15 @@ struct rdt_resource; void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); +/* + * The CPU configuration for MPAM is cheap to write, and is only written if it + * has changed. No need for fine grained enables. + */ +static inline void resctrl_arch_enable_mon(void) { } +static inline void resctrl_arch_disable_mon(void) { } +static inline void resctrl_arch_enable_alloc(void) { } +static inline void resctrl_arch_disable_alloc(void) { } + static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) { return val; From 4aab135bda1661a795e4fe96418bf840833e1119 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:10 +0000 Subject: [PATCH 073/109] arm64: mpam: Select ARCH_HAS_CPU_RESCTRL Enough MPAM support is present to enable ARCH_HAS_CPU_RESCTRL. Let it rip^Wlink! ARCH_HAS_CPU_RESCTRL indicates resctrl can be enabled. It is enabled by the arch code simply because it has 'arch' in its name. This removes ARM_CPU_RESCTRL as a mimic of X86_CPU_RESCTRL. While here, move the ACPI dependency to the driver's Kconfig file. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Acked-by: Catalin Marinas Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- arch/arm64/Kconfig | 2 +- arch/arm64/include/asm/resctrl.h | 2 ++ drivers/resctrl/Kconfig | 7 +++++++ drivers/resctrl/Makefile | 2 +- 4 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 arch/arm64/include/asm/resctrl.h diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3170c67464fb..41a5b4ef86b4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2017,7 +2017,7 @@ config ARM64_TLB_RANGE config ARM64_MPAM bool "Enable support for MPAM" select ARM64_MPAM_DRIVER - select ACPI_MPAM if ACPI + select ARCH_HAS_CPU_RESCTRL help Memory System Resource Partitioning and Monitoring (MPAM) is an optional extension to the Arm architecture that allows each diff --git a/arch/arm64/include/asm/resctrl.h b/arch/arm64/include/asm/resctrl.h new file mode 100644 index 000000000000..b506e95cf6e3 --- /dev/null +++ b/arch/arm64/include/asm/resctrl.h @@ -0,0 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig index c34e059c6e41..672abea3b03c 100644 --- a/drivers/resctrl/Kconfig +++ b/drivers/resctrl/Kconfig @@ -1,6 +1,7 @@ menuconfig ARM64_MPAM_DRIVER bool "MPAM driver" depends on ARM64 && ARM64_MPAM + select ACPI_MPAM if ACPI help Memory System Resource Partitioning and Monitoring (MPAM) driver for System IP, e.g. caches and memory controllers. @@ -22,3 +23,9 @@ config MPAM_KUNIT_TEST If unsure, say N. endif + +config ARM64_MPAM_RESCTRL_FS + bool + default y if ARM64_MPAM_DRIVER && RESCTRL_FS + select RESCTRL_RMID_DEPENDS_ON_CLOSID + select RESCTRL_ASSIGN_FIXED diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile index 40beaf999582..4f6d0e81f9b8 100644 --- a/drivers/resctrl/Makefile +++ b/drivers/resctrl/Makefile @@ -1,5 +1,5 @@ obj-$(CONFIG_ARM64_MPAM_DRIVER) += mpam.o mpam-y += mpam_devices.o -mpam-$(CONFIG_ARM_CPU_RESCTRL) += mpam_resctrl.o +mpam-$(CONFIG_ARM64_MPAM_RESCTRL_FS) += mpam_resctrl.o ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG) += -DDEBUG From fb481ec08699e9daf08ab839a79ab37b1bcca94d Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:11 +0000 Subject: [PATCH 074/109] arm_mpam: resctrl: Call resctrl_init() on platforms that can support resctrl Now that MPAM links against resctrl, call resctrl_init() to register the filesystem and setup resctrl's structures. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- drivers/resctrl/mpam_devices.c | 32 ++++++++++++++--- drivers/resctrl/mpam_internal.h | 4 +++ drivers/resctrl/mpam_resctrl.c | 63 ++++++++++++++++++++++++++++++++- 3 files changed, 94 insertions(+), 5 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 0c97f7708722..37b31a1cf376 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -73,6 +73,14 @@ static DECLARE_WORK(mpam_broken_work, &mpam_disable); /* When mpam is disabled, the printed reason to aid debugging */ static char *mpam_disable_reason; +/* + * Whether resctrl has been setup. Used by cpuhp in preference to + * mpam_is_enabled(). The disable call after an error interrupt makes + * mpam_is_enabled() false before the cpuhp callbacks are made. + * Reads/writes should hold mpam_cpuhp_state_lock, (or be cpuhp callbacks). + */ +static bool mpam_resctrl_enabled; + /* * An MSC is a physical container for controls and monitors, each identified by * their RIS index. These share a base-address, interrupts and some MMIO @@ -1619,7 +1627,7 @@ static int mpam_cpu_online(unsigned int cpu) mpam_reprogram_msc(msc); } - if (mpam_is_enabled()) + if (mpam_resctrl_enabled) return mpam_resctrl_online_cpu(cpu); return 0; @@ -1665,7 +1673,7 @@ static int mpam_cpu_offline(unsigned int cpu) { struct mpam_msc *msc; - if (mpam_is_enabled()) + if (mpam_resctrl_enabled) mpam_resctrl_offline_cpu(cpu); guard(srcu)(&mpam_srcu); @@ -2526,6 +2534,7 @@ static void mpam_enable_once(void) } static_branch_enable(&mpam_enabled); + mpam_resctrl_enabled = true; mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline, "mpam:online"); @@ -2585,24 +2594,39 @@ static void mpam_reset_class(struct mpam_class *class) void mpam_disable(struct work_struct *ignored) { int idx; + bool do_resctrl_exit; struct mpam_class *class; struct mpam_msc *msc, *tmp; + if (mpam_is_enabled()) + static_branch_disable(&mpam_enabled); + mutex_lock(&mpam_cpuhp_state_lock); if (mpam_cpuhp_state) { cpuhp_remove_state(mpam_cpuhp_state); mpam_cpuhp_state = 0; } + + /* + * Removing the cpuhp state called mpam_cpu_offline() and told resctrl + * all the CPUs are offline. + */ + do_resctrl_exit = mpam_resctrl_enabled; + mpam_resctrl_enabled = false; mutex_unlock(&mpam_cpuhp_state_lock); - static_branch_disable(&mpam_enabled); + if (do_resctrl_exit) + mpam_resctrl_exit(); mpam_unregister_irqs(); idx = srcu_read_lock(&mpam_srcu); list_for_each_entry_srcu(class, &mpam_classes, classes_list, - srcu_read_lock_held(&mpam_srcu)) + srcu_read_lock_held(&mpam_srcu)) { mpam_reset_class(class); + if (do_resctrl_exit) + mpam_resctrl_teardown_class(class); + } srcu_read_unlock(&mpam_srcu, idx); mutex_lock(&mpam_list_lock); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 85b2b9926360..68906c6ebfb0 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -431,12 +431,16 @@ int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, #ifdef CONFIG_RESCTRL_FS int mpam_resctrl_setup(void); +void mpam_resctrl_exit(void); int mpam_resctrl_online_cpu(unsigned int cpu); void mpam_resctrl_offline_cpu(unsigned int cpu); +void mpam_resctrl_teardown_class(struct mpam_class *class); #else static inline int mpam_resctrl_setup(void) { return 0; } +static inline void mpam_resctrl_exit(void) { } static inline int mpam_resctrl_online_cpu(unsigned int cpu) { return 0; } static inline void mpam_resctrl_offline_cpu(unsigned int cpu) { } +static inline void mpam_resctrl_teardown_class(struct mpam_class *class) { } #endif /* CONFIG_RESCTRL_FS */ /* diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 777ecdc2d0f8..a9938006d0e6 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -69,6 +69,12 @@ static bool cdp_enabled; static bool cacheinfo_ready; static DECLARE_WAIT_QUEUE_HEAD(wait_cacheinfo_ready); +/* + * If resctrl_init() succeeded, resctrl_exit() can be used to remove support + * for the filesystem in the event of an error. + */ +static bool resctrl_enabled; + bool resctrl_arch_alloc_capable(void) { struct mpam_resctrl_res *res; @@ -360,6 +366,9 @@ static int resctrl_arch_mon_ctx_alloc_no_wait(enum resctrl_event_id evtid) { struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + if (!mpam_is_enabled()) + return -EINVAL; + if (!mon->class) return -EINVAL; @@ -402,6 +411,9 @@ static void resctrl_arch_mon_ctx_free_no_wait(enum resctrl_event_id evtid, { struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + if (!mpam_is_enabled()) + return; + if (!mon->class) return; @@ -488,6 +500,9 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, resctrl_arch_rmid_read_context_check(); + if (!mpam_is_enabled()) + return -EINVAL; + if (eventid >= QOS_NUM_EVENTS || !mon->class) return -EINVAL; @@ -1162,6 +1177,9 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, lockdep_assert_cpus_held(); lockdep_assert_irqs_enabled(); + if (!mpam_is_enabled()) + return -EINVAL; + /* * No need to check the CPU as mpam_apply_config() doesn't care, and * resctrl_arch_update_domains() relies on this. @@ -1227,6 +1245,9 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) lockdep_assert_cpus_held(); lockdep_assert_irqs_enabled(); + if (!mpam_is_enabled()) + return -EINVAL; + list_for_each_entry_rcu(d, &r->ctrl_domains, hdr.list) { for (enum resctrl_conf_type t = 0; t < CDP_NUM_TYPES; t++) { struct resctrl_staged_config *cfg = &d->staged_config[t]; @@ -1619,7 +1640,11 @@ int mpam_resctrl_setup(void) return -EOPNOTSUPP; } - /* TODO: call resctrl_init() */ + err = resctrl_init(); + if (err) + return err; + + WRITE_ONCE(resctrl_enabled, true); return 0; @@ -1629,6 +1654,42 @@ internal_error: return err; } +void mpam_resctrl_exit(void) +{ + if (!READ_ONCE(resctrl_enabled)) + return; + + WRITE_ONCE(resctrl_enabled, false); + resctrl_exit(); +} + +/* + * The driver is detaching an MSC from this class, if resctrl was using it, + * pull on resctrl_exit(). + */ +void mpam_resctrl_teardown_class(struct mpam_class *class) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + struct mpam_resctrl_mon *mon; + enum resctrl_event_id eventid; + + might_sleep(); + + for_each_mpam_resctrl_control(res, rid) { + if (res->class == class) { + res->class = NULL; + break; + } + } + for_each_mpam_resctrl_mon(mon, eventid) { + if (mon->class == class) { + mon->class = NULL; + break; + } + } +} + static int __init __cacheinfo_ready(void) { cacheinfo_ready = true; From fa7745218c9828ac4849ef62bccad684aec0f422 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Fri, 13 Mar 2026 14:46:12 +0000 Subject: [PATCH 075/109] arm_mpam: Add quirk framework The MPAM specification includes the MPAMF_IIDR, which serves to uniquely identify the MSC implementation through a combination of implementer details, product ID, variant, and revision. Certain hardware issues/errata can be resolved using software workarounds. Introduce a quirk framework to allow workarounds to be enabled based on the MPAMF_IIDR value. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Signed-off-by: Shanker Donthineni Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Co-developed-by: James Morse Signed-off-by: James Morse --- drivers/resctrl/mpam_devices.c | 32 ++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 25 +++++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 37b31a1cf376..e66631f3f732 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -630,6 +630,30 @@ static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc, return ERR_PTR(-ENOENT); } +static const struct mpam_quirk mpam_quirks[] = { + { NULL } /* Sentinel */ +}; + +static void mpam_enable_quirks(struct mpam_msc *msc) +{ + const struct mpam_quirk *quirk; + + for (quirk = &mpam_quirks[0]; quirk->iidr_mask; quirk++) { + int err = 0; + + if (quirk->iidr != (msc->iidr & quirk->iidr_mask)) + continue; + + if (quirk->init) + err = quirk->init(msc, quirk); + + if (err) + continue; + + mpam_set_quirk(quirk->workaround, msc); + } +} + /* * IHI009A.a has this nugget: "If a monitor does not support automatic behaviour * of NRDY, software can use this bit for any purpose" - so hardware might not @@ -864,8 +888,11 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc) /* Grab an IDR value to find out how many RIS there are */ mutex_lock(&msc->part_sel_lock); idr = mpam_msc_read_idr(msc); + msc->iidr = mpam_read_partsel_reg(msc, IIDR); mutex_unlock(&msc->part_sel_lock); + mpam_enable_quirks(msc); + msc->ris_max = FIELD_GET(MPAMF_IDR_RIS_MAX, idr); /* Use these values so partid/pmg always starts with a valid value */ @@ -1972,6 +1999,7 @@ static bool mpam_has_cmax_wd_feature(struct mpam_props *props) * resulting safe value must be compatible with both. When merging values in * the tree, all the aliasing resources must be handled first. * On mismatch, parent is modified. + * Quirks on an MSC will apply to all MSC in that class. */ static void __props_mismatch(struct mpam_props *parent, struct mpam_props *child, bool alias) @@ -2091,6 +2119,7 @@ static void __props_mismatch(struct mpam_props *parent, * nobble the class feature, as we can't configure all the resources. * e.g. The L3 cache is composed of two resources with 13 and 17 portion * bitmaps respectively. + * Quirks on an MSC will apply to all MSC in that class. */ static void __class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc) @@ -2104,6 +2133,9 @@ __class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc) dev_dbg(dev, "Merging features for class:0x%lx &= vmsc:0x%lx\n", (long)cprops->features, (long)vprops->features); + /* Merge quirks */ + class->quirks |= vmsc->msc->quirks; + /* Take the safe value for any common features */ __props_mismatch(cprops, vprops, false); } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 68906c6ebfb0..01858365cd9e 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -85,6 +85,8 @@ struct mpam_msc { u8 pmg_max; unsigned long ris_idxs; u32 ris_max; + u32 iidr; + u16 quirks; /* * error_irq_lock is taken when registering/unregistering the error @@ -216,6 +218,28 @@ struct mpam_props { #define mpam_set_feature(_feat, x) __set_bit(_feat, (x)->features) #define mpam_clear_feature(_feat, x) __clear_bit(_feat, (x)->features) +/* Workaround bits for msc->quirks */ +enum mpam_device_quirks { + MPAM_QUIRK_LAST +}; + +#define mpam_has_quirk(_quirk, x) ((1 << (_quirk) & (x)->quirks)) +#define mpam_set_quirk(_quirk, x) ((x)->quirks |= (1 << (_quirk))) + +struct mpam_quirk { + int (*init)(struct mpam_msc *msc, const struct mpam_quirk *quirk); + + u32 iidr; + u32 iidr_mask; + + enum mpam_device_quirks workaround; +}; + +#define MPAM_IIDR_MATCH_ONE (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0xfff) | \ + FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0xf) | \ + FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0xf) | \ + FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0xfff)) + /* The values for MSMON_CFG_MBWU_FLT.RWBW */ enum mon_filter_options { COUNT_BOTH = 0, @@ -259,6 +283,7 @@ struct mpam_class { struct mpam_props props; u32 nrdy_usec; + u16 quirks; u8 level; enum mpam_class_types type; From 70e81fbedc6570b2397e07a645136af0a0eec907 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Fri, 13 Mar 2026 14:46:13 +0000 Subject: [PATCH 076/109] arm_mpam: Add workaround for T241-MPAM-1 The MPAM bandwidth partitioning controls will not be correctly configured, and hardware will retain default configuration register values, meaning generally that bandwidth will remain unprovisioned. To address the issue, follow the below steps after updating the MBW_MIN and/or MBW_MAX registers. - Perform 64b reads from all 12 bridge MPAM shadow registers at offsets (0x360048 + slice*0x10000 + partid*8). These registers are read-only. - Continue iterating until all 12 shadow register values match in a loop. pr_warn_once if the values fail to match within the loop count 1000. - Perform 64b writes with the value 0x0 to the two spare registers at offsets 0x1b0000 and 0x1c0000. In the hardware, writes to the MPAMCFG_MBW_MAX MPAMCFG_MBW_MIN registers are transformed into broadcast writes to the 12 shadow registers. The final two writes to the spare registers cause a final rank of downstream micro-architectural MPAM registers to be updated from the shadow copies. The intervening loop to read the 12 shadow registers helps avoid a race condition where writes to the spare registers occur before all shadow registers have been updated. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Gavin Shan Signed-off-by: Shanker Donthineni Signed-off-by: Ben Horgan Signed-off-by: James Morse --- Documentation/arch/arm64/silicon-errata.rst | 2 + drivers/resctrl/mpam_devices.c | 88 +++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 9 +++ 3 files changed, 99 insertions(+) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index 4c300caad901..a65620f98e3a 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -247,6 +247,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | NVIDIA | T241 GICv3/4.x | T241-FABRIC-4 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | T241 MPAM | T241-MPAM-1 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index e66631f3f732..b1fe6f171b68 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -29,6 +29,16 @@ #include "mpam_internal.h" +/* Values for the T241 errata workaround */ +#define T241_CHIPS_MAX 4 +#define T241_CHIP_NSLICES 12 +#define T241_SPARE_REG0_OFF 0x1b0000 +#define T241_SPARE_REG1_OFF 0x1c0000 +#define T241_CHIP_ID(phys) FIELD_GET(GENMASK_ULL(44, 43), phys) +#define T241_SHADOW_REG_OFF(sidx, pid) (0x360048 + (sidx) * 0x10000 + (pid) * 8) +#define SMCCC_SOC_ID_T241 0x036b0241 +static void __iomem *t241_scratch_regs[T241_CHIPS_MAX]; + /* * mpam_list_lock protects the SRCU lists when writing. Once the * mpam_enabled key is enabled these lists are read-only, @@ -630,7 +640,45 @@ static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc, return ERR_PTR(-ENOENT); } +static int mpam_enable_quirk_nvidia_t241_1(struct mpam_msc *msc, + const struct mpam_quirk *quirk) +{ + s32 soc_id = arm_smccc_get_soc_id_version(); + struct resource *r; + phys_addr_t phys; + + /* + * A mapping to a device other than the MSC is needed, check + * SOC_ID is NVIDIA T241 chip (036b:0241) + */ + if (soc_id < 0 || soc_id != SMCCC_SOC_ID_T241) + return -EINVAL; + + r = platform_get_resource(msc->pdev, IORESOURCE_MEM, 0); + if (!r) + return -EINVAL; + + /* Find the internal registers base addr from the CHIP ID */ + msc->t241_id = T241_CHIP_ID(r->start); + phys = FIELD_PREP(GENMASK_ULL(45, 44), msc->t241_id) | 0x19000000ULL; + + t241_scratch_regs[msc->t241_id] = ioremap(phys, SZ_8M); + if (WARN_ON_ONCE(!t241_scratch_regs[msc->t241_id])) + return -EINVAL; + + pr_info_once("Enabled workaround for NVIDIA T241 erratum T241-MPAM-1\n"); + + return 0; +} + static const struct mpam_quirk mpam_quirks[] = { + { + /* NVIDIA t241 erratum T241-MPAM-1 */ + .init = mpam_enable_quirk_nvidia_t241_1, + .iidr = MPAM_IIDR_NVIDIA_T241, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = T241_SCRUB_SHADOW_REGS, + }, { NULL } /* Sentinel */ }; @@ -1378,6 +1426,44 @@ static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd) __mpam_write_reg(msc, reg, bm); } +static void mpam_apply_t241_erratum(struct mpam_msc_ris *ris, u16 partid) +{ + int sidx, i, lcount = 1000; + void __iomem *regs; + u64 val0, val; + + regs = t241_scratch_regs[ris->vmsc->msc->t241_id]; + + for (i = 0; i < lcount; i++) { + /* Read the shadow register at index 0 */ + val0 = readq_relaxed(regs + T241_SHADOW_REG_OFF(0, partid)); + + /* Check if all the shadow registers have the same value */ + for (sidx = 1; sidx < T241_CHIP_NSLICES; sidx++) { + val = readq_relaxed(regs + + T241_SHADOW_REG_OFF(sidx, partid)); + if (val != val0) + break; + } + if (sidx == T241_CHIP_NSLICES) + break; + } + + if (i == lcount) + pr_warn_once("t241: inconsistent values in shadow regs"); + + /* Write a value zero to spare registers to take effect of MBW conf */ + writeq_relaxed(0, regs + T241_SPARE_REG0_OFF); + writeq_relaxed(0, regs + T241_SPARE_REG1_OFF); +} + +static void mpam_quirk_post_config_change(struct mpam_msc_ris *ris, u16 partid, + struct mpam_config *cfg) +{ + if (mpam_has_quirk(T241_SCRUB_SHADOW_REGS, ris->vmsc->msc)) + mpam_apply_t241_erratum(ris, partid); +} + /* Called via IPI. Call while holding an SRCU reference */ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, struct mpam_config *cfg) @@ -1457,6 +1543,8 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, mpam_write_partsel_reg(msc, PRI, pri_val); } + mpam_quirk_post_config_change(ris, partid, cfg); + mutex_unlock(&msc->part_sel_lock); } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 01858365cd9e..d9eb342ba222 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -130,6 +130,9 @@ struct mpam_msc { void __iomem *mapped_hwpage; size_t mapped_hwpage_sz; + /* Values only used on some platforms for quirks */ + u32 t241_id; + struct mpam_garbage garbage; }; @@ -220,6 +223,7 @@ struct mpam_props { /* Workaround bits for msc->quirks */ enum mpam_device_quirks { + T241_SCRUB_SHADOW_REGS, MPAM_QUIRK_LAST }; @@ -240,6 +244,11 @@ struct mpam_quirk { FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0xf) | \ FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0xfff)) +#define MPAM_IIDR_NVIDIA_T241 (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0x241) | \ + FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x36b)) + /* The values for MSMON_CFG_MBWU_FLT.RWBW */ enum mon_filter_options { COUNT_BOTH = 0, From a7efe23ed6dd08259ad1b238e9c33bb511666fd4 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Fri, 13 Mar 2026 14:46:14 +0000 Subject: [PATCH 077/109] arm_mpam: Add workaround for T241-MPAM-4 In the T241 implementation of memory-bandwidth partitioning, in the absence of contention for bandwidth, the minimum bandwidth setting can affect the amount of achieved bandwidth. Specifically, the achieved bandwidth in the absence of contention can settle to any value between the values of MPAMCFG_MBW_MIN and MPAMCFG_MBW_MAX. Also, if MPAMCFG_MBW_MIN is set zero (below 0.78125%), once a core enters a throttled state, it will never leave that state. The first issue is not a concern if the MPAM software allows to program MPAMCFG_MBW_MIN through the sysfs interface. This patch ensures program MBW_MIN=1 (0.78125%) whenever MPAMCFG_MBW_MIN=0 is programmed. In the scenario where the resctrl doesn't support the MBW_MIN interface via sysfs, to achieve bandwidth closer to MBW_MAX in the absence of contention, software should configure a relatively narrow gap between MBW_MIN and MBW_MAX. The recommendation is to use a 5% gap to mitigate the problem. Clear the feature MBW_MIN feature from the class to ensure we don't accidentally change behaviour when resctrl adds support for a MBW_MIN interface. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Reviewed-by: Gavin Shan Signed-off-by: Shanker Donthineni Signed-off-by: Ben Horgan Signed-off-by: James Morse --- Documentation/arch/arm64/silicon-errata.rst | 2 + drivers/resctrl/mpam_devices.c | 55 +++++++++++++++++++-- drivers/resctrl/mpam_internal.h | 1 + 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index a65620f98e3a..a4b246655e37 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -249,6 +249,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | NVIDIA | T241 MPAM | T241-MPAM-1 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | T241 MPAM | T241-MPAM-4 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index b1fe6f171b68..142e7ea960e5 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -679,6 +679,12 @@ static const struct mpam_quirk mpam_quirks[] = { .iidr_mask = MPAM_IIDR_MATCH_ONE, .workaround = T241_SCRUB_SHADOW_REGS, }, + { + /* NVIDIA t241 erratum T241-MPAM-4 */ + .iidr = MPAM_IIDR_NVIDIA_T241, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = T241_FORCE_MBW_MIN_TO_ONE, + }, { NULL } /* Sentinel */ }; @@ -1464,6 +1470,37 @@ static void mpam_quirk_post_config_change(struct mpam_msc_ris *ris, u16 partid, mpam_apply_t241_erratum(ris, partid); } +static u16 mpam_wa_t241_force_mbw_min_to_one(struct mpam_props *props) +{ + u16 max_hw_value, min_hw_granule, res0_bits; + + res0_bits = 16 - props->bwa_wd; + max_hw_value = ((1 << props->bwa_wd) - 1) << res0_bits; + min_hw_granule = ~max_hw_value; + + return min_hw_granule + 1; +} + +static u16 mpam_wa_t241_calc_min_from_max(struct mpam_props *props, + struct mpam_config *cfg) +{ + u16 val = 0; + u16 max; + u16 delta = ((5 * MPAMCFG_MBW_MAX_MAX) / 100) - 1; + + if (mpam_has_feature(mpam_feat_mbw_max, cfg)) { + max = cfg->mbw_max; + } else { + /* Resetting. Hence, use the ris specific default. */ + max = GENMASK(15, 16 - props->bwa_wd); + } + + if (max > delta) + val = max - delta; + + return val; +} + /* Called via IPI. Call while holding an SRCU reference */ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, struct mpam_config *cfg) @@ -1504,9 +1541,18 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm); } - if (mpam_has_feature(mpam_feat_mbw_min, rprops) && - mpam_has_feature(mpam_feat_mbw_min, cfg)) - mpam_write_partsel_reg(msc, MBW_MIN, 0); + if (mpam_has_feature(mpam_feat_mbw_min, rprops)) { + u16 val = 0; + + if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, msc)) { + u16 min = mpam_wa_t241_force_mbw_min_to_one(rprops); + + val = mpam_wa_t241_calc_min_from_max(rprops, cfg); + val = max(val, min); + } + + mpam_write_partsel_reg(msc, MBW_MIN, val); + } if (mpam_has_feature(mpam_feat_mbw_max, rprops)) { if (mpam_has_feature(mpam_feat_mbw_max, cfg)) @@ -2288,6 +2334,9 @@ static void mpam_enable_merge_class_features(struct mpam_component *comp) list_for_each_entry(vmsc, &comp->vmsc, comp_list) __class_props_mismatch(class, vmsc); + + if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, class)) + mpam_clear_feature(mpam_feat_mbw_min, &class->props); } /* diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index d9eb342ba222..f1adbdad3969 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -224,6 +224,7 @@ struct mpam_props { /* Workaround bits for msc->quirks */ enum mpam_device_quirks { T241_SCRUB_SHADOW_REGS, + T241_FORCE_MBW_MIN_TO_ONE, MPAM_QUIRK_LAST }; From dc48eb1ff27cc3169c3c5cca5eb20645d04d9e22 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Fri, 13 Mar 2026 14:46:15 +0000 Subject: [PATCH 078/109] arm_mpam: Add workaround for T241-MPAM-6 The registers MSMON_MBWU_L and MSMON_MBWU return the number of requests rather than the number of bytes transferred. Bandwidth resource monitoring is performed at the last level cache, where each request arrive in 64Byte granularity. The current implementation returns the number of transactions received at the last level cache but does not provide the value in bytes. Scaling by 64 gives an accurate byte count to match the MPAM specification for the MSMON_MBWU and MSMON_MBWU_L registers. This patch fixes the issue by reporting the actual number of bytes instead of the number of transactions from __ris_msmon_read(). Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Punit Agrawal Tested-by: Peter Newman Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Gavin Shan Signed-off-by: Shanker Donthineni Signed-off-by: Ben Horgan Signed-off-by: James Morse --- Documentation/arch/arm64/silicon-errata.rst | 2 ++ drivers/resctrl/mpam_devices.c | 26 +++++++++++++++++++-- drivers/resctrl/mpam_internal.h | 1 + 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index a4b246655e37..1aa3326bb320 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -251,6 +251,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | NVIDIA | T241 MPAM | T241-MPAM-4 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | T241 MPAM | T241-MPAM-6 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 142e7ea960e5..1a92c8c42b59 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -685,6 +685,12 @@ static const struct mpam_quirk mpam_quirks[] = { .iidr_mask = MPAM_IIDR_MATCH_ONE, .workaround = T241_FORCE_MBW_MIN_TO_ONE, }, + { + /* NVIDIA t241 erratum T241-MPAM-6 */ + .iidr = MPAM_IIDR_NVIDIA_T241, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = T241_MBW_COUNTER_SCALE_64, + }, { NULL } /* Sentinel */ }; @@ -1146,7 +1152,7 @@ static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val, } } -static u64 mpam_msmon_overflow_val(enum mpam_device_features type) +static u64 __mpam_msmon_overflow_val(enum mpam_device_features type) { /* TODO: implement scaling counters */ switch (type) { @@ -1161,6 +1167,18 @@ static u64 mpam_msmon_overflow_val(enum mpam_device_features type) } } +static u64 mpam_msmon_overflow_val(enum mpam_device_features type, + struct mpam_msc *msc) +{ + u64 overflow_val = __mpam_msmon_overflow_val(type); + + if (mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, msc) && + type != mpam_feat_msmon_mbwu_63counter) + overflow_val *= 64; + + return overflow_val; +} + static void __ris_msmon_read(void *arg) { u64 now; @@ -1251,13 +1269,17 @@ static void __ris_msmon_read(void *arg) now = FIELD_GET(MSMON___VALUE, now); } + if (mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, msc) && + m->type != mpam_feat_msmon_mbwu_63counter) + now *= 64; + if (nrdy) break; mbwu_state = &ris->mbwu_state[ctx->mon]; if (overflow) - mbwu_state->correction += mpam_msmon_overflow_val(m->type); + mbwu_state->correction += mpam_msmon_overflow_val(m->type, msc); /* * Include bandwidth consumed before the last hardware reset and diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index f1adbdad3969..8fea28c5fb85 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -225,6 +225,7 @@ struct mpam_props { enum mpam_device_quirks { T241_SCRUB_SHADOW_REGS, T241_FORCE_MBW_MIN_TO_ONE, + T241_MBW_COUNTER_SCALE_64, MPAM_QUIRK_LAST }; From aeb8595a5f8ba4aac8b5c265a8bcc3f18b473cb5 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:16 +0000 Subject: [PATCH 079/109] arm_mpam: Quirk CMN-650's CSU NRDY behaviour CMN-650 is afflicted with an erratum where the CSU NRDY bit never clears. This tells us the monitor never finishes scanning the cache. The erratum document says to wait the maximum time, then ignore the field. Add a flag to indicate whether this is the final attempt to read the counter, and when this quirk is applied, ignore the NRDY field. This means accesses to this counter will always retry, even if the counter was previously programmed to the same values. The counter value is not expected to be stable, it drifts up and down with each allocation and eviction. The CSU register provides the value for a point in time. Tested-by: Punit Agrawal Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse --- Documentation/arch/arm64/silicon-errata.rst | 3 +++ drivers/resctrl/mpam_devices.c | 12 ++++++++++++ drivers/resctrl/mpam_internal.h | 6 ++++++ 3 files changed, 21 insertions(+) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index 1aa3326bb320..65ed6ea33751 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -214,6 +214,9 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | ARM | SI L1 | #4311569 | ARM64_ERRATUM_4311569 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | CMN-650 | #3642720 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ | Broadcom | Brahma-B53 | N/A | ARM64_ERRATUM_845719 | +----------------+-----------------+-----------------+-----------------------------+ | Broadcom | Brahma-B53 | N/A | ARM64_ERRATUM_843419 | diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 1a92c8c42b59..3e419e59a36f 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -691,6 +691,12 @@ static const struct mpam_quirk mpam_quirks[] = { .iidr_mask = MPAM_IIDR_MATCH_ONE, .workaround = T241_MBW_COUNTER_SCALE_64, }, + { + /* ARM CMN-650 CSU erratum 3642720 */ + .iidr = MPAM_IIDR_ARM_CMN_650, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = IGNORE_CSU_NRDY, + }, { NULL } /* Sentinel */ }; @@ -1003,6 +1009,7 @@ struct mon_read { enum mpam_device_features type; u64 *val; int err; + bool waited_timeout; }; static bool mpam_ris_has_mbwu_long_counter(struct mpam_msc_ris *ris) @@ -1249,6 +1256,10 @@ static void __ris_msmon_read(void *arg) if (mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, rprops)) nrdy = now & MSMON___NRDY; now = FIELD_GET(MSMON___VALUE, now); + + if (mpam_has_quirk(IGNORE_CSU_NRDY, msc) && m->waited_timeout) + nrdy = false; + break; case mpam_feat_msmon_mbwu_31counter: case mpam_feat_msmon_mbwu_44counter: @@ -1386,6 +1397,7 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, .ctx = ctx, .type = type, .val = val, + .waited_timeout = true, }; *val = 0; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 8fea28c5fb85..1914aefdcba9 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -226,6 +226,7 @@ enum mpam_device_quirks { T241_SCRUB_SHADOW_REGS, T241_FORCE_MBW_MIN_TO_ONE, T241_MBW_COUNTER_SCALE_64, + IGNORE_CSU_NRDY, MPAM_QUIRK_LAST }; @@ -251,6 +252,11 @@ struct mpam_quirk { FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0) | \ FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x36b)) +#define MPAM_IIDR_ARM_CMN_650 (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x43b)) + /* The values for MSMON_CFG_MBWU_FLT.RWBW */ enum mon_filter_options { COUNT_BOTH = 0, From 4ce0a2ccc0358f3f746fa50815a599f861fd5d68 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:46:17 +0000 Subject: [PATCH 080/109] arm64: mpam: Add initial MPAM documentation MPAM (Memory Partitioning and Monitoring) is now exposed to user-space via resctrl. Add some documentation so the user knows what features to expect. Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Acked-by: Catalin Marinas Signed-off-by: Ben Horgan Reviewed-by: Gavin Shan Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Jesse Chick Signed-off-by: James Morse --- Documentation/arch/arm64/index.rst | 1 + Documentation/arch/arm64/mpam.rst | 72 ++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 Documentation/arch/arm64/mpam.rst diff --git a/Documentation/arch/arm64/index.rst b/Documentation/arch/arm64/index.rst index af52edc8c0ac..98052b4ef4a1 100644 --- a/Documentation/arch/arm64/index.rst +++ b/Documentation/arch/arm64/index.rst @@ -23,6 +23,7 @@ ARM64 Architecture memory memory-tagging-extension mops + mpam perf pointer-authentication ptdump diff --git a/Documentation/arch/arm64/mpam.rst b/Documentation/arch/arm64/mpam.rst new file mode 100644 index 000000000000..570f51a8d4eb --- /dev/null +++ b/Documentation/arch/arm64/mpam.rst @@ -0,0 +1,72 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==== +MPAM +==== + +What is MPAM +============ +MPAM (Memory Partitioning and Monitoring) is a feature in the CPUs and memory +system components such as the caches or memory controllers that allow memory +traffic to be labelled, partitioned and monitored. + +Traffic is labelled by the CPU, based on the control or monitor group the +current task is assigned to using resctrl. Partitioning policy can be set +using the schemata file in resctrl, and monitor values read via resctrl. +See Documentation/filesystems/resctrl.rst for more details. + +This allows tasks that share memory system resources, such as caches, to be +isolated from each other according to the partitioning policy (so called noisy +neighbours). + +Supported Platforms +=================== +Use of this feature requires CPU support, support in the memory system +components, and a description from firmware of where the MPAM device controls +are in the MMIO address space. (e.g. the 'MPAM' ACPI table). + +The MMIO device that provides MPAM controls/monitors for a memory system +component is called a memory system component. (MSC). + +Because the user interface to MPAM is via resctrl, only MPAM features that are +compatible with resctrl can be exposed to user-space. + +MSC are considered as a group based on the topology. MSC that correspond with +the L3 cache are considered together, it is not possible to mix MSC between L2 +and L3 to 'cover' a resctrl schema. + +The supported features are: + +* Cache portion bitmap controls (CPOR) on the L2 or L3 caches. To expose + CPOR at L2 or L3, every CPU must have a corresponding CPU cache at this + level that also supports the feature. Mismatched big/little platforms are + not supported as resctrl's controls would then also depend on task + placement. + +* Memory bandwidth maximum controls (MBW_MAX) on or after the L3 cache. + resctrl uses the L3 cache-id to identify where the memory bandwidth + control is applied. For this reason the platform must have an L3 cache + with cache-id's supplied by firmware. (It doesn't need to support MPAM.) + + To be exported as the 'MB' schema, the topology of the group of MSC chosen + must match the topology of the L3 cache so that the cache-id's can be + repainted. For example: Platforms with Memory bandwidth maximum controls + on CPU-less NUMA nodes cannot expose the 'MB' schema to resctrl as these + nodes do not have a corresponding L3 cache. If the memory bandwidth + control is on the memory rather than the L3 then there must be a single + global L3 as otherwise it is unknown which L3 the traffic came from. There + must be no caches between the L3 and the memory so that the two ends of + the path have equivalent traffic. + + When the MPAM driver finds multiple groups of MSC it can use for the 'MB' + schema, it prefers the group closest to the L3 cache. + +* Cache Storage Usage (CSU) counters can expose the 'llc_occupancy' provided + there is at least one CSU monitor on each MSC that makes up the L3 group. + Exposing CSU counters from other caches or devices is not supported. + +Reporting Bugs +============== +If you are not seeing the counters or controls you expect please share the +debug messages produced when enabling dynamic debug and booting with: +dyndbg="file mpam_resctrl.c +pl" From e223258ed8a683d9debbb03ca1be0736f2c12e5b Mon Sep 17 00:00:00 2001 From: Yeoreum Yun Date: Sat, 14 Mar 2026 17:51:31 +0000 Subject: [PATCH 081/109] arm64: armv8_deprecated: Disable swp emulation when FEAT_LSUI present The purpose of supporting LSUI is to eliminate PAN toggling. CPUs that support LSUI are unlikely to support a 32-bit runtime. Rather than emulating the SWP instruction using LSUI instructions in order to remove PAN toggling, simply disable SWP emulation. Signed-off-by: Yeoreum Yun [catalin.marinas@arm.com: some tweaks to the in-code comment] Signed-off-by: Catalin Marinas --- arch/arm64/kernel/armv8_deprecated.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index e737c6295ec7..b7a1f8b788bb 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -610,6 +610,20 @@ static int __init armv8_deprecated_init(void) } #endif + +#ifdef CONFIG_SWP_EMULATION + /* + * The purpose of supporting LSUI is to eliminate PAN toggling. CPUs + * that support LSUI are unlikely to support a 32-bit runtime. Rather + * than emulating the SWP instruction using LSUI instructions, simply + * disable SWP emulation. + */ + if (cpus_have_final_cap(ARM64_HAS_LSUI)) { + insn_swp.status = INSN_UNAVAILABLE; + pr_info("swp/swpb instruction emulation is not supported on this system\n"); + } +#endif + for (int i = 0; i < ARRAY_SIZE(insn_emulations); i++) { struct insn_emulation *ie = insn_emulations[i]; From 48478b9f791376b4b89018d7afdfd06865498f65 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 9 Mar 2026 02:57:24 +0000 Subject: [PATCH 082/109] arm64/mm: Enable batched TLB flush in unmap_hotplug_range() During a memory hot remove operation, both linear and vmemmap mappings for the memory range being removed, get unmapped via unmap_hotplug_range() but mapped pages get freed only for vmemmap mapping. This is just a sequential operation where each table entry gets cleared, followed by a leaf specific TLB flush, and then followed by memory free operation when applicable. This approach was simple and uniform both for vmemmap and linear mappings. But linear mapping might contain CONT marked block memory where it becomes necessary to first clear out all entire in the range before a TLB flush. This is as per the architecture requirement. Hence batch all TLB flushes during the table tear down walk and finally do it in unmap_hotplug_range(). Prior to this fix, it was hypothetically possible for a speculative access to a higher address in the contiguous block to fill the TLB with shattered entries for the entire contiguous range after a lower address had already been cleared and invalidated. Due to the table entries being shattered, the subsequent TLB invalidation for the higher address would not then clear the TLB entries for the lower address, meaning stale TLB entries could persist. Besides it also helps in improving the performance via TLBI range operation along with reduced synchronization instructions. The time spent executing unmap_hotplug_range() improved 97% measured over a 2GB memory hot removal in KVM guest. This scheme is not applicable during vmemmap mapping tear down where memory needs to be freed and hence a TLB flush is required after clearing out page table entry. Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Closes: https://lore.kernel.org/all/aWZYXhrT6D2M-7-N@willie-the-truck/ Fixes: bbd6ec605c0f ("arm64/mm: Enable memory hot remove") Cc: stable@vger.kernel.org Reviewed-by: David Hildenbrand (Arm) Reviewed-by: Ryan Roberts Signed-off-by: Ryan Roberts Signed-off-by: Anshuman Khandual Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a6a00accf4f9..5dbf988120c8 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1458,10 +1458,14 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr, WARN_ON(!pte_present(pte)); __pte_clear(&init_mm, addr, ptep); - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); - if (free_mapped) + if (free_mapped) { + /* CONT blocks are not supported in the vmemmap */ + WARN_ON(pte_cont(pte)); + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); free_hotplug_page_range(pte_page(pte), PAGE_SIZE, altmap); + } + /* unmap_hotplug_range() flushes TLB for !free_mapped */ } while (addr += PAGE_SIZE, addr < end); } @@ -1482,15 +1486,14 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr, WARN_ON(!pmd_present(pmd)); if (pmd_sect(pmd)) { pmd_clear(pmdp); - - /* - * One TLBI should be sufficient here as the PMD_SIZE - * range is mapped with a single block entry. - */ - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); - if (free_mapped) + if (free_mapped) { + /* CONT blocks are not supported in the vmemmap */ + WARN_ON(pmd_cont(pmd)); + flush_tlb_kernel_range(addr, addr + PMD_SIZE); free_hotplug_page_range(pmd_page(pmd), PMD_SIZE, altmap); + } + /* unmap_hotplug_range() flushes TLB for !free_mapped */ continue; } WARN_ON(!pmd_table(pmd)); @@ -1515,15 +1518,12 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr, WARN_ON(!pud_present(pud)); if (pud_sect(pud)) { pud_clear(pudp); - - /* - * One TLBI should be sufficient here as the PUD_SIZE - * range is mapped with a single block entry. - */ - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); - if (free_mapped) + if (free_mapped) { + flush_tlb_kernel_range(addr, addr + PUD_SIZE); free_hotplug_page_range(pud_page(pud), PUD_SIZE, altmap); + } + /* unmap_hotplug_range() flushes TLB for !free_mapped */ continue; } WARN_ON(!pud_table(pud)); @@ -1553,6 +1553,7 @@ static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr, static void unmap_hotplug_range(unsigned long addr, unsigned long end, bool free_mapped, struct vmem_altmap *altmap) { + unsigned long start = addr; unsigned long next; pgd_t *pgdp, pgd; @@ -1574,6 +1575,9 @@ static void unmap_hotplug_range(unsigned long addr, unsigned long end, WARN_ON(!pgd_present(pgd)); unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap); } while (addr = next, addr < end); + + if (!free_mapped) + flush_tlb_kernel_range(start, end); } static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, From 95a58852b0e5413b6ef4c93da60a80e89da9986a Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 9 Mar 2026 02:57:25 +0000 Subject: [PATCH 083/109] arm64/mm: Reject memory removal that splits a kernel leaf mapping Linear and vmemmap mappings that get torn down during a memory hot remove operation might contain leaf level entries on any page table level. If the requested memory range's linear or vmemmap mappings falls within such leaf entries, new mappings need to be created for the remaining memory mapped on the leaf entry earlier, following standard break before make aka BBM rules. But kernel cannot tolerate BBM and hence remapping to fine grained leaves would not be possible on systems without BBML2_NOABORT. Currently memory hot remove operation does not perform such restructuring, and so removing memory ranges that could split a kernel leaf level mapping need to be rejected. While memory_hotplug.c does appear to permit hot removing arbitrary ranges of memory, the higher layers that drive memory_hotplug (e.g. ACPI, virtio, ...) all appear to treat memory as fixed size devices. So it is impossible to hot unplug a different amount than was previously hot plugged, and hence we should never see a rejection in practice, but adding the check makes us robust against a future change. Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/all/aWZYXhrT6D2M-7-N@willie-the-truck/ Reviewed-by: David Hildenbrand (Arm) Reviewed-by: Ryan Roberts Suggested-by: Ryan Roberts Signed-off-by: Anshuman Khandual Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 120 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 114 insertions(+), 6 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 5dbf988120c8..5fb9a66f0754 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -2014,6 +2014,107 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size); } + +static bool addr_splits_kernel_leaf(unsigned long addr) +{ + pgd_t *pgdp, pgd; + p4d_t *p4dp, p4d; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep, pte; + + /* + * If the given address points at a the start address of + * a possible leaf, we certainly won't split. Otherwise, + * check if we would actually split a leaf by traversing + * the page tables further. + */ + if (IS_ALIGNED(addr, PGDIR_SIZE)) + return false; + + pgdp = pgd_offset_k(addr); + pgd = pgdp_get(pgdp); + if (!pgd_present(pgd)) + return false; + + if (IS_ALIGNED(addr, P4D_SIZE)) + return false; + + p4dp = p4d_offset(pgdp, addr); + p4d = p4dp_get(p4dp); + if (!p4d_present(p4d)) + return false; + + if (IS_ALIGNED(addr, PUD_SIZE)) + return false; + + pudp = pud_offset(p4dp, addr); + pud = pudp_get(pudp); + if (!pud_present(pud)) + return false; + + if (pud_leaf(pud)) + return true; + + if (IS_ALIGNED(addr, CONT_PMD_SIZE)) + return false; + + pmdp = pmd_offset(pudp, addr); + pmd = pmdp_get(pmdp); + if (!pmd_present(pmd)) + return false; + + if (pmd_cont(pmd)) + return true; + + if (IS_ALIGNED(addr, PMD_SIZE)) + return false; + + if (pmd_leaf(pmd)) + return true; + + if (IS_ALIGNED(addr, CONT_PTE_SIZE)) + return false; + + ptep = pte_offset_kernel(pmdp, addr); + pte = __ptep_get(ptep); + if (!pte_present(pte)) + return false; + + if (pte_cont(pte)) + return true; + + return !IS_ALIGNED(addr, PAGE_SIZE); +} + +static bool can_unmap_without_split(unsigned long pfn, unsigned long nr_pages) +{ + unsigned long phys_start, phys_end, start, end; + + phys_start = PFN_PHYS(pfn); + phys_end = phys_start + nr_pages * PAGE_SIZE; + + /* PFN range's linear map edges are leaf entry aligned */ + start = __phys_to_virt(phys_start); + end = __phys_to_virt(phys_end); + if (addr_splits_kernel_leaf(start) || addr_splits_kernel_leaf(end)) { + pr_warn("[%lx %lx] splits a leaf entry in linear map\n", + phys_start, phys_end); + return false; + } + + /* PFN range's vmemmap edges are leaf entry aligned */ + BUILD_BUG_ON(!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)); + start = (unsigned long)pfn_to_page(pfn); + end = (unsigned long)pfn_to_page(pfn + nr_pages); + if (addr_splits_kernel_leaf(start) || addr_splits_kernel_leaf(end)) { + pr_warn("[%lx %lx] splits a leaf entry in vmemmap\n", + phys_start, phys_end); + return false; + } + return true; +} + /* * This memory hotplug notifier helps prevent boot memory from being * inadvertently removed as it blocks pfn range offlining process in @@ -2022,8 +2123,11 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) * In future if and when boot memory could be removed, this notifier * should be dropped and free_hotplug_page_range() should handle any * reserved pages allocated during boot. + * + * This also blocks any memory remove that would have caused a split + * in leaf entry in kernel linear or vmemmap mapping. */ -static int prevent_bootmem_remove_notifier(struct notifier_block *nb, +static int prevent_memory_remove_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct mem_section *ms; @@ -2069,11 +2173,15 @@ static int prevent_bootmem_remove_notifier(struct notifier_block *nb, return NOTIFY_DONE; } } + + if (!can_unmap_without_split(pfn, arg->nr_pages)) + return NOTIFY_BAD; + return NOTIFY_OK; } -static struct notifier_block prevent_bootmem_remove_nb = { - .notifier_call = prevent_bootmem_remove_notifier, +static struct notifier_block prevent_memory_remove_nb = { + .notifier_call = prevent_memory_remove_notifier, }; /* @@ -2123,7 +2231,7 @@ static void validate_bootmem_online(void) } } -static int __init prevent_bootmem_remove_init(void) +static int __init prevent_memory_remove_init(void) { int ret = 0; @@ -2131,13 +2239,13 @@ static int __init prevent_bootmem_remove_init(void) return ret; validate_bootmem_online(); - ret = register_memory_notifier(&prevent_bootmem_remove_nb); + ret = register_memory_notifier(&prevent_memory_remove_nb); if (ret) pr_err("%s: Notifier registration failed %d\n", __func__, ret); return ret; } -early_initcall(prevent_bootmem_remove_init); +early_initcall(prevent_memory_remove_init); #endif pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr, From 5e0deb0a6b4e07825c3b1c4317c5e413421266ed Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Wed, 18 Mar 2026 09:25:43 +0000 Subject: [PATCH 084/109] arm64: mm: Use generic enum pgtable_level enum pgtable_type was introduced for arm64 by commit c64f46ee1377 ("arm64: mm: use enum to identify pgtable level instead of *_SHIFT"). In the meantime, the generic enum pgtable_level got introduced by commit b22cc9a9c7ff ("mm/rmap: convert "enum rmap_level" to "enum pgtable_level""). Let's switch to the generic enum pgtable_level. The only difference is that it also includes PGD level; __pgd_pgtable_alloc() isn't expected to create PGD tables so we add a VM_WARN_ON() for that case. Suggested-by: David Hildenbrand (Arm) Reviewed-by: David Hildenbrand (Arm) Reviewed-by: Ryan Roberts Signed-off-by: Kevin Brodsky Reviewed-by: Anshuman Khandual Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mmu.h | 7 ----- arch/arm64/mm/mmu.c | 59 +++++++++++++++++++----------------- 2 files changed, 31 insertions(+), 35 deletions(-) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 137a173df1ff..f28e6e215da4 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -17,13 +17,6 @@ #include #include -enum pgtable_type { - TABLE_PTE, - TABLE_PMD, - TABLE_PUD, - TABLE_P4D, -}; - typedef struct { atomic64_t id; #ifdef CONFIG_COMPAT diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a6a00accf4f9..c45a6a47dad0 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -112,7 +112,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, } EXPORT_SYMBOL(phys_mem_access_prot); -static phys_addr_t __init early_pgtable_alloc(enum pgtable_type pgtable_type) +static phys_addr_t __init early_pgtable_alloc(enum pgtable_level pgtable_level) { phys_addr_t phys; @@ -197,7 +197,7 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end, static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), + phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags) { unsigned long next; @@ -212,7 +212,7 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, if (flags & NO_EXEC_MAPPINGS) pmdval |= PMD_TABLE_PXN; BUG_ON(!pgtable_alloc); - pte_phys = pgtable_alloc(TABLE_PTE); + pte_phys = pgtable_alloc(PGTABLE_LEVEL_PTE); if (pte_phys == INVALID_PHYS_ADDR) return -ENOMEM; ptep = pte_set_fixmap(pte_phys); @@ -252,7 +252,7 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags) + phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags) { unsigned long next; @@ -292,7 +292,7 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), + phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags) { int ret; @@ -311,7 +311,7 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, if (flags & NO_EXEC_MAPPINGS) pudval |= PUD_TABLE_PXN; BUG_ON(!pgtable_alloc); - pmd_phys = pgtable_alloc(TABLE_PMD); + pmd_phys = pgtable_alloc(PGTABLE_LEVEL_PMD); if (pmd_phys == INVALID_PHYS_ADDR) return -ENOMEM; pmdp = pmd_set_fixmap(pmd_phys); @@ -349,7 +349,7 @@ out: static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), + phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags) { int ret = 0; @@ -364,7 +364,7 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, if (flags & NO_EXEC_MAPPINGS) p4dval |= P4D_TABLE_PXN; BUG_ON(!pgtable_alloc); - pud_phys = pgtable_alloc(TABLE_PUD); + pud_phys = pgtable_alloc(PGTABLE_LEVEL_PUD); if (pud_phys == INVALID_PHYS_ADDR) return -ENOMEM; pudp = pud_set_fixmap(pud_phys); @@ -415,7 +415,7 @@ out: static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), + phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags) { int ret; @@ -430,7 +430,7 @@ static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, if (flags & NO_EXEC_MAPPINGS) pgdval |= PGD_TABLE_PXN; BUG_ON(!pgtable_alloc); - p4d_phys = pgtable_alloc(TABLE_P4D); + p4d_phys = pgtable_alloc(PGTABLE_LEVEL_P4D); if (p4d_phys == INVALID_PHYS_ADDR) return -ENOMEM; p4dp = p4d_set_fixmap(p4d_phys); @@ -467,7 +467,7 @@ out: static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), + phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags) { int ret; @@ -500,7 +500,7 @@ static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), + phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags) { int ret; @@ -516,7 +516,7 @@ static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, static void early_create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), + phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags) { int ret; @@ -528,7 +528,7 @@ static void early_create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, } static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp, - enum pgtable_type pgtable_type) + enum pgtable_level pgtable_level) { /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */ struct ptdesc *ptdesc = pagetable_alloc(gfp & ~__GFP_ZERO, 0); @@ -539,40 +539,43 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp, pa = page_to_phys(ptdesc_page(ptdesc)); - switch (pgtable_type) { - case TABLE_PTE: + switch (pgtable_level) { + case PGTABLE_LEVEL_PTE: BUG_ON(!pagetable_pte_ctor(mm, ptdesc)); break; - case TABLE_PMD: + case PGTABLE_LEVEL_PMD: BUG_ON(!pagetable_pmd_ctor(mm, ptdesc)); break; - case TABLE_PUD: + case PGTABLE_LEVEL_PUD: pagetable_pud_ctor(ptdesc); break; - case TABLE_P4D: + case PGTABLE_LEVEL_P4D: pagetable_p4d_ctor(ptdesc); break; + case PGTABLE_LEVEL_PGD: + VM_WARN_ON(1); + break; } return pa; } static phys_addr_t -pgd_pgtable_alloc_init_mm_gfp(enum pgtable_type pgtable_type, gfp_t gfp) +pgd_pgtable_alloc_init_mm_gfp(enum pgtable_level pgtable_level, gfp_t gfp) { - return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_type); + return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_level); } static phys_addr_t __maybe_unused -pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type) +pgd_pgtable_alloc_init_mm(enum pgtable_level pgtable_level) { - return pgd_pgtable_alloc_init_mm_gfp(pgtable_type, GFP_PGTABLE_KERNEL); + return pgd_pgtable_alloc_init_mm_gfp(pgtable_level, GFP_PGTABLE_KERNEL); } static phys_addr_t -pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type) +pgd_pgtable_alloc_special_mm(enum pgtable_level pgtable_level) { - return __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type); + return __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_level); } static void split_contpte(pte_t *ptep) @@ -593,7 +596,7 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont) pte_t *ptep; int i; - pte_phys = pgd_pgtable_alloc_init_mm_gfp(TABLE_PTE, gfp); + pte_phys = pgd_pgtable_alloc_init_mm_gfp(PGTABLE_LEVEL_PTE, gfp); if (pte_phys == INVALID_PHYS_ADDR) return -ENOMEM; ptep = (pte_t *)phys_to_virt(pte_phys); @@ -638,7 +641,7 @@ static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont) pmd_t *pmdp; int i; - pmd_phys = pgd_pgtable_alloc_init_mm_gfp(TABLE_PMD, gfp); + pmd_phys = pgd_pgtable_alloc_init_mm_gfp(PGTABLE_LEVEL_PMD, gfp); if (pmd_phys == INVALID_PHYS_ADDR) return -ENOMEM; pmdp = (pmd_t *)phys_to_virt(pmd_phys); @@ -1226,7 +1229,7 @@ static void __init declare_vma(struct vm_struct *vma, static phys_addr_t kpti_ng_temp_alloc __initdata; -static phys_addr_t __init kpti_ng_pgd_alloc(enum pgtable_type type) +static phys_addr_t __init kpti_ng_pgd_alloc(enum pgtable_level pgtable_level) { kpti_ng_temp_alloc -= PAGE_SIZE; return kpti_ng_temp_alloc; From f12b435de2f2bb09ce406467020181ada528844c Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 30 Mar 2026 17:17:02 +0100 Subject: [PATCH 085/109] arm64: mm: Fix rodata=full block mapping support for realm guests Commit a166563e7ec37 ("arm64: mm: support large block mapping when rodata=full") enabled the linear map to be mapped by block/cont while still allowing granular permission changes on BBML2_NOABORT systems by lazily splitting the live mappings. This mechanism was intended to be usable by realm guests since they need to dynamically share dma buffers with the host by "decrypting" them - which for Arm CCA, means marking them as shared in the page tables. However, it turns out that the mechanism was failing for realm guests because realms need to share their dma buffers (via __set_memory_enc_dec()) much earlier during boot than split_kernel_leaf_mapping() was able to handle. The report linked below showed that GIC's ITS was one such user. But during the investigation I found other callsites that could not meet the split_kernel_leaf_mapping() constraints. The problem is that we block map the linear map based on the boot CPU supporting BBML2_NOABORT, then check that all the other CPUs support it too when finalizing the caps. If they don't, then we stop_machine() and split to ptes. For safety, split_kernel_leaf_mapping() previously wouldn't permit splitting until after the caps were finalized. That ensured that if any secondary cpus were running that didn't support BBML2_NOABORT, we wouldn't risk breaking them. I've fix this problem by reducing the black-out window where we refuse to split; there are now 2 windows. The first is from T0 until the page allocator is inititialized. Splitting allocates memory for the page allocator so it must be in use. The second covers the period between starting to online the secondary cpus until the system caps are finalized (this is a very small window). All of the problematic callers are calling __set_memory_enc_dec() before the secondary cpus come online, so this solves the problem. However, one of these callers, swiotlb_update_mem_attributes(), was trying to split before the page allocator was initialized. So I have moved this call from arch_mm_preinit() to mem_init(), which solves the ordering issue. I've added warnings and return an error if any attempt is made to split in the black-out windows. Note there are other issues which prevent booting all the way to user space, which will be fixed in subsequent patches. Reported-by: Jinjiang Tu Closes: https://lore.kernel.org/all/0b2a4ae5-fc51-4d77-b177-b2e9db74f11d@huawei.com/ Fixes: a166563e7ec3 ("arm64: mm: support large block mapping when rodata=full") Cc: stable@vger.kernel.org Reviewed-by: Kevin Brodsky Signed-off-by: Ryan Roberts Reviewed-by: Suzuki K Poulose Tested-by: Suzuki K Poulose Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mmu.h | 2 ++ arch/arm64/mm/init.c | 9 +++++++- arch/arm64/mm/mmu.c | 45 +++++++++++++++++++++++++----------- 3 files changed, 42 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 137a173df1ff..472610433aae 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -112,5 +112,7 @@ void kpti_install_ng_mappings(void); static inline void kpti_install_ng_mappings(void) {} #endif +extern bool page_alloc_available; + #endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 96711b8578fd..b9b248d24fd1 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -350,7 +350,6 @@ void __init arch_mm_preinit(void) } swiotlb_init(swiotlb, flags); - swiotlb_update_mem_attributes(); /* * Check boundaries twice: Some fundamental inconsistencies can be @@ -377,6 +376,14 @@ void __init arch_mm_preinit(void) } } +bool page_alloc_available __ro_after_init; + +void __init mem_init(void) +{ + page_alloc_available = true; + swiotlb_update_mem_attributes(); +} + void free_initmem(void) { void *lm_init_begin = lm_alias(__init_begin); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a6a00accf4f9..223947487a22 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -768,30 +768,51 @@ static inline bool force_pte_mapping(void) } static DEFINE_MUTEX(pgtable_split_lock); +static bool linear_map_requires_bbml2; int split_kernel_leaf_mapping(unsigned long start, unsigned long end) { int ret; - /* - * !BBML2_NOABORT systems should not be trying to change permissions on - * anything that is not pte-mapped in the first place. Just return early - * and let the permission change code raise a warning if not already - * pte-mapped. - */ - if (!system_supports_bbml2_noabort()) - return 0; - /* * If the region is within a pte-mapped area, there is no need to try to * split. Additionally, CONFIG_DEBUG_PAGEALLOC and CONFIG_KFENCE may * change permissions from atomic context so for those cases (which are * always pte-mapped), we must not go any further because taking the - * mutex below may sleep. + * mutex below may sleep. Do not call force_pte_mapping() here because + * it could return a confusing result if called from a secondary cpu + * prior to finalizing caps. Instead, linear_map_requires_bbml2 gives us + * what we need. */ - if (force_pte_mapping() || is_kfence_address((void *)start)) + if (!linear_map_requires_bbml2 || is_kfence_address((void *)start)) return 0; + if (!system_supports_bbml2_noabort()) { + /* + * !BBML2_NOABORT systems should not be trying to change + * permissions on anything that is not pte-mapped in the first + * place. Just return early and let the permission change code + * raise a warning if not already pte-mapped. + */ + if (system_capabilities_finalized()) + return 0; + + /* + * Boot-time: split_kernel_leaf_mapping_locked() allocates from + * page allocator. Can't split until it's available. + */ + if (WARN_ON(!page_alloc_available)) + return -EBUSY; + + /* + * Boot-time: Started secondary cpus but don't know if they + * support BBML2_NOABORT yet. Can't allow splitting in this + * window in case they don't. + */ + if (WARN_ON(num_online_cpus() > 1)) + return -EBUSY; + } + /* * Ensure start and end are at least page-aligned since this is the * finest granularity we can split to. @@ -891,8 +912,6 @@ static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp return ret; } -static bool linear_map_requires_bbml2 __initdata; - u32 idmap_kpti_bbml2_flag; static void __init init_idmap_kpti_bbml2_flag(void) From 15bfba1ad77fad8e45a37aae54b3c813b33fe27c Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 30 Mar 2026 17:17:03 +0100 Subject: [PATCH 086/109] arm64: mm: Handle invalid large leaf mappings correctly It has been possible for a long time to mark ptes in the linear map as invalid. This is done for secretmem, kfence, realm dma memory un/share, and others, by simply clearing the PTE_VALID bit. But until commit a166563e7ec37 ("arm64: mm: support large block mapping when rodata=full") large leaf mappings were never made invalid in this way. It turns out various parts of the code base are not equipped to handle invalid large leaf mappings (in the way they are currently encoded) and I've observed a kernel panic while booting a realm guest on a BBML2_NOABORT system as a result: [ 15.432706] software IO TLB: Memory encryption is active and system is using DMA bounce buffers [ 15.476896] Unable to handle kernel paging request at virtual address ffff000019600000 [ 15.513762] Mem abort info: [ 15.527245] ESR = 0x0000000096000046 [ 15.548553] EC = 0x25: DABT (current EL), IL = 32 bits [ 15.572146] SET = 0, FnV = 0 [ 15.592141] EA = 0, S1PTW = 0 [ 15.612694] FSC = 0x06: level 2 translation fault [ 15.640644] Data abort info: [ 15.661983] ISV = 0, ISS = 0x00000046, ISS2 = 0x00000000 [ 15.694875] CM = 0, WnR = 1, TnD = 0, TagAccess = 0 [ 15.723740] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 15.755776] swapper pgtable: 4k pages, 48-bit VAs, pgdp=0000000081f3f000 [ 15.800410] [ffff000019600000] pgd=0000000000000000, p4d=180000009ffff403, pud=180000009fffe403, pmd=00e8000199600704 [ 15.855046] Internal error: Oops: 0000000096000046 [#1] SMP [ 15.886394] Modules linked in: [ 15.900029] CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 7.0.0-rc4-dirty #4 PREEMPT [ 15.935258] Hardware name: linux,dummy-virt (DT) [ 15.955612] pstate: 21400005 (nzCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) [ 15.986009] pc : __pi_memcpy_generic+0x128/0x22c [ 16.006163] lr : swiotlb_bounce+0xf4/0x158 [ 16.024145] sp : ffff80008000b8f0 [ 16.038896] x29: ffff80008000b8f0 x28: 0000000000000000 x27: 0000000000000000 [ 16.069953] x26: ffffb3976d261ba8 x25: 0000000000000000 x24: ffff000019600000 [ 16.100876] x23: 0000000000000001 x22: ffff0000043430d0 x21: 0000000000007ff0 [ 16.131946] x20: 0000000084570010 x19: 0000000000000000 x18: ffff00001ffe3fcc [ 16.163073] x17: 0000000000000000 x16: 00000000003fffff x15: 646e612065766974 [ 16.194131] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 [ 16.225059] x11: 0000000000000000 x10: 0000000000000010 x9 : 0000000000000018 [ 16.256113] x8 : 0000000000000018 x7 : 0000000000000000 x6 : 0000000000000000 [ 16.287203] x5 : ffff000019607ff0 x4 : ffff000004578000 x3 : ffff000019600000 [ 16.318145] x2 : 0000000000007ff0 x1 : ffff000004570010 x0 : ffff000019600000 [ 16.349071] Call trace: [ 16.360143] __pi_memcpy_generic+0x128/0x22c (P) [ 16.380310] swiotlb_tbl_map_single+0x154/0x2b4 [ 16.400282] swiotlb_map+0x5c/0x228 [ 16.415984] dma_map_phys+0x244/0x2b8 [ 16.432199] dma_map_page_attrs+0x44/0x58 [ 16.449782] virtqueue_map_page_attrs+0x38/0x44 [ 16.469596] virtqueue_map_single_attrs+0xc0/0x130 [ 16.490509] virtnet_rq_alloc.isra.0+0xa4/0x1fc [ 16.510355] try_fill_recv+0x2a4/0x584 [ 16.526989] virtnet_open+0xd4/0x238 [ 16.542775] __dev_open+0x110/0x24c [ 16.558280] __dev_change_flags+0x194/0x20c [ 16.576879] netif_change_flags+0x24/0x6c [ 16.594489] dev_change_flags+0x48/0x7c [ 16.611462] ip_auto_config+0x258/0x1114 [ 16.628727] do_one_initcall+0x80/0x1c8 [ 16.645590] kernel_init_freeable+0x208/0x2f0 [ 16.664917] kernel_init+0x24/0x1e0 [ 16.680295] ret_from_fork+0x10/0x20 [ 16.696369] Code: 927cec03 cb0e0021 8b0e0042 a9411c26 (a900340c) [ 16.723106] ---[ end trace 0000000000000000 ]--- [ 16.752866] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b [ 16.792556] Kernel Offset: 0x3396ea200000 from 0xffff800080000000 [ 16.818966] PHYS_OFFSET: 0xfff1000080000000 [ 16.837237] CPU features: 0x0000000,00060005,13e38581,957e772f [ 16.862904] Memory Limit: none [ 16.876526] ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b ]--- This panic occurs because the swiotlb memory was previously shared to the host (__set_memory_enc_dec()), which involves transitioning the (large) leaf mappings to invalid, sharing to the host, then marking the mappings valid again. But pageattr_p[mu]d_entry() would only update the entry if it is a section mapping, since otherwise it concluded it must be a table entry so shouldn't be modified. But p[mu]d_sect() only returns true if the entry is valid. So the result was that the large leaf entry was made invalid in the first pass then ignored in the second pass. It remains invalid until the above code tries to access it and blows up. The simple fix would be to update pageattr_pmd_entry() to use !pmd_table() instead of pmd_sect(). That would solve this problem. But the ptdump code also suffers from a similar issue. It checks pmd_leaf() and doesn't call into the arch-specific note_page() machinery if it returns false. As a result of this, ptdump wasn't even able to show the invalid large leaf mappings; it looked like they were valid which made this super fun to debug. the ptdump code is core-mm and pmd_table() is arm64-specific so we can't use the same trick to solve that. But we already support the concept of "present-invalid" for user space entries. And even better, pmd_leaf() will return true for a leaf mapping that is marked present-invalid. So let's just use that encoding for present-invalid kernel mappings too. Then we can use pmd_leaf() where we previously used pmd_sect() and everything is magically fixed. Additionally, from inspection kernel_page_present() was broken in a similar way, so I'm also updating that to use pmd_leaf(). The transitional page tables component was also similarly broken; it creates a copy of the kernel page tables, making RO leaf mappings RW in the process. It also makes invalid (but-not-none) pte mappings valid. But it was not doing this for large leaf mappings. This could have resulted in crashes at kexec- or hibernate-time. This code is fixed to flip "present-invalid" mappings back to "present-valid" at all levels. Finally, I have hardened split_pmd()/split_pud() so that if it is passed a "present-invalid" leaf, it will maintain that property in the split leaves, since I wasn't able to convince myself that it would only ever be called for "present-valid" leaves. Fixes: a166563e7ec3 ("arm64: mm: support large block mapping when rodata=full") Cc: stable@vger.kernel.org Signed-off-by: Ryan Roberts Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-prot.h | 2 ++ arch/arm64/include/asm/pgtable.h | 9 +++-- arch/arm64/mm/mmu.c | 4 +++ arch/arm64/mm/pageattr.c | 50 +++++++++++++++------------ arch/arm64/mm/trans_pgd.c | 42 ++++------------------ 5 files changed, 48 insertions(+), 59 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index f560e6420267..212ce1b02e15 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -25,6 +25,8 @@ */ #define PTE_PRESENT_INVALID (PTE_NG) /* only when !PTE_VALID */ +#define PTE_PRESENT_VALID_KERNEL (PTE_VALID | PTE_MAYBE_NG) + #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP #define PTE_UFFD_WP (_AT(pteval_t, 1) << 58) /* uffd-wp tracking */ #define PTE_SWP_UFFD_WP (_AT(pteval_t, 1) << 3) /* only for swp ptes */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index b3e58735c49b..dd062179b9b6 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -322,9 +322,11 @@ static inline pte_t pte_mknoncont(pte_t pte) return clear_pte_bit(pte, __pgprot(PTE_CONT)); } -static inline pte_t pte_mkvalid(pte_t pte) +static inline pte_t pte_mkvalid_k(pte_t pte) { - return set_pte_bit(pte, __pgprot(PTE_VALID)); + pte = clear_pte_bit(pte, __pgprot(PTE_PRESENT_INVALID)); + pte = set_pte_bit(pte, __pgprot(PTE_PRESENT_VALID_KERNEL)); + return pte; } static inline pte_t pte_mkinvalid(pte_t pte) @@ -594,6 +596,7 @@ static inline int pmd_protnone(pmd_t pmd) #define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) +#define pmd_mkvalid_k(pmd) pte_pmd(pte_mkvalid_k(pmd_pte(pmd))) #define pmd_mkinvalid(pmd) pte_pmd(pte_mkinvalid(pmd_pte(pmd))) #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP #define pmd_uffd_wp(pmd) pte_uffd_wp(pmd_pte(pmd)) @@ -635,6 +638,8 @@ static inline pmd_t pmd_mkspecial(pmd_t pmd) #define pud_young(pud) pte_young(pud_pte(pud)) #define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) +#define pud_mkwrite_novma(pud) pte_pud(pte_mkwrite_novma(pud_pte(pud))) +#define pud_mkvalid_k(pud) pte_pud(pte_mkvalid_k(pud_pte(pud))) #define pud_write(pud) pte_write(pud_pte(pud)) static inline pud_t pud_mkhuge(pud_t pud) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 223947487a22..1575680675d8 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -602,6 +602,8 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont) tableprot |= PMD_TABLE_PXN; prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) | PTE_TYPE_PAGE); + if (!pmd_valid(pmd)) + prot = pte_pgprot(pte_mkinvalid(pfn_pte(0, prot))); prot = __pgprot(pgprot_val(prot) & ~PTE_CONT); if (to_cont) prot = __pgprot(pgprot_val(prot) | PTE_CONT); @@ -647,6 +649,8 @@ static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont) tableprot |= PUD_TABLE_PXN; prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT); + if (!pud_valid(pud)) + prot = pmd_pgprot(pmd_mkinvalid(pfn_pmd(0, prot))); prot = __pgprot(pgprot_val(prot) & ~PTE_CONT); if (to_cont) prot = __pgprot(pgprot_val(prot) | PTE_CONT); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 358d1dc9a576..ce035e1b4eaf 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -25,6 +25,11 @@ static ptdesc_t set_pageattr_masks(ptdesc_t val, struct mm_walk *walk) { struct page_change_data *masks = walk->private; + /* + * Some users clear and set bits which alias each other (e.g. PTE_NG and + * PTE_PRESENT_INVALID). It is therefore important that we always clear + * first then set. + */ val &= ~(pgprot_val(masks->clear_mask)); val |= (pgprot_val(masks->set_mask)); @@ -36,7 +41,7 @@ static int pageattr_pud_entry(pud_t *pud, unsigned long addr, { pud_t val = pudp_get(pud); - if (pud_sect(val)) { + if (pud_leaf(val)) { if (WARN_ON_ONCE((next - addr) != PUD_SIZE)) return -EINVAL; val = __pud(set_pageattr_masks(pud_val(val), walk)); @@ -52,7 +57,7 @@ static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr, { pmd_t val = pmdp_get(pmd); - if (pmd_sect(val)) { + if (pmd_leaf(val)) { if (WARN_ON_ONCE((next - addr) != PMD_SIZE)) return -EINVAL; val = __pmd(set_pageattr_masks(pmd_val(val), walk)); @@ -132,11 +137,12 @@ static int __change_memory_common(unsigned long start, unsigned long size, ret = update_range_prot(start, size, set_mask, clear_mask); /* - * If the memory is being made valid without changing any other bits - * then a TLBI isn't required as a non-valid entry cannot be cached in - * the TLB. + * If the memory is being switched from present-invalid to valid without + * changing any other bits then a TLBI isn't required as a non-valid + * entry cannot be cached in the TLB. */ - if (pgprot_val(set_mask) != PTE_VALID || pgprot_val(clear_mask)) + if (pgprot_val(set_mask) != PTE_PRESENT_VALID_KERNEL || + pgprot_val(clear_mask) != PTE_PRESENT_INVALID) flush_tlb_kernel_range(start, start + size); return ret; } @@ -237,18 +243,18 @@ int set_memory_valid(unsigned long addr, int numpages, int enable) { if (enable) return __change_memory_common(addr, PAGE_SIZE * numpages, - __pgprot(PTE_VALID), - __pgprot(0)); + __pgprot(PTE_PRESENT_VALID_KERNEL), + __pgprot(PTE_PRESENT_INVALID)); else return __change_memory_common(addr, PAGE_SIZE * numpages, - __pgprot(0), - __pgprot(PTE_VALID)); + __pgprot(PTE_PRESENT_INVALID), + __pgprot(PTE_PRESENT_VALID_KERNEL)); } int set_direct_map_invalid_noflush(struct page *page) { - pgprot_t clear_mask = __pgprot(PTE_VALID); - pgprot_t set_mask = __pgprot(0); + pgprot_t clear_mask = __pgprot(PTE_PRESENT_VALID_KERNEL); + pgprot_t set_mask = __pgprot(PTE_PRESENT_INVALID); if (!can_set_direct_map()) return 0; @@ -259,8 +265,8 @@ int set_direct_map_invalid_noflush(struct page *page) int set_direct_map_default_noflush(struct page *page) { - pgprot_t set_mask = __pgprot(PTE_VALID | PTE_WRITE); - pgprot_t clear_mask = __pgprot(PTE_RDONLY); + pgprot_t set_mask = __pgprot(PTE_PRESENT_VALID_KERNEL | PTE_WRITE); + pgprot_t clear_mask = __pgprot(PTE_PRESENT_INVALID | PTE_RDONLY); if (!can_set_direct_map()) return 0; @@ -296,8 +302,8 @@ static int __set_memory_enc_dec(unsigned long addr, * entries or Synchronous External Aborts caused by RIPAS_EMPTY */ ret = __change_memory_common(addr, PAGE_SIZE * numpages, - __pgprot(set_prot), - __pgprot(clear_prot | PTE_VALID)); + __pgprot(set_prot | PTE_PRESENT_INVALID), + __pgprot(clear_prot | PTE_PRESENT_VALID_KERNEL)); if (ret) return ret; @@ -311,8 +317,8 @@ static int __set_memory_enc_dec(unsigned long addr, return ret; return __change_memory_common(addr, PAGE_SIZE * numpages, - __pgprot(PTE_VALID), - __pgprot(0)); + __pgprot(PTE_PRESENT_VALID_KERNEL), + __pgprot(PTE_PRESENT_INVALID)); } static int realm_set_memory_encrypted(unsigned long addr, int numpages) @@ -404,15 +410,15 @@ bool kernel_page_present(struct page *page) pud = READ_ONCE(*pudp); if (pud_none(pud)) return false; - if (pud_sect(pud)) - return true; + if (pud_leaf(pud)) + return pud_valid(pud); pmdp = pmd_offset(pudp, addr); pmd = READ_ONCE(*pmdp); if (pmd_none(pmd)) return false; - if (pmd_sect(pmd)) - return true; + if (pmd_leaf(pmd)) + return pmd_valid(pmd); ptep = pte_offset_kernel(pmdp, addr); return pte_valid(__ptep_get(ptep)); diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 18543b603c77..cca9706a875c 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -31,36 +31,6 @@ static void *trans_alloc(struct trans_pgd_info *info) return info->trans_alloc_page(info->trans_alloc_arg); } -static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) -{ - pte_t pte = __ptep_get(src_ptep); - - if (pte_valid(pte)) { - /* - * Resume will overwrite areas that may be marked - * read only (code, rodata). Clear the RDONLY bit from - * the temporary mappings we use during restore. - */ - __set_pte(dst_ptep, pte_mkwrite_novma(pte)); - } else if (!pte_none(pte)) { - /* - * debug_pagealloc will removed the PTE_VALID bit if - * the page isn't in use by the resume kernel. It may have - * been in use by the original kernel, in which case we need - * to put it back in our copy to do the restore. - * - * Other cases include kfence / vmalloc / memfd_secret which - * may call `set_direct_map_invalid_noflush()`. - * - * Before marking this entry valid, check the pfn should - * be mapped. - */ - BUG_ON(!pfn_valid(pte_pfn(pte))); - - __set_pte(dst_ptep, pte_mkvalid(pte_mkwrite_novma(pte))); - } -} - static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long start, unsigned long end) { @@ -76,7 +46,11 @@ static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp, src_ptep = pte_offset_kernel(src_pmdp, start); do { - _copy_pte(dst_ptep, src_ptep, addr); + pte_t pte = __ptep_get(src_ptep); + + if (pte_none(pte)) + continue; + __set_pte(dst_ptep, pte_mkvalid_k(pte_mkwrite_novma(pte))); } while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end); return 0; @@ -109,8 +83,7 @@ static int copy_pmd(struct trans_pgd_info *info, pud_t *dst_pudp, if (copy_pte(info, dst_pmdp, src_pmdp, addr, next)) return -ENOMEM; } else { - set_pmd(dst_pmdp, - __pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY)); + set_pmd(dst_pmdp, pmd_mkvalid_k(pmd_mkwrite_novma(pmd))); } } while (dst_pmdp++, src_pmdp++, addr = next, addr != end); @@ -145,8 +118,7 @@ static int copy_pud(struct trans_pgd_info *info, p4d_t *dst_p4dp, if (copy_pmd(info, dst_pudp, src_pudp, addr, next)) return -ENOMEM; } else { - set_pud(dst_pudp, - __pud(pud_val(pud) & ~PUD_SECT_RDONLY)); + set_pud(dst_pudp, pud_mkvalid_k(pud_mkwrite_novma(pud))); } } while (dst_pudp++, src_pudp++, addr = next, addr != end); From 1d37713fa83780f3f500fa4c4f6c43945dd17137 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 30 Mar 2026 17:17:04 +0100 Subject: [PATCH 087/109] arm64: mm: Remove pmd_sect() and pud_sect() The semantics of pXd_leaf() are very similar to pXd_sect(). The only difference is that pXd_sect() only considers it a section if PTE_VALID is set, whereas pXd_leaf() permits both "valid" and "present-invalid" types. Using pXd_sect() has caused issues now that large leaf entries can be present-invalid since commit a166563e7ec37 ("arm64: mm: support large block mapping when rodata=full"), so let's just remove the API and standardize on pXd_leaf(). There are a few callsites of the form pXd_leaf(READ_ONCE(*pXdp)). This was previously fine for the pXd_sect() macro because it only evaluated its argument once. But pXd_leaf() evaluates its argument multiple times. So let's avoid unintended side effects by reimplementing pXd_leaf() as an inline function. Signed-off-by: Ryan Roberts Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 19 ++++++++++++------- arch/arm64/mm/mmu.c | 18 +++++++++--------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index dd062179b9b6..5bc42b85acfc 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -784,9 +784,13 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, #define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_TABLE) -#define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ - PMD_TYPE_SECT) -#define pmd_leaf(pmd) (pmd_present(pmd) && !pmd_table(pmd)) + +#define pmd_leaf pmd_leaf +static inline bool pmd_leaf(pmd_t pmd) +{ + return pmd_present(pmd) && !pmd_table(pmd); +} + #define pmd_bad(pmd) (!pmd_table(pmd)) #define pmd_leaf_size(pmd) (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE) @@ -804,11 +808,8 @@ static inline int pmd_trans_huge(pmd_t pmd) #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3 -static inline bool pud_sect(pud_t pud) { return false; } static inline bool pud_table(pud_t pud) { return true; } #else -#define pud_sect(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \ - PUD_TYPE_SECT) #define pud_table(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \ PUD_TYPE_TABLE) #endif @@ -878,7 +879,11 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) PUD_TYPE_TABLE) #define pud_present(pud) pte_present(pud_pte(pud)) #ifndef __PAGETABLE_PMD_FOLDED -#define pud_leaf(pud) (pud_present(pud) && !pud_table(pud)) +#define pud_leaf pud_leaf +static inline bool pud_leaf(pud_t pud) +{ + return pud_present(pud) && !pud_table(pud); +} #else #define pud_leaf(pud) false #endif diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 1575680675d8..dcee56bb622a 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -204,7 +204,7 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, pmd_t pmd = READ_ONCE(*pmdp); pte_t *ptep; - BUG_ON(pmd_sect(pmd)); + BUG_ON(pmd_leaf(pmd)); if (pmd_none(pmd)) { pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF; phys_addr_t pte_phys; @@ -303,7 +303,7 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, /* * Check for initial section mappings in the pgd/pud. */ - BUG_ON(pud_sect(pud)); + BUG_ON(pud_leaf(pud)); if (pud_none(pud)) { pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF; phys_addr_t pmd_phys; @@ -1503,7 +1503,7 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr, continue; WARN_ON(!pmd_present(pmd)); - if (pmd_sect(pmd)) { + if (pmd_leaf(pmd)) { pmd_clear(pmdp); /* @@ -1536,7 +1536,7 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr, continue; WARN_ON(!pud_present(pud)); - if (pud_sect(pud)) { + if (pud_leaf(pud)) { pud_clear(pudp); /* @@ -1650,7 +1650,7 @@ static void free_empty_pmd_table(pud_t *pudp, unsigned long addr, if (pmd_none(pmd)) continue; - WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd)); + WARN_ON(!pmd_present(pmd) || !pmd_table(pmd)); free_empty_pte_table(pmdp, addr, next, floor, ceiling); } while (addr = next, addr < end); @@ -1690,7 +1690,7 @@ static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr, if (pud_none(pud)) continue; - WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud)); + WARN_ON(!pud_present(pud) || !pud_table(pud)); free_empty_pmd_table(pudp, addr, next, floor, ceiling); } while (addr = next, addr < end); @@ -1786,7 +1786,7 @@ int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, { vmemmap_verify((pte_t *)pmdp, node, addr, next); - return pmd_sect(READ_ONCE(*pmdp)); + return pmd_leaf(READ_ONCE(*pmdp)); } int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, @@ -1850,7 +1850,7 @@ void p4d_clear_huge(p4d_t *p4dp) int pud_clear_huge(pud_t *pudp) { - if (!pud_sect(READ_ONCE(*pudp))) + if (!pud_leaf(READ_ONCE(*pudp))) return 0; pud_clear(pudp); return 1; @@ -1858,7 +1858,7 @@ int pud_clear_huge(pud_t *pudp) int pmd_clear_huge(pmd_t *pmdp) { - if (!pmd_sect(READ_ONCE(*pmdp))) + if (!pmd_leaf(READ_ONCE(*pmdp))) return 0; pmd_clear(pmdp); return 1; From 85b6f920a8691d96441da9da7fc55ec93b906fe6 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 6 Mar 2026 17:00:53 +0000 Subject: [PATCH 088/109] arm64/sysreg: Update SMIDR_EL1 to DDI0601 2025-06 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the definition of SMIDR_EL1 in the sysreg definition to reflect the information in DD0601 2025-06. This includes somewhat more generic ways of describing the sharing of SMCUs, more information on supported priorities and provides additional resolution for describing affinity groups. Reviewed-by: Fuad Tabba Signed-off-by: Mark Brown Acked-by: Catalin Marinas Reviewed-by: Alex Bennée Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 9d1c21108057..b6586accf344 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -3655,11 +3655,15 @@ Field 3:0 BS EndSysreg Sysreg SMIDR_EL1 3 1 0 0 6 -Res0 63:32 +Res0 63:60 +Field 59:56 NSMC +Field 55:52 HIP +Field 51:32 AFFINITY2 Field 31:24 IMPLEMENTER Field 23:16 REVISION Field 15 SMPS -Res0 14:12 +Field 14:13 SH +Res0 12 Field 11:0 AFFINITY EndSysreg From 1f0d117cd6ca8e74e70e415e89b059fce37674c6 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 7 Apr 2026 14:16:41 +0100 Subject: [PATCH 089/109] entry: Fix stale comment for irqentry_enter() The kerneldoc comment for irqentry_enter() refers to idtentry_exit(), which is an accidental holdover from the x86 entry code that the generic irqentry code was based on. Correct this to refer to irqentry_exit(). Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Reviewed-by: Jinjie Ruan Acked-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260407131650.3813777-2-mark.rutland@arm.com --- include/linux/irq-entry-common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index d26d1b1bcbfb..3cf4d21168ba 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -394,7 +394,7 @@ typedef struct irqentry_state { * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit * would not be possible. * - * Returns: An opaque object that must be passed to idtentry_exit() + * Returns: An opaque object that must be passed to irqentry_exit() */ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs); From 22f66e7ef4ce9414b4bd18abe50ead4a1284b01a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 7 Apr 2026 14:16:42 +0100 Subject: [PATCH 090/109] entry: Remove local_irq_{enable,disable}_exit_to_user() local_irq_enable_exit_to_user() and local_irq_disable_exit_to_user() are never overridden by architecture code, and are always equivalent to local_irq_enable() and local_irq_disable(). These functions were added on the assumption that arm64 would override them to manage 'DAIF' exception masking, as described by Thomas Gleixner in these threads: https://lore.kernel.org/all/20190919150809.340471236@linutronix.de/ https://lore.kernel.org/all/alpine.DEB.2.21.1910240119090.1852@nanos.tec.linutronix.de/ In practice arm64 did not need to override either. Prior to moving to the generic irqentry code, arm64's management of DAIF was reworked in commit: 97d935faacde ("arm64: Unmask Debug + SError in do_notify_resume()") Since that commit, arm64 only masks interrupts during the 'prepare' step when returning to user mode, and masks other DAIF exceptions later. Within arm64_exit_to_user_mode(), the arm64 entry code is as follows: local_irq_disable(); exit_to_user_mode_prepare_legacy(regs); local_daif_mask(); mte_check_tfsr_exit(); exit_to_user_mode(); Remove the unnecessary local_irq_enable_exit_to_user() and local_irq_disable_exit_to_user() functions. Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Reviewed-by: Jinjie Ruan Acked-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260407131650.3813777-3-mark.rutland@arm.com --- include/linux/entry-common.h | 2 +- include/linux/irq-entry-common.h | 31 ------------------------------- kernel/entry/common.c | 4 ++-- 3 files changed, 3 insertions(+), 34 deletions(-) diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index f83ca0abf2cd..dbaa153100f4 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -321,7 +321,7 @@ static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs) { instrumentation_begin(); syscall_exit_to_user_mode_work(regs); - local_irq_disable_exit_to_user(); + local_irq_disable(); syscall_exit_to_user_mode_prepare(regs); instrumentation_end(); exit_to_user_mode(); diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 3cf4d21168ba..93b4b551f7ae 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -100,37 +100,6 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs) instrumentation_end(); } -/** - * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable() - * @ti_work: Cached TIF flags gathered with interrupts disabled - * - * Defaults to local_irq_enable(). Can be supplied by architecture specific - * code. - */ -static inline void local_irq_enable_exit_to_user(unsigned long ti_work); - -#ifndef local_irq_enable_exit_to_user -static __always_inline void local_irq_enable_exit_to_user(unsigned long ti_work) -{ - local_irq_enable(); -} -#endif - -/** - * local_irq_disable_exit_to_user - Exit to user variant of local_irq_disable() - * - * Defaults to local_irq_disable(). Can be supplied by architecture specific - * code. - */ -static inline void local_irq_disable_exit_to_user(void); - -#ifndef local_irq_disable_exit_to_user -static __always_inline void local_irq_disable_exit_to_user(void) -{ - local_irq_disable(); -} -#endif - /** * arch_exit_to_user_mode_work - Architecture specific TIF work for exit * to user mode. diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 9ef63e414791..b5e05d87ba39 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -47,7 +47,7 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re */ while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) { - local_irq_enable_exit_to_user(ti_work); + local_irq_enable(); if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) { if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY)) @@ -74,7 +74,7 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re * might have changed while interrupts and preemption was * enabled above. */ - local_irq_disable_exit_to_user(); + local_irq_disable(); /* Check if any of the above work has queued a deferred wakeup */ tick_nohz_user_enter_prepare(); From eb1b51afde506a8e38976190e518990d69ef5382 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 7 Apr 2026 14:16:43 +0100 Subject: [PATCH 091/109] entry: Move irqentry_enter() prototype later Subsequent patches will rework the irqentry_*() functions. The end result (and the intermediate diffs) will be much clearer if the prototype for the irqentry_enter() function is moved later, immediately before the prototype of the irqentry_exit() function. Move the prototype later. This is purely a move; there should be no functional change as a result of this change. Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Reviewed-by: Jinjie Ruan Acked-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260407131650.3813777-4-mark.rutland@arm.com --- include/linux/irq-entry-common.h | 44 ++++++++++++++++---------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 93b4b551f7ae..d1e8591a5919 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -334,6 +334,28 @@ typedef struct irqentry_state { } irqentry_state_t; #endif +/** + * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt + * + * Conditional reschedule with additional sanity checks. + */ +void raw_irqentry_exit_cond_resched(void); + +#ifdef CONFIG_PREEMPT_DYNAMIC +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched +#define irqentry_exit_cond_resched_dynamic_disabled NULL +DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched); +#define irqentry_exit_cond_resched() static_call(irqentry_exit_cond_resched)() +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); +void dynamic_irqentry_exit_cond_resched(void); +#define irqentry_exit_cond_resched() dynamic_irqentry_exit_cond_resched() +#endif +#else /* CONFIG_PREEMPT_DYNAMIC */ +#define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched() +#endif /* CONFIG_PREEMPT_DYNAMIC */ + /** * irqentry_enter - Handle state tracking on ordinary interrupt entries * @regs: Pointer to pt_regs of interrupted context @@ -367,28 +389,6 @@ typedef struct irqentry_state { */ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs); -/** - * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt - * - * Conditional reschedule with additional sanity checks. - */ -void raw_irqentry_exit_cond_resched(void); - -#ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched -#define irqentry_exit_cond_resched_dynamic_disabled NULL -DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched); -#define irqentry_exit_cond_resched() static_call(irqentry_exit_cond_resched)() -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); -void dynamic_irqentry_exit_cond_resched(void); -#define irqentry_exit_cond_resched() dynamic_irqentry_exit_cond_resched() -#endif -#else /* CONFIG_PREEMPT_DYNAMIC */ -#define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched() -#endif /* CONFIG_PREEMPT_DYNAMIC */ - /** * irqentry_exit - Handle return from exception that used irqentry_enter() * @regs: Pointer to pt_regs (exception entry regs) From c5538d0141b383808f440186fcd0bc2799af2853 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 7 Apr 2026 14:16:44 +0100 Subject: [PATCH 092/109] entry: Split kernel mode logic from irqentry_{enter,exit}() The generic irqentry code has entry/exit functions specifically for exceptions taken from user mode, but doesn't have entry/exit functions specifically for exceptions taken from kernel mode. It would be helpful to have separate entry/exit functions specifically for exceptions taken from kernel mode. This would make the structure of the entry code more consistent, and would make it easier for architectures to manage logic specific to exceptions taken from kernel mode. Move the logic specific to kernel mode out of irqentry_enter() and irqentry_exit() into new irqentry_enter_from_kernel_mode() and irqentry_exit_to_kernel_mode() functions. These are marked __always_inline and placed in irq-entry-common.h, as with irqentry_enter_from_user_mode() and irqentry_exit_to_user_mode(), so that they can be inlined into architecture-specific wrappers. The existing out-of-line irqentry_enter() and irqentry_exit() functions retained as callers of the new functions. The lockdep assertion from irqentry_exit() is moved into irqentry_exit_to_user_mode() and irqentry_exit_to_kernel_mode(). This was previously missing from irqentry_exit_to_user_mode() when called directly, and any new lockdep assertion failure relating from this change is a latent bug. Aside from the lockdep change noted above, there should be no functional change as a result of this change. [ tglx: Updated kernel doc ] Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Reviewed-by: Jinjie Ruan Acked-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260407131650.3813777-5-mark.rutland@arm.com --- include/linux/irq-entry-common.h | 134 +++++++++++++++++++++++++++++++ kernel/entry/common.c | 103 ++---------------------- 2 files changed, 142 insertions(+), 95 deletions(-) diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index d1e8591a5919..66bc168bc6b5 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -304,6 +304,8 @@ static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs) */ static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs) { + lockdep_assert_irqs_disabled(); + instrumentation_begin(); irqentry_exit_to_user_mode_prepare(regs); instrumentation_end(); @@ -356,6 +358,138 @@ void dynamic_irqentry_exit_cond_resched(void); #define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched() #endif /* CONFIG_PREEMPT_DYNAMIC */ +/** + * irqentry_enter_from_kernel_mode - Establish state before invoking the irq handler + * @regs: Pointer to currents pt_regs + * + * Invoked from architecture specific entry code with interrupts disabled. + * Can only be called when the interrupt entry came from kernel mode. The + * calling code must be non-instrumentable. When the function returns all + * state is correct and the subsequent functions can be instrumented. + * + * The function establishes state (lockdep, RCU (context tracking), tracing) and + * is provided for architectures which require a strict split between entry from + * kernel and user mode and therefore cannot use irqentry_enter() which handles + * both entry modes. + * + * Returns: An opaque object that must be passed to irqentry_exit_to_kernel_mode(). + */ +static __always_inline irqentry_state_t irqentry_enter_from_kernel_mode(struct pt_regs *regs) +{ + irqentry_state_t ret = { + .exit_rcu = false, + }; + + /* + * If this entry hit the idle task invoke ct_irq_enter() whether + * RCU is watching or not. + * + * Interrupts can nest when the first interrupt invokes softirq + * processing on return which enables interrupts. + * + * Scheduler ticks in the idle task can mark quiescent state and + * terminate a grace period, if and only if the timer interrupt is + * not nested into another interrupt. + * + * Checking for rcu_is_watching() here would prevent the nesting + * interrupt to invoke ct_irq_enter(). If that nested interrupt is + * the tick then rcu_flavor_sched_clock_irq() would wrongfully + * assume that it is the first interrupt and eventually claim + * quiescent state and end grace periods prematurely. + * + * Unconditionally invoke ct_irq_enter() so RCU state stays + * consistent. + * + * TINY_RCU does not support EQS, so let the compiler eliminate + * this part when enabled. + */ + if (!IS_ENABLED(CONFIG_TINY_RCU) && + (is_idle_task(current) || arch_in_rcu_eqs())) { + /* + * If RCU is not watching then the same careful + * sequence vs. lockdep and tracing is required + * as in irqentry_enter_from_user_mode(). + */ + lockdep_hardirqs_off(CALLER_ADDR0); + ct_irq_enter(); + instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); + trace_hardirqs_off_finish(); + instrumentation_end(); + + ret.exit_rcu = true; + return ret; + } + + /* + * If RCU is watching then RCU only wants to check whether it needs + * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick() + * already contains a warning when RCU is not watching, so no point + * in having another one here. + */ + lockdep_hardirqs_off(CALLER_ADDR0); + instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); + rcu_irq_enter_check_tick(); + trace_hardirqs_off_finish(); + instrumentation_end(); + + return ret; +} + +/** + * irqentry_exit_to_kernel_mode - Run preempt checks and establish state after + * invoking the interrupt handler + * @regs: Pointer to current's pt_regs + * @state: Return value from matching call to irqentry_enter_from_kernel_mode() + * + * This is the counterpart of irqentry_enter_from_kernel_mode() and runs the + * necessary preemption check if possible and required. It returns to the caller + * with interrupts disabled and the correct state vs. tracing, lockdep and RCU + * required to return to the interrupted context. + * + * It is the last action before returning to the low level ASM code which just + * needs to return. + */ +static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs, + irqentry_state_t state) +{ + lockdep_assert_irqs_disabled(); + + if (!regs_irqs_disabled(regs)) { + /* + * If RCU was not watching on entry this needs to be done + * carefully and needs the same ordering of lockdep/tracing + * and RCU as the return to user mode path. + */ + if (state.exit_rcu) { + instrumentation_begin(); + /* Tell the tracer that IRET will enable interrupts */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(); + instrumentation_end(); + ct_irq_exit(); + lockdep_hardirqs_on(CALLER_ADDR0); + return; + } + + instrumentation_begin(); + if (IS_ENABLED(CONFIG_PREEMPTION)) + irqentry_exit_cond_resched(); + + /* Covers both tracing and lockdep */ + trace_hardirqs_on(); + instrumentation_end(); + } else { + /* + * IRQ flags state is correct already. Just tell RCU if it + * was not watching on entry. + */ + if (state.exit_rcu) + ct_irq_exit(); + } +} + /** * irqentry_enter - Handle state tracking on ordinary interrupt entries * @regs: Pointer to pt_regs of interrupted context diff --git a/kernel/entry/common.c b/kernel/entry/common.c index b5e05d87ba39..1034be02eae8 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -105,70 +105,16 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) { - irqentry_state_t ret = { - .exit_rcu = false, - }; - if (user_mode(regs)) { + irqentry_state_t ret = { + .exit_rcu = false, + }; + irqentry_enter_from_user_mode(regs); return ret; } - /* - * If this entry hit the idle task invoke ct_irq_enter() whether - * RCU is watching or not. - * - * Interrupts can nest when the first interrupt invokes softirq - * processing on return which enables interrupts. - * - * Scheduler ticks in the idle task can mark quiescent state and - * terminate a grace period, if and only if the timer interrupt is - * not nested into another interrupt. - * - * Checking for rcu_is_watching() here would prevent the nesting - * interrupt to invoke ct_irq_enter(). If that nested interrupt is - * the tick then rcu_flavor_sched_clock_irq() would wrongfully - * assume that it is the first interrupt and eventually claim - * quiescent state and end grace periods prematurely. - * - * Unconditionally invoke ct_irq_enter() so RCU state stays - * consistent. - * - * TINY_RCU does not support EQS, so let the compiler eliminate - * this part when enabled. - */ - if (!IS_ENABLED(CONFIG_TINY_RCU) && - (is_idle_task(current) || arch_in_rcu_eqs())) { - /* - * If RCU is not watching then the same careful - * sequence vs. lockdep and tracing is required - * as in irqentry_enter_from_user_mode(). - */ - lockdep_hardirqs_off(CALLER_ADDR0); - ct_irq_enter(); - instrumentation_begin(); - kmsan_unpoison_entry_regs(regs); - trace_hardirqs_off_finish(); - instrumentation_end(); - - ret.exit_rcu = true; - return ret; - } - - /* - * If RCU is watching then RCU only wants to check whether it needs - * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick() - * already contains a warning when RCU is not watching, so no point - * in having another one here. - */ - lockdep_hardirqs_off(CALLER_ADDR0); - instrumentation_begin(); - kmsan_unpoison_entry_regs(regs); - rcu_irq_enter_check_tick(); - trace_hardirqs_off_finish(); - instrumentation_end(); - - return ret; + return irqentry_enter_from_kernel_mode(regs); } /** @@ -212,43 +158,10 @@ void dynamic_irqentry_exit_cond_resched(void) noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) { - lockdep_assert_irqs_disabled(); - - /* Check whether this returns to user mode */ - if (user_mode(regs)) { + if (user_mode(regs)) irqentry_exit_to_user_mode(regs); - } else if (!regs_irqs_disabled(regs)) { - /* - * If RCU was not watching on entry this needs to be done - * carefully and needs the same ordering of lockdep/tracing - * and RCU as the return to user mode path. - */ - if (state.exit_rcu) { - instrumentation_begin(); - /* Tell the tracer that IRET will enable interrupts */ - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - instrumentation_end(); - ct_irq_exit(); - lockdep_hardirqs_on(CALLER_ADDR0); - return; - } - - instrumentation_begin(); - if (IS_ENABLED(CONFIG_PREEMPTION)) - irqentry_exit_cond_resched(); - - /* Covers both tracing and lockdep */ - trace_hardirqs_on(); - instrumentation_end(); - } else { - /* - * IRQ flags state is correct already. Just tell RCU if it - * was not watching on entry. - */ - if (state.exit_rcu) - ct_irq_exit(); - } + else + irqentry_exit_to_kernel_mode(regs, state); } irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs) From 041aa7a85390c99b1de86dc28eddcff0890d8186 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 7 Apr 2026 14:16:45 +0100 Subject: [PATCH 093/109] entry: Split preemption from irqentry_exit_to_kernel_mode() Some architecture-specific work needs to be performed between the state management for exception entry/exit and the "real" work to handle the exception. For example, arm64 needs to manipulate a number of exception masking bits, with different exceptions requiring different masking. Generally this can all be hidden in the architecture code, but for arm64 the current structure of irqentry_exit_to_kernel_mode() makes this particularly difficult to handle in a way that is correct, maintainable, and efficient. The gory details are described in the thread surrounding: https://lore.kernel.org/lkml/acPAzdtjK5w-rNqC@J2N7QTR9R3/ The summary is: * Currently, irqentry_exit_to_kernel_mode() handles both involuntary preemption AND state management necessary for exception return. * When scheduling (including involuntary preemption), arm64 needs to have all arm64-specific exceptions unmasked, though regular interrupts must be masked. * Prior to the state management for exception return, arm64 needs to mask a number of arm64-specific exceptions, and perform some work with these exceptions masked (with RCU watching, etc). While in theory it is possible to handle this with a new arch_*() hook called somewhere under irqentry_exit_to_kernel_mode(), this is fragile and complicated, and doesn't match the flow used for exception return to user mode, which has a separate 'prepare' step (where preemption can occur) prior to the state management. To solve this, refactor irqentry_exit_to_kernel_mode() to match the style of {irqentry,syscall}_exit_to_user_mode(), moving preemption logic into a new irqentry_exit_to_kernel_mode_preempt() function, and moving state management in a new irqentry_exit_to_kernel_mode_after_preempt() function. The existing irqentry_exit_to_kernel_mode() is left as a caller of both of these, avoiding the need to modify existing callers. There should be no functional change as a result of this change. [ tglx: Updated kernel doc ] Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Reviewed-by: Jinjie Ruan Acked-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260407131650.3813777-6-mark.rutland@arm.com --- include/linux/irq-entry-common.h | 73 ++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 14 deletions(-) diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 66bc168bc6b5..384520217bfe 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -438,24 +438,46 @@ static __always_inline irqentry_state_t irqentry_enter_from_kernel_mode(struct p } /** - * irqentry_exit_to_kernel_mode - Run preempt checks and establish state after - * invoking the interrupt handler + * irqentry_exit_to_kernel_mode_preempt - Run preempt checks on return to kernel mode * @regs: Pointer to current's pt_regs * @state: Return value from matching call to irqentry_enter_from_kernel_mode() * - * This is the counterpart of irqentry_enter_from_kernel_mode() and runs the - * necessary preemption check if possible and required. It returns to the caller - * with interrupts disabled and the correct state vs. tracing, lockdep and RCU - * required to return to the interrupted context. + * This is to be invoked before irqentry_exit_to_kernel_mode_after_preempt() to + * allow kernel preemption on return from interrupt. * - * It is the last action before returning to the low level ASM code which just - * needs to return. + * Must be invoked with interrupts disabled and CPU state which allows kernel + * preemption. + * + * After returning from this function, the caller can modify CPU state before + * invoking irqentry_exit_to_kernel_mode_after_preempt(), which is required to + * re-establish the tracing, lockdep and RCU state for returning to the + * interrupted context. */ -static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs, - irqentry_state_t state) +static inline void irqentry_exit_to_kernel_mode_preempt(struct pt_regs *regs, + irqentry_state_t state) { - lockdep_assert_irqs_disabled(); + if (regs_irqs_disabled(regs) || state.exit_rcu) + return; + if (IS_ENABLED(CONFIG_PREEMPTION)) + irqentry_exit_cond_resched(); +} + +/** + * irqentry_exit_to_kernel_mode_after_preempt - Establish trace, lockdep and RCU state + * @regs: Pointer to current's pt_regs + * @state: Return value from matching call to irqentry_enter_from_kernel_mode() + * + * This is to be invoked after irqentry_exit_to_kernel_mode_preempt() and before + * actually returning to the interrupted context. + * + * There are no requirements for the CPU state other than being able to complete + * the tracing, lockdep and RCU state transitions. After this function returns + * the caller must return directly to the interrupted context. + */ +static __always_inline void +irqentry_exit_to_kernel_mode_after_preempt(struct pt_regs *regs, irqentry_state_t state) +{ if (!regs_irqs_disabled(regs)) { /* * If RCU was not watching on entry this needs to be done @@ -474,9 +496,6 @@ static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs, } instrumentation_begin(); - if (IS_ENABLED(CONFIG_PREEMPTION)) - irqentry_exit_cond_resched(); - /* Covers both tracing and lockdep */ trace_hardirqs_on(); instrumentation_end(); @@ -490,6 +509,32 @@ static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs, } } +/** + * irqentry_exit_to_kernel_mode - Run preempt checks and establish state after + * invoking the interrupt handler + * @regs: Pointer to current's pt_regs + * @state: Return value from matching call to irqentry_enter_from_kernel_mode() + * + * This is the counterpart of irqentry_enter_from_kernel_mode() and combines + * the calls to irqentry_exit_to_kernel_mode_preempt() and + * irqentry_exit_to_kernel_mode_after_preempt(). + * + * The requirement for the CPU state is that it can schedule. After the function + * returns the tracing, lockdep and RCU state transitions are completed and the + * caller must return directly to the interrupted context. + */ +static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs, + irqentry_state_t state) +{ + lockdep_assert_irqs_disabled(); + + instrumentation_begin(); + irqentry_exit_to_kernel_mode_preempt(regs, state); + instrumentation_end(); + + irqentry_exit_to_kernel_mode_after_preempt(regs, state); +} + /** * irqentry_enter - Handle state tracking on ordinary interrupt entries * @regs: Pointer to pt_regs of interrupted context From 2371bd83b3df9d833191fe58dadb0e69a794a1cd Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 7 Apr 2026 14:16:46 +0100 Subject: [PATCH 094/109] arm64: entry: Don't preempt with SError or Debug masked On arm64, involuntary kernel preemption has been subtly broken since the move to the generic irqentry code. When preemption occurs, the new task may run with SError and Debug exceptions masked unexpectedly, leading to a loss of RAS events, breakpoints, watchpoints, and single-step exceptions. Prior to moving to the generic irqentry code, involuntary preemption of kernel mode would only occur when returning from regular interrupts, in a state where interrupts were masked and all other arm64-specific exceptions (SError, Debug, and pseudo-NMI) were unmasked. This is the only state in which it is valid to switch tasks. As part of moving to the generic irqentry code, the involuntary preemption logic was moved such that involuntary preemption could occur when returning from any (non-NMI) exception. As most exception handlers mask all arm64-specific exceptions before this point, preemption could occur in a state where arm64-specific exceptions were masked. This is not a valid state to switch tasks, and resulted in the loss of exceptions described above. As a temporary bodge, avoid the loss of exceptions by avoiding involuntary preemption when SError and/or Debug exceptions are masked. Practically speaking this means that involuntary preemption will only occur when returning from regular interrupts, as was the case before moving to the generic irqentry code. Fixes: 99eb057ccd67 ("arm64: entry: Move arm64_preempt_schedule_irq() into __exit_to_kernel_mode()") Reported-by: Ada Couprie Diaz Reported-by: Vladimir Murzin Signed-off-by: Mark Rutland Cc: Andy Lutomirski Cc: Jinjie Ruan Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Reviewed-by: Jinjie Ruan Acked-by: Peter Zijlstra (Intel) Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/entry-common.h | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/entry-common.h b/arch/arm64/include/asm/entry-common.h index cab8cd78f693..20f0a7c7bde1 100644 --- a/arch/arm64/include/asm/entry-common.h +++ b/arch/arm64/include/asm/entry-common.h @@ -29,14 +29,19 @@ static __always_inline void arch_exit_to_user_mode_work(struct pt_regs *regs, static inline bool arch_irqentry_exit_need_resched(void) { - /* - * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC - * priority masking is used the GIC irqchip driver will clear DAIF.IF - * using gic_arch_enable_irqs() for normal IRQs. If anything is set in - * DAIF we must have handled an NMI, so skip preemption. - */ - if (system_uses_irq_prio_masking() && read_sysreg(daif)) - return false; + if (system_uses_irq_prio_masking()) { + /* + * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC + * priority masking is used the GIC irqchip driver will clear DAIF.IF + * using gic_arch_enable_irqs() for normal IRQs. If anything is set in + * DAIF we must have handled an NMI, so skip preemption. + */ + if (read_sysreg(daif)) + return false; + } else { + if (read_sysreg(daif) & (PSR_D_BIT | PSR_A_BIT)) + return false; + } /* * Preempting a task from an IRQ means we leave copies of PSTATE From 6879ef130223f015c9a5a8a0d14d3f6d0464fa21 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 7 Apr 2026 14:16:47 +0100 Subject: [PATCH 095/109] arm64: entry: Consistently prefix arm64-specific wrappers For historical reasons, arm64's entry code has arm64-specific functions named enter_from_kernel_mode() and exit_to_kernel_mode(), which are wrappers for similarly-named functions from the generic irqentry code. Other arm64-specific wrappers have an 'arm64_' prefix to clearly distinguish them from their generic counterparts, e.g. arm64_enter_from_user_mode() and arm64_exit_to_user_mode(). For consistency and clarity, add an 'arm64_' prefix to these functions. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Andy Lutomirski Cc: Jinjie Ruan Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vladimir Murzin Cc: Will Deacon Reviewed-by: Jinjie Ruan Acked-by: Peter Zijlstra (Intel) Signed-off-by: Catalin Marinas --- arch/arm64/kernel/entry-common.c | 38 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 3625797e9ee8..3d01cdacdc7a 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -35,7 +35,7 @@ * Before this function is called it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. */ -static noinstr irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs) +static noinstr irqentry_state_t arm64_enter_from_kernel_mode(struct pt_regs *regs) { irqentry_state_t state; @@ -51,8 +51,8 @@ static noinstr irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs) * After this function returns it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. */ -static void noinstr exit_to_kernel_mode(struct pt_regs *regs, - irqentry_state_t state) +static void noinstr arm64_exit_to_kernel_mode(struct pt_regs *regs, + irqentry_state_t state) { mte_check_tfsr_exit(); irqentry_exit(regs, state); @@ -298,11 +298,11 @@ static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) unsigned long far = read_sysreg(far_el1); irqentry_state_t state; - state = enter_from_kernel_mode(regs); + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_mem_abort(far, esr, regs); local_daif_mask(); - exit_to_kernel_mode(regs, state); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr) @@ -310,55 +310,55 @@ static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr) unsigned long far = read_sysreg(far_el1); irqentry_state_t state; - state = enter_from_kernel_mode(regs); + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_sp_pc_abort(far, esr, regs); local_daif_mask(); - exit_to_kernel_mode(regs, state); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr) { irqentry_state_t state; - state = enter_from_kernel_mode(regs); + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_undef(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs, state); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) { irqentry_state_t state; - state = enter_from_kernel_mode(regs); + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_bti(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs, state); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr) { irqentry_state_t state; - state = enter_from_kernel_mode(regs); + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_gcs(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs, state); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr) { irqentry_state_t state; - state = enter_from_kernel_mode(regs); + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_mops(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs, state); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_breakpt(struct pt_regs *regs, unsigned long esr) @@ -420,11 +420,11 @@ static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr) { irqentry_state_t state; - state = enter_from_kernel_mode(regs); + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_fpac(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs, state); + arm64_exit_to_kernel_mode(regs, state); } asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) @@ -491,13 +491,13 @@ static __always_inline void __el1_irq(struct pt_regs *regs, { irqentry_state_t state; - state = enter_from_kernel_mode(regs); + state = arm64_enter_from_kernel_mode(regs); irq_enter_rcu(); do_interrupt_handler(regs, handler); irq_exit_rcu(); - exit_to_kernel_mode(regs, state); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_interrupt(struct pt_regs *regs, void (*handler)(struct pt_regs *)) From a07b7b214240e1bf3de7067f2f43d88aa8e50c28 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 7 Apr 2026 14:16:48 +0100 Subject: [PATCH 096/109] arm64: entry: Use irqentry_{enter_from,exit_to}_kernel_mode() The generic irqentry code now provides irqentry_enter_from_kernel_mode() and irqentry_exit_to_kernel_mode(), which can be used when an exception is known to be taken from kernel mode. These can be inlined into architecture-specific entry code, and avoid redundant work to test whether the exception was taken from user mode. Use these in arm64_enter_from_kernel_mode() and arm64_exit_to_kernel_mode(), which are only used for exceptions known to be taken from kernel mode. This will remove a small amount of redundant work, and will permit further changes to arm64_exit_to_kernel_mode() in subsequent patches. There should be no funcitonal change as a result of this patch. Signed-off-by: Mark Rutland Cc: Andy Lutomirski Cc: Jinjie Ruan Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vladimir Murzin Cc: Will Deacon Reviewed-by: Jinjie Ruan Acked-by: Peter Zijlstra (Intel) Signed-off-by: Catalin Marinas --- arch/arm64/kernel/entry-common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 3d01cdacdc7a..16a65987a6a9 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -39,7 +39,7 @@ static noinstr irqentry_state_t arm64_enter_from_kernel_mode(struct pt_regs *reg { irqentry_state_t state; - state = irqentry_enter(regs); + state = irqentry_enter_from_kernel_mode(regs); mte_check_tfsr_entry(); mte_disable_tco_entry(current); @@ -55,7 +55,7 @@ static void noinstr arm64_exit_to_kernel_mode(struct pt_regs *regs, irqentry_state_t state) { mte_check_tfsr_exit(); - irqentry_exit(regs, state); + irqentry_exit_to_kernel_mode(regs, state); } /* From ae654112eac05f316ef31587fc55e4d7160d0086 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 7 Apr 2026 14:16:49 +0100 Subject: [PATCH 097/109] arm64: entry: Use split preemption logic The generic irqentry code now provides irqentry_exit_to_kernel_mode_preempt() and irqentry_exit_to_kernel_mode_after_preempt(), which can be used where architectures have different state requirements for involuntary preemption and exception return, as is the case on arm64. Use the new functions on arm64, aligning our exit to kernel mode logic with the style of our exit to user mode logic. This removes the need for the recently-added bodge in arch_irqentry_exit_need_resched(), and allows preemption to occur when returning from any exception taken from kernel mode, which is nicer for RT. In an ideal world, we'd remove arch_irqentry_exit_need_resched(), and fold the conditionality directly into the architecture-specific entry code. That way all the logic necessary to avoid preempting from a pseudo-NMI could be constrained specifically to the EL1 IRQ/FIQ paths, avoiding redundant work for other exceptions, and making the flow a bit clearer. At present it looks like that would require a larger refactoring (e.g. for the PREEMPT_DYNAMIC logic), and so I've left that as-is for now. Signed-off-by: Mark Rutland Cc: Andy Lutomirski Cc: Jinjie Ruan Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vladimir Murzin Cc: Will Deacon Reviewed-by: Jinjie Ruan Acked-by: Peter Zijlstra (Intel) Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/entry-common.h | 21 ++++++++------------- arch/arm64/kernel/entry-common.c | 12 ++++-------- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/arch/arm64/include/asm/entry-common.h b/arch/arm64/include/asm/entry-common.h index 20f0a7c7bde1..cab8cd78f693 100644 --- a/arch/arm64/include/asm/entry-common.h +++ b/arch/arm64/include/asm/entry-common.h @@ -29,19 +29,14 @@ static __always_inline void arch_exit_to_user_mode_work(struct pt_regs *regs, static inline bool arch_irqentry_exit_need_resched(void) { - if (system_uses_irq_prio_masking()) { - /* - * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC - * priority masking is used the GIC irqchip driver will clear DAIF.IF - * using gic_arch_enable_irqs() for normal IRQs. If anything is set in - * DAIF we must have handled an NMI, so skip preemption. - */ - if (read_sysreg(daif)) - return false; - } else { - if (read_sysreg(daif) & (PSR_D_BIT | PSR_A_BIT)) - return false; - } + /* + * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC + * priority masking is used the GIC irqchip driver will clear DAIF.IF + * using gic_arch_enable_irqs() for normal IRQs. If anything is set in + * DAIF we must have handled an NMI, so skip preemption. + */ + if (system_uses_irq_prio_masking() && read_sysreg(daif)) + return false; /* * Preempting a task from an IRQ means we leave copies of PSTATE diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 16a65987a6a9..f42ce7b5c67f 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -54,8 +54,11 @@ static noinstr irqentry_state_t arm64_enter_from_kernel_mode(struct pt_regs *reg static void noinstr arm64_exit_to_kernel_mode(struct pt_regs *regs, irqentry_state_t state) { + local_irq_disable(); + irqentry_exit_to_kernel_mode_preempt(regs, state); + local_daif_mask(); mte_check_tfsr_exit(); - irqentry_exit_to_kernel_mode(regs, state); + irqentry_exit_to_kernel_mode_after_preempt(regs, state); } /* @@ -301,7 +304,6 @@ static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_mem_abort(far, esr, regs); - local_daif_mask(); arm64_exit_to_kernel_mode(regs, state); } @@ -313,7 +315,6 @@ static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr) state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_sp_pc_abort(far, esr, regs); - local_daif_mask(); arm64_exit_to_kernel_mode(regs, state); } @@ -324,7 +325,6 @@ static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr) state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_undef(regs, esr); - local_daif_mask(); arm64_exit_to_kernel_mode(regs, state); } @@ -335,7 +335,6 @@ static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_bti(regs, esr); - local_daif_mask(); arm64_exit_to_kernel_mode(regs, state); } @@ -346,7 +345,6 @@ static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr) state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_gcs(regs, esr); - local_daif_mask(); arm64_exit_to_kernel_mode(regs, state); } @@ -357,7 +355,6 @@ static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr) state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_mops(regs, esr); - local_daif_mask(); arm64_exit_to_kernel_mode(regs, state); } @@ -423,7 +420,6 @@ static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr) state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_fpac(regs, esr); - local_daif_mask(); arm64_exit_to_kernel_mode(regs, state); } From 8d13386c7624dc8bd3caad483875fb9be4044ea0 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 7 Apr 2026 14:16:50 +0100 Subject: [PATCH 098/109] arm64: Check DAIF (and PMR) at task-switch time When __switch_to() switches from a 'prev' task to a 'next' task, various pieces of CPU state are expected to have specific values, such that these do not need to be saved/restored. If any of these hold an unexpected value when switching away from the prev task, they could lead to surprising behaviour in the context of the next task, and it would be difficult to determine where they were configured to their unexpected value. Add some checks for DAIF and PMR at task-switch time so that we can detect such issues. Signed-off-by: Mark Rutland Cc: Andy Lutomirski Cc: Jinjie Ruan Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vladimir Murzin Cc: Will Deacon Acked-by: Peter Zijlstra (Intel) Signed-off-by: Catalin Marinas --- arch/arm64/kernel/process.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 489554931231..ba9038434d2f 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -699,6 +699,29 @@ void update_sctlr_el1(u64 sctlr) isb(); } +static inline void debug_switch_state(void) +{ + if (system_uses_irq_prio_masking()) { + unsigned long daif_expected = 0; + unsigned long daif_actual = read_sysreg(daif); + unsigned long pmr_expected = GIC_PRIO_IRQOFF; + unsigned long pmr_actual = read_sysreg_s(SYS_ICC_PMR_EL1); + + WARN_ONCE(daif_actual != daif_expected || + pmr_actual != pmr_expected, + "Unexpected DAIF + PMR: 0x%lx + 0x%lx (expected 0x%lx + 0x%lx)\n", + daif_actual, pmr_actual, + daif_expected, pmr_expected); + } else { + unsigned long daif_expected = DAIF_PROCCTX_NOIRQ; + unsigned long daif_actual = read_sysreg(daif); + + WARN_ONCE(daif_actual != daif_expected, + "Unexpected DAIF value: 0x%lx (expected 0x%lx)\n", + daif_actual, daif_expected); + } +} + /* * Thread switching. */ @@ -708,6 +731,8 @@ struct task_struct *__switch_to(struct task_struct *prev, { struct task_struct *last; + debug_switch_state(); + fpsimd_thread_switch(next); tls_thread_switch(next); hw_breakpoint_thread_switch(next); From b178330b67abb7293b6de28b2a49d49c83962db5 Mon Sep 17 00:00:00 2001 From: Haoyu Lu Date: Tue, 7 Apr 2026 11:31:15 +0800 Subject: [PATCH 099/109] ACPI: AGDI: fix missing newline in error message Add the missing trailing newline to the dev_err() message printed when SDEI event registration fails. This keeps the error output as a properly terminated log line. Fixes: a2a591fb76e6 ("ACPI: AGDI: Add driver for Arm Generic Diagnostic Dump and Reset device") Reviewed-by: Ilkka Koskinen Signed-off-by: Haoyu Lu Reviewed-by: Hanjun Guo Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/agdi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/arm64/agdi.c b/drivers/acpi/arm64/agdi.c index feb4b2cb4618..0c2d9d6c160b 100644 --- a/drivers/acpi/arm64/agdi.c +++ b/drivers/acpi/arm64/agdi.c @@ -36,7 +36,7 @@ static int agdi_sdei_probe(struct platform_device *pdev, err = sdei_event_register(adata->sdei_event, agdi_sdei_handler, pdev); if (err) { - dev_err(&pdev->dev, "Failed to register for SDEI event %d", + dev_err(&pdev->dev, "Failed to register for SDEI event %d\n", adata->sdei_event); return err; } From ee020bf6f14094c9ae434bb37e6957a1fdad513c Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Sun, 5 Apr 2026 19:42:31 +0800 Subject: [PATCH 100/109] arm64: kexec: Remove duplicate allocation for trans_pgd trans_pgd would be allocated in trans_pgd_create_copy(), so remove the duplicate allocation before calling trans_pgd_create_copy(). Fixes: 3744b5280e67 ("arm64: kexec: install a copy of the linear-map") Signed-off-by: Wang Wensheng Reviewed-by: Pasha Tatashin Signed-off-by: Catalin Marinas --- arch/arm64/kernel/machine_kexec.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 239c16e3d02f..c5693a32e49b 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -129,9 +129,6 @@ int machine_kexec_post_load(struct kimage *kimage) } /* Create a copy of the linear map */ - trans_pgd = kexec_page_alloc(kimage); - if (!trans_pgd) - return -ENOMEM; rc = trans_pgd_create_copy(&info, &trans_pgd, PAGE_OFFSET, PAGE_END); if (rc) return rc; From abed23c3c44f565dc812563ac015be70dd61e97b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 2 Mar 2026 22:53:16 +0000 Subject: [PATCH 101/109] arm64/hwcap: Generate the KERNEL_HWCAP_ definitions for the hwcaps Currently for each hwcap we define both the HWCAPn_NAME definition which is exposed to userspace and a kernel internal KERNEL_HWCAP_NAME definition which we use internally. This is tedious and repetitive, instead use a script to generate the KERNEL_HWCAP_ definitions from the UAPI definitions. No functional changes intended. Signed-off-by: Mark Brown Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/hwcap.h | 120 +------------------------- arch/arm64/tools/Makefile | 8 +- arch/arm64/tools/gen-kernel-hwcaps.sh | 23 +++++ 3 files changed, 32 insertions(+), 119 deletions(-) create mode 100644 arch/arm64/tools/gen-kernel-hwcaps.sh diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 72ea4bda79f3..abe8218b2325 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -60,126 +60,10 @@ * of KERNEL_HWCAP_{feature}. */ #define __khwcap_feature(x) const_ilog2(HWCAP_ ## x) -#define KERNEL_HWCAP_FP __khwcap_feature(FP) -#define KERNEL_HWCAP_ASIMD __khwcap_feature(ASIMD) -#define KERNEL_HWCAP_EVTSTRM __khwcap_feature(EVTSTRM) -#define KERNEL_HWCAP_AES __khwcap_feature(AES) -#define KERNEL_HWCAP_PMULL __khwcap_feature(PMULL) -#define KERNEL_HWCAP_SHA1 __khwcap_feature(SHA1) -#define KERNEL_HWCAP_SHA2 __khwcap_feature(SHA2) -#define KERNEL_HWCAP_CRC32 __khwcap_feature(CRC32) -#define KERNEL_HWCAP_ATOMICS __khwcap_feature(ATOMICS) -#define KERNEL_HWCAP_FPHP __khwcap_feature(FPHP) -#define KERNEL_HWCAP_ASIMDHP __khwcap_feature(ASIMDHP) -#define KERNEL_HWCAP_CPUID __khwcap_feature(CPUID) -#define KERNEL_HWCAP_ASIMDRDM __khwcap_feature(ASIMDRDM) -#define KERNEL_HWCAP_JSCVT __khwcap_feature(JSCVT) -#define KERNEL_HWCAP_FCMA __khwcap_feature(FCMA) -#define KERNEL_HWCAP_LRCPC __khwcap_feature(LRCPC) -#define KERNEL_HWCAP_DCPOP __khwcap_feature(DCPOP) -#define KERNEL_HWCAP_SHA3 __khwcap_feature(SHA3) -#define KERNEL_HWCAP_SM3 __khwcap_feature(SM3) -#define KERNEL_HWCAP_SM4 __khwcap_feature(SM4) -#define KERNEL_HWCAP_ASIMDDP __khwcap_feature(ASIMDDP) -#define KERNEL_HWCAP_SHA512 __khwcap_feature(SHA512) -#define KERNEL_HWCAP_SVE __khwcap_feature(SVE) -#define KERNEL_HWCAP_ASIMDFHM __khwcap_feature(ASIMDFHM) -#define KERNEL_HWCAP_DIT __khwcap_feature(DIT) -#define KERNEL_HWCAP_USCAT __khwcap_feature(USCAT) -#define KERNEL_HWCAP_ILRCPC __khwcap_feature(ILRCPC) -#define KERNEL_HWCAP_FLAGM __khwcap_feature(FLAGM) -#define KERNEL_HWCAP_SSBS __khwcap_feature(SSBS) -#define KERNEL_HWCAP_SB __khwcap_feature(SB) -#define KERNEL_HWCAP_PACA __khwcap_feature(PACA) -#define KERNEL_HWCAP_PACG __khwcap_feature(PACG) -#define KERNEL_HWCAP_GCS __khwcap_feature(GCS) -#define KERNEL_HWCAP_CMPBR __khwcap_feature(CMPBR) -#define KERNEL_HWCAP_FPRCVT __khwcap_feature(FPRCVT) -#define KERNEL_HWCAP_F8MM8 __khwcap_feature(F8MM8) -#define KERNEL_HWCAP_F8MM4 __khwcap_feature(F8MM4) -#define KERNEL_HWCAP_SVE_F16MM __khwcap_feature(SVE_F16MM) -#define KERNEL_HWCAP_SVE_ELTPERM __khwcap_feature(SVE_ELTPERM) -#define KERNEL_HWCAP_SVE_AES2 __khwcap_feature(SVE_AES2) -#define KERNEL_HWCAP_SVE_BFSCALE __khwcap_feature(SVE_BFSCALE) -#define KERNEL_HWCAP_SVE2P2 __khwcap_feature(SVE2P2) -#define KERNEL_HWCAP_SME2P2 __khwcap_feature(SME2P2) -#define KERNEL_HWCAP_SME_SBITPERM __khwcap_feature(SME_SBITPERM) -#define KERNEL_HWCAP_SME_AES __khwcap_feature(SME_AES) -#define KERNEL_HWCAP_SME_SFEXPA __khwcap_feature(SME_SFEXPA) -#define KERNEL_HWCAP_SME_STMOP __khwcap_feature(SME_STMOP) -#define KERNEL_HWCAP_SME_SMOP4 __khwcap_feature(SME_SMOP4) - #define __khwcap2_feature(x) (const_ilog2(HWCAP2_ ## x) + 64) -#define KERNEL_HWCAP_DCPODP __khwcap2_feature(DCPODP) -#define KERNEL_HWCAP_SVE2 __khwcap2_feature(SVE2) -#define KERNEL_HWCAP_SVEAES __khwcap2_feature(SVEAES) -#define KERNEL_HWCAP_SVEPMULL __khwcap2_feature(SVEPMULL) -#define KERNEL_HWCAP_SVEBITPERM __khwcap2_feature(SVEBITPERM) -#define KERNEL_HWCAP_SVESHA3 __khwcap2_feature(SVESHA3) -#define KERNEL_HWCAP_SVESM4 __khwcap2_feature(SVESM4) -#define KERNEL_HWCAP_FLAGM2 __khwcap2_feature(FLAGM2) -#define KERNEL_HWCAP_FRINT __khwcap2_feature(FRINT) -#define KERNEL_HWCAP_SVEI8MM __khwcap2_feature(SVEI8MM) -#define KERNEL_HWCAP_SVEF32MM __khwcap2_feature(SVEF32MM) -#define KERNEL_HWCAP_SVEF64MM __khwcap2_feature(SVEF64MM) -#define KERNEL_HWCAP_SVEBF16 __khwcap2_feature(SVEBF16) -#define KERNEL_HWCAP_I8MM __khwcap2_feature(I8MM) -#define KERNEL_HWCAP_BF16 __khwcap2_feature(BF16) -#define KERNEL_HWCAP_DGH __khwcap2_feature(DGH) -#define KERNEL_HWCAP_RNG __khwcap2_feature(RNG) -#define KERNEL_HWCAP_BTI __khwcap2_feature(BTI) -#define KERNEL_HWCAP_MTE __khwcap2_feature(MTE) -#define KERNEL_HWCAP_ECV __khwcap2_feature(ECV) -#define KERNEL_HWCAP_AFP __khwcap2_feature(AFP) -#define KERNEL_HWCAP_RPRES __khwcap2_feature(RPRES) -#define KERNEL_HWCAP_MTE3 __khwcap2_feature(MTE3) -#define KERNEL_HWCAP_SME __khwcap2_feature(SME) -#define KERNEL_HWCAP_SME_I16I64 __khwcap2_feature(SME_I16I64) -#define KERNEL_HWCAP_SME_F64F64 __khwcap2_feature(SME_F64F64) -#define KERNEL_HWCAP_SME_I8I32 __khwcap2_feature(SME_I8I32) -#define KERNEL_HWCAP_SME_F16F32 __khwcap2_feature(SME_F16F32) -#define KERNEL_HWCAP_SME_B16F32 __khwcap2_feature(SME_B16F32) -#define KERNEL_HWCAP_SME_F32F32 __khwcap2_feature(SME_F32F32) -#define KERNEL_HWCAP_SME_FA64 __khwcap2_feature(SME_FA64) -#define KERNEL_HWCAP_WFXT __khwcap2_feature(WFXT) -#define KERNEL_HWCAP_EBF16 __khwcap2_feature(EBF16) -#define KERNEL_HWCAP_SVE_EBF16 __khwcap2_feature(SVE_EBF16) -#define KERNEL_HWCAP_CSSC __khwcap2_feature(CSSC) -#define KERNEL_HWCAP_RPRFM __khwcap2_feature(RPRFM) -#define KERNEL_HWCAP_SVE2P1 __khwcap2_feature(SVE2P1) -#define KERNEL_HWCAP_SME2 __khwcap2_feature(SME2) -#define KERNEL_HWCAP_SME2P1 __khwcap2_feature(SME2P1) -#define KERNEL_HWCAP_SME_I16I32 __khwcap2_feature(SME_I16I32) -#define KERNEL_HWCAP_SME_BI32I32 __khwcap2_feature(SME_BI32I32) -#define KERNEL_HWCAP_SME_B16B16 __khwcap2_feature(SME_B16B16) -#define KERNEL_HWCAP_SME_F16F16 __khwcap2_feature(SME_F16F16) -#define KERNEL_HWCAP_MOPS __khwcap2_feature(MOPS) -#define KERNEL_HWCAP_HBC __khwcap2_feature(HBC) -#define KERNEL_HWCAP_SVE_B16B16 __khwcap2_feature(SVE_B16B16) -#define KERNEL_HWCAP_LRCPC3 __khwcap2_feature(LRCPC3) -#define KERNEL_HWCAP_LSE128 __khwcap2_feature(LSE128) -#define KERNEL_HWCAP_FPMR __khwcap2_feature(FPMR) -#define KERNEL_HWCAP_LUT __khwcap2_feature(LUT) -#define KERNEL_HWCAP_FAMINMAX __khwcap2_feature(FAMINMAX) -#define KERNEL_HWCAP_F8CVT __khwcap2_feature(F8CVT) -#define KERNEL_HWCAP_F8FMA __khwcap2_feature(F8FMA) -#define KERNEL_HWCAP_F8DP4 __khwcap2_feature(F8DP4) -#define KERNEL_HWCAP_F8DP2 __khwcap2_feature(F8DP2) -#define KERNEL_HWCAP_F8E4M3 __khwcap2_feature(F8E4M3) -#define KERNEL_HWCAP_F8E5M2 __khwcap2_feature(F8E5M2) -#define KERNEL_HWCAP_SME_LUTV2 __khwcap2_feature(SME_LUTV2) -#define KERNEL_HWCAP_SME_F8F16 __khwcap2_feature(SME_F8F16) -#define KERNEL_HWCAP_SME_F8F32 __khwcap2_feature(SME_F8F32) -#define KERNEL_HWCAP_SME_SF8FMA __khwcap2_feature(SME_SF8FMA) -#define KERNEL_HWCAP_SME_SF8DP4 __khwcap2_feature(SME_SF8DP4) -#define KERNEL_HWCAP_SME_SF8DP2 __khwcap2_feature(SME_SF8DP2) -#define KERNEL_HWCAP_POE __khwcap2_feature(POE) - #define __khwcap3_feature(x) (const_ilog2(HWCAP3_ ## x) + 128) -#define KERNEL_HWCAP_MTE_FAR __khwcap3_feature(MTE_FAR) -#define KERNEL_HWCAP_MTE_STORE_ONLY __khwcap3_feature(MTE_STORE_ONLY) -#define KERNEL_HWCAP_LSFE __khwcap3_feature(LSFE) -#define KERNEL_HWCAP_LS64 __khwcap3_feature(LS64) + +#include "asm/kernel-hwcap.h" /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/tools/Makefile b/arch/arm64/tools/Makefile index c2b34e761006..a94b3d9caad6 100644 --- a/arch/arm64/tools/Makefile +++ b/arch/arm64/tools/Makefile @@ -3,7 +3,7 @@ gen := arch/$(ARCH)/include/generated kapi := $(gen)/asm -kapisyshdr-y := cpucap-defs.h sysreg-defs.h +kapisyshdr-y := cpucap-defs.h kernel-hwcap.h sysreg-defs.h kapi-hdrs-y := $(addprefix $(kapi)/, $(kapisyshdr-y)) @@ -18,11 +18,17 @@ kapi: $(kapi-hdrs-y) quiet_cmd_gen_cpucaps = GEN $@ cmd_gen_cpucaps = mkdir -p $(dir $@); $(AWK) -f $(real-prereqs) > $@ +quiet_cmd_gen_kernel_hwcap = GEN $@ + cmd_gen_kernel_hwcap = mkdir -p $(dir $@); /bin/sh -e $(real-prereqs) > $@ + quiet_cmd_gen_sysreg = GEN $@ cmd_gen_sysreg = mkdir -p $(dir $@); $(AWK) -f $(real-prereqs) > $@ $(kapi)/cpucap-defs.h: $(src)/gen-cpucaps.awk $(src)/cpucaps FORCE $(call if_changed,gen_cpucaps) +$(kapi)/kernel-hwcap.h: $(src)/gen-kernel-hwcaps.sh $(srctree)/arch/arm64/include/uapi/asm/hwcap.h FORCE + $(call if_changed,gen_kernel_hwcap) + $(kapi)/sysreg-defs.h: $(src)/gen-sysreg.awk $(src)/sysreg FORCE $(call if_changed,gen_sysreg) diff --git a/arch/arm64/tools/gen-kernel-hwcaps.sh b/arch/arm64/tools/gen-kernel-hwcaps.sh new file mode 100644 index 000000000000..e7cdcf428d91 --- /dev/null +++ b/arch/arm64/tools/gen-kernel-hwcaps.sh @@ -0,0 +1,23 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 +# +# gen-kernel-hwcap.sh - Generate kernel internal hwcap.h definitions +# +# Copyright 2026 Arm, Ltd. + +if [ "$1" = "" ]; then + echo "$0: no filename specified" + exit 1 +fi + +echo "#ifndef __ASM_KERNEL_HWCAPS_H" +echo "#define __ASM_KERNEL_HWCAPS_H" +echo "" +echo "/* Generated file - do not edit */" +echo "" + +grep -E '^#define HWCAP[0-9]*_[A-Z0-9_]+' $1 | \ + sed 's/.*HWCAP\([0-9]*\)_\([A-Z0-9_]\+\).*/#define KERNEL_HWCAP_\2\t__khwcap\1_feature(\2)/' + +echo "" +echo "#endif /* __ASM_KERNEL_HWCAPS_H */" From b964aa8d68f7705932357483d35d82067bd755c3 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 2 Mar 2026 22:53:17 +0000 Subject: [PATCH 102/109] arm64/sysreg: Update ID_AA64ISAR0_EL1 description to DDI0601 2025-12 The 2025 extensions add FEAT_F16F32DOT and FEAT_F16F32MM. Signed-off-by: Mark Brown Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index b6586accf344..cd9f325a46bc 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1835,6 +1835,8 @@ EndEnum UnsignedEnum 51:48 FHM 0b0000 NI 0b0001 IMP + 0b0010 F16F32DOT + 0b0011 F16F32MM EndEnum UnsignedEnum 47:44 DP 0b0000 NI From bb5e1e540501f068f888dca8951128d682f5ff44 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 2 Mar 2026 22:53:18 +0000 Subject: [PATCH 103/109] arm64/sysreg: Update ID_AA64ISAR2_EL1 description to DDI0601 2025-12 The 2025 extensions update the LUT field for new instructions added by SVE and SME 2.3, there is no separate FEAT_ feature for these. Signed-off-by: Mark Brown Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index cd9f325a46bc..c13de334b34d 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1978,6 +1978,7 @@ EndEnum UnsignedEnum 59:56 LUT 0b0000 NI 0b0001 IMP + 0b0010 LUT6 EndEnum UnsignedEnum 55:52 CSSC 0b0000 NI From d74576b51ba6d3a7f1f321b57ad8736f73a5074d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 2 Mar 2026 22:53:19 +0000 Subject: [PATCH 104/109] arm64/sysreg: Update ID_AA64FPFR0_EL1 description to DDI0601 2025-12 The 2025 extensions add FEAT_F16MM and adjust some of the RES0 bits to be RAZ instead as a placeholder for future extensions. Signed-off-by: Mark Brown Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index c13de334b34d..20bf5a31dc4a 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1654,7 +1654,13 @@ UnsignedEnum 26 F8MM4 0b0 NI 0b1 IMP EndEnum -Res0 25:2 +Res0 25:16 +UnsignedEnum 15 F16MM2 + 0b0 NI + 0b1 IMP +EndEnum +Res0 14:8 +Raz 7:2 UnsignedEnum 1 F8E4M3 0b0 NI 0b1 IMP From bf56250f34a40d83252e1cbc3b41955df7dc11b1 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 2 Mar 2026 22:53:20 +0000 Subject: [PATCH 105/109] arm64/sysreg: Update ID_AA64ZFR0_EL1 description to DDI0601 2025-12 The 2025 extensions add FEAT_SVE2P3 and FEAT_SVE_B16MM. Signed-off-by: Mark Brown Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 20bf5a31dc4a..967d9eb6071e 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1496,6 +1496,7 @@ UnsignedEnum 27:24 B16B16 0b0000 NI 0b0001 IMP 0b0010 BFSCALE + 0b0011 B16MM EndEnum UnsignedEnum 23:20 BF16 0b0000 NI @@ -1522,6 +1523,7 @@ UnsignedEnum 3:0 SVEver 0b0001 SVE2 0b0010 SVE2p1 0b0011 SVE2p2 + 0b0100 SVE2p3 EndEnum EndSysreg From 306736fd515565c6c4787dc55aba890ebce2dc45 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 2 Mar 2026 22:53:21 +0000 Subject: [PATCH 106/109] arm64/sysreg: Update ID_AA64SMFR0_EL1 description to DDI0601 2025-12 The 2025 extensions add FEAT_SME2P3, including LUT6. Signed-off-by: Mark Brown Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 967d9eb6071e..9b1834d9c1e7 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1532,7 +1532,11 @@ UnsignedEnum 63 FA64 0b0 NI 0b1 IMP EndEnum -Res0 62:61 +Res0 62 +UnsignedEnum 61 LUT6 + 0b0 NI + 0b1 IMP +EndEnum UnsignedEnum 60 LUTv2 0b0 NI 0b1 IMP @@ -1542,6 +1546,7 @@ UnsignedEnum 59:56 SMEver 0b0001 SME2 0b0010 SME2p1 0b0011 SME2p2 + 0b0100 SME2p3 EndEnum UnsignedEnum 55:52 I16I64 0b0000 NI From 249bf9733198fe0527542ba6acf4bd8f12b7e5fc Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 11 Mar 2026 17:50:50 +0000 Subject: [PATCH 107/109] arm64: mte: Skip TFSR_EL1 checks and barriers in synchronous tag check mode With KASAN_HW_TAGS (MTE) in synchronous mode, tag check faults are reported as immediate Data Abort exceptions. The TFSR_EL1.TF1 bit is never set since faults never go through the asynchronous path. Therefore, reading TFSR_EL1 and executing data and instruction barriers on kernel entry, exit, context switch and suspend is unnecessary overhead. As with the check_mte_async_tcf and clear_mte_async_tcf paths for TFSRE0_EL1, extend the same optimisation to kernel entry/exit, context switch and suspend. All mte kselftests pass. The kunit before and after the patch show same results. A selection of test_vmalloc benchmarks running on a arm64 machine. v6.19 is the baseline. (>0 is faster, <0 is slower, (R)/(I) = statistically significant Regression/Improvement). Based on significance and ignoring the noise, the benchmarks improved. * 77 result classes were considered, with 9 wins, 0 losses and 68 ties Results of fastpath [1] on v6.19 vs this patch: +----------------------------+----------------------------------------------------------+------------+ | Benchmark | Result Class | barriers | +============================+==========================================================+============+ | micromm/fork | fork: p:1, d:10 (seconds) | (I) 2.75% | | | fork: p:512, d:10 (seconds) | 0.96% | +----------------------------+----------------------------------------------------------+------------+ | micromm/munmap | munmap: p:1, d:10 (seconds) | -1.78% | | | munmap: p:512, d:10 (seconds) | 5.02% | +----------------------------+----------------------------------------------------------+------------+ | micromm/vmalloc | fix_align_alloc_test: p:1, h:0, l:500000 (usec) | -0.56% | | | fix_size_alloc_test: p:1, h:0, l:500000 (usec) | 0.70% | | | fix_size_alloc_test: p:4, h:0, l:500000 (usec) | 1.18% | | | fix_size_alloc_test: p:16, h:0, l:500000 (usec) | -5.01% | | | fix_size_alloc_test: p:16, h:1, l:500000 (usec) | 13.81% | | | fix_size_alloc_test: p:64, h:0, l:100000 (usec) | 6.51% | | | fix_size_alloc_test: p:64, h:1, l:100000 (usec) | 32.87% | | | fix_size_alloc_test: p:256, h:0, l:100000 (usec) | 4.17% | | | fix_size_alloc_test: p:256, h:1, l:100000 (usec) | 8.40% | | | fix_size_alloc_test: p:512, h:0, l:100000 (usec) | -0.48% | | | fix_size_alloc_test: p:512, h:1, l:100000 (usec) | -0.74% | | | full_fit_alloc_test: p:1, h:0, l:500000 (usec) | 0.53% | | | kvfree_rcu_1_arg_vmalloc_test: p:1, h:0, l:500000 (usec) | -2.81% | | | kvfree_rcu_2_arg_vmalloc_test: p:1, h:0, l:500000 (usec) | -2.06% | | | long_busy_list_alloc_test: p:1, h:0, l:500000 (usec) | -0.56% | | | pcpu_alloc_test: p:1, h:0, l:500000 (usec) | -0.41% | | | random_size_align_alloc_test: p:1, h:0, l:500000 (usec) | 0.89% | | | random_size_alloc_test: p:1, h:0, l:500000 (usec) | 1.71% | | | vm_map_ram_test: p:1, h:0, l:500000 (usec) | 0.83% | +----------------------------+----------------------------------------------------------+------------+ | schbench/thread-contention | -m 16 -t 1 -r 10 -s 1000, avg_rps (req/sec) | 0.05% | | | -m 16 -t 1 -r 10 -s 1000, req_latency_p99 (usec) | 0.60% | | | -m 16 -t 1 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | | | -m 16 -t 4 -r 10 -s 1000, avg_rps (req/sec) | -0.34% | | | -m 16 -t 4 -r 10 -s 1000, req_latency_p99 (usec) | -0.58% | | | -m 16 -t 4 -r 10 -s 1000, wakeup_latency_p99 (usec) | 9.09% | | | -m 16 -t 16 -r 10 -s 1000, avg_rps (req/sec) | -0.74% | | | -m 16 -t 16 -r 10 -s 1000, req_latency_p99 (usec) | -1.40% | | | -m 16 -t 16 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | | | -m 16 -t 64 -r 10 -s 1000, avg_rps (req/sec) | -0.78% | | | -m 16 -t 64 -r 10 -s 1000, req_latency_p99 (usec) | -0.11% | | | -m 16 -t 64 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.11% | | | -m 16 -t 256 -r 10 -s 1000, avg_rps (req/sec) | 2.64% | | | -m 16 -t 256 -r 10 -s 1000, req_latency_p99 (usec) | 3.15% | | | -m 16 -t 256 -r 10 -s 1000, wakeup_latency_p99 (usec) | 17.54% | | | -m 32 -t 1 -r 10 -s 1000, avg_rps (req/sec) | -1.22% | | | -m 32 -t 1 -r 10 -s 1000, req_latency_p99 (usec) | 0.85% | | | -m 32 -t 1 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | | | -m 32 -t 4 -r 10 -s 1000, avg_rps (req/sec) | -0.34% | | | -m 32 -t 4 -r 10 -s 1000, req_latency_p99 (usec) | 1.05% | | | -m 32 -t 4 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | | | -m 32 -t 16 -r 10 -s 1000, avg_rps (req/sec) | -0.41% | | | -m 32 -t 16 -r 10 -s 1000, req_latency_p99 (usec) | 0.58% | | | -m 32 -t 16 -r 10 -s 1000, wakeup_latency_p99 (usec) | 2.13% | | | -m 32 -t 64 -r 10 -s 1000, avg_rps (req/sec) | 0.67% | | | -m 32 -t 64 -r 10 -s 1000, req_latency_p99 (usec) | 2.07% | | | -m 32 -t 64 -r 10 -s 1000, wakeup_latency_p99 (usec) | -1.28% | | | -m 32 -t 256 -r 10 -s 1000, avg_rps (req/sec) | 1.01% | | | -m 32 -t 256 -r 10 -s 1000, req_latency_p99 (usec) | 0.69% | | | -m 32 -t 256 -r 10 -s 1000, wakeup_latency_p99 (usec) | 13.12% | | | -m 64 -t 1 -r 10 -s 1000, avg_rps (req/sec) | -0.25% | | | -m 64 -t 1 -r 10 -s 1000, req_latency_p99 (usec) | -0.48% | | | -m 64 -t 1 -r 10 -s 1000, wakeup_latency_p99 (usec) | 10.53% | | | -m 64 -t 4 -r 10 -s 1000, avg_rps (req/sec) | -0.06% | | | -m 64 -t 4 -r 10 -s 1000, req_latency_p99 (usec) | 0.00% | | | -m 64 -t 4 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.00% | | | -m 64 -t 16 -r 10 -s 1000, avg_rps (req/sec) | -0.36% | | | -m 64 -t 16 -r 10 -s 1000, req_latency_p99 (usec) | 0.52% | | | -m 64 -t 16 -r 10 -s 1000, wakeup_latency_p99 (usec) | 0.11% | | | -m 64 -t 64 -r 10 -s 1000, avg_rps (req/sec) | 0.52% | | | -m 64 -t 64 -r 10 -s 1000, req_latency_p99 (usec) | 3.53% | | | -m 64 -t 64 -r 10 -s 1000, wakeup_latency_p99 (usec) | -0.10% | | | -m 64 -t 256 -r 10 -s 1000, avg_rps (req/sec) | 2.53% | | | -m 64 -t 256 -r 10 -s 1000, req_latency_p99 (usec) | 1.82% | | | -m 64 -t 256 -r 10 -s 1000, wakeup_latency_p99 (usec) | -5.80% | +----------------------------+----------------------------------------------------------+------------+ | syscall/getpid | mean (ns) | (I) 15.98% | | | p99 (ns) | (I) 11.11% | | | p99.9 (ns) | (I) 16.13% | +----------------------------+----------------------------------------------------------+------------+ | syscall/getppid | mean (ns) | (I) 14.82% | | | p99 (ns) | (I) 17.86% | | | p99.9 (ns) | (I) 9.09% | +----------------------------+----------------------------------------------------------+------------+ | syscall/invalid | mean (ns) | (I) 17.78% | | | p99 (ns) | (I) 11.11% | | | p99.9 (ns) | 13.33% | +----------------------------+----------------------------------------------------------+------------+ [1] https://gitlab.arm.com/tooling/fastpath Signed-off-by: Muhammad Usama Anjum Reviewed-by: David Hildenbrand (Arm) Reviewed-by: Yeoreum Yun Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mte.h | 6 ++++++ arch/arm64/kernel/mte.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 6d4a78b9dc3e..7f7b97e09996 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -252,6 +252,9 @@ static inline void mte_check_tfsr_entry(void) if (!kasan_hw_tags_enabled()) return; + if (!system_uses_mte_async_or_asymm_mode()) + return; + mte_check_tfsr_el1(); } @@ -260,6 +263,9 @@ static inline void mte_check_tfsr_exit(void) if (!kasan_hw_tags_enabled()) return; + if (!system_uses_mte_async_or_asymm_mode()) + return; + /* * The asynchronous faults are sync'ed automatically with * TFSR_EL1 on kernel entry but for exit an explicit dsb() diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 32148bf09c1d..66119758561d 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -291,6 +291,9 @@ void mte_thread_switch(struct task_struct *next) /* TCO may not have been disabled on exception entry for the current task. */ mte_disable_tco_entry(next); + if (!system_uses_mte_async_or_asymm_mode()) + return; + /* * Check if an async tag exception occurred at EL1. * @@ -350,6 +353,9 @@ void mte_suspend_enter(void) if (!system_supports_mte()) return; + if (!system_uses_mte_async_or_asymm_mode()) + return; + /* * The barriers are required to guarantee that the indirect writes * to TFSR_EL1 are synchronized before we report the state. From 74b63934abf5011d4ff0ac6ece5cffca9d5647ac Mon Sep 17 00:00:00 2001 From: Michael Ugrin Date: Thu, 9 Apr 2026 09:24:12 -0700 Subject: [PATCH 108/109] arm64: Kconfig: fix duplicate word in CMDLINE help text Remove duplicate 'the' in the CMDLINE config help text. Signed-off-by: Michael Ugrin Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3d90f2d35a99..68649dd28fe6 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2346,7 +2346,7 @@ config CMDLINE default "" help Provide a set of default command-line options at build time by - entering them here. As a minimum, you should specify the the + entering them here. As a minimum, you should specify the root device (e.g. root=/dev/nfs). choice From 34e563947c76201c6e941ac17dd35eb87c0e68bb Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V (Arm)" Date: Tue, 7 Apr 2026 20:59:00 +0530 Subject: [PATCH 109/109] arm64: rsi: use linear-map alias for realm config buffer rsi_get_realm_config() passes its argument to virt_to_phys(), but &config is a kernel image address and not a linear-map alias. On arm64 this triggers the below warning: virt_to_phys used for non-linear address: (____ptrval____) (config+0x0/0x1000) WARNING: arch/arm64/mm/physaddr.c:15 at __virt_to_phys+0x50/0x70, CPU#0: swapper/0 Modules linked in: ..... Hardware name: linux,dummy-virt (DT) pstate: 200000c5 (nzCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : __virt_to_phys+0x50/0x70 lr : __virt_to_phys+0x4c/0x70 ..... ...... Call trace: __virt_to_phys+0x50/0x70 (P) arm64_rsi_init+0xa0/0x1b8 setup_arch+0x13c/0x1a0 start_kernel+0x68/0x398 __primary_switched+0x88/0x90 Pass lm_alias(&config) instead so the RSI call uses the linear-map alias of the same buffer and avoids the boot-time warning. Signed-off-by: Aneesh Kumar K.V (Arm) Signed-off-by: Catalin Marinas --- arch/arm64/kernel/rsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index c64a06f58c0b..6e883bc43186 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -144,7 +144,7 @@ void __init arm64_rsi_init(void) return; if (!rsi_version_matches()) return; - if (WARN_ON(rsi_get_realm_config(&config))) + if (WARN_ON(rsi_get_realm_config(lm_alias(&config)))) return; prot_ns_shared = BIT(config.ipa_bits - 1);