mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 06:44:00 -04:00
It has been possible for a long time to mark ptes in the linear map as invalid. This is done for secretmem, kfence, realm dma memory un/share, and others, by simply clearing the PTE_VALID bit. But until commit a166563e7e ("arm64: mm: support large block mapping when rodata=full") large leaf mappings were never made invalid in this way. It turns out various parts of the code base are not equipped to handle invalid large leaf mappings (in the way they are currently encoded) and I've observed a kernel panic while booting a realm guest on a BBML2_NOABORT system as a result: [ 15.432706] software IO TLB: Memory encryption is active and system is using DMA bounce buffers [ 15.476896] Unable to handle kernel paging request at virtual address ffff000019600000 [ 15.513762] Mem abort info: [ 15.527245] ESR = 0x0000000096000046 [ 15.548553] EC = 0x25: DABT (current EL), IL = 32 bits [ 15.572146] SET = 0, FnV = 0 [ 15.592141] EA = 0, S1PTW = 0 [ 15.612694] FSC = 0x06: level 2 translation fault [ 15.640644] Data abort info: [ 15.661983] ISV = 0, ISS = 0x00000046, ISS2 = 0x00000000 [ 15.694875] CM = 0, WnR = 1, TnD = 0, TagAccess = 0 [ 15.723740] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 15.755776] swapper pgtable: 4k pages, 48-bit VAs, pgdp=0000000081f3f000 [ 15.800410] [ffff000019600000] pgd=0000000000000000, p4d=180000009ffff403, pud=180000009fffe403, pmd=00e8000199600704 [ 15.855046] Internal error: Oops: 0000000096000046 [#1] SMP [ 15.886394] Modules linked in: [ 15.900029] CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 7.0.0-rc4-dirty #4 PREEMPT [ 15.935258] Hardware name: linux,dummy-virt (DT) [ 15.955612] pstate: 21400005 (nzCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) [ 15.986009] pc : __pi_memcpy_generic+0x128/0x22c [ 16.006163] lr : swiotlb_bounce+0xf4/0x158 [ 16.024145] sp : ffff80008000b8f0 [ 16.038896] x29: ffff80008000b8f0 x28: 0000000000000000 x27: 0000000000000000 [ 16.069953] x26: ffffb3976d261ba8 x25: 0000000000000000 x24: ffff000019600000 [ 16.100876] x23: 
0000000000000001 x22: ffff0000043430d0 x21: 0000000000007ff0 [ 16.131946] x20: 0000000084570010 x19: 0000000000000000 x18: ffff00001ffe3fcc [ 16.163073] x17: 0000000000000000 x16: 00000000003fffff x15: 646e612065766974 [ 16.194131] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 [ 16.225059] x11: 0000000000000000 x10: 0000000000000010 x9 : 0000000000000018 [ 16.256113] x8 : 0000000000000018 x7 : 0000000000000000 x6 : 0000000000000000 [ 16.287203] x5 : ffff000019607ff0 x4 : ffff000004578000 x3 : ffff000019600000 [ 16.318145] x2 : 0000000000007ff0 x1 : ffff000004570010 x0 : ffff000019600000 [ 16.349071] Call trace: [ 16.360143] __pi_memcpy_generic+0x128/0x22c (P) [ 16.380310] swiotlb_tbl_map_single+0x154/0x2b4 [ 16.400282] swiotlb_map+0x5c/0x228 [ 16.415984] dma_map_phys+0x244/0x2b8 [ 16.432199] dma_map_page_attrs+0x44/0x58 [ 16.449782] virtqueue_map_page_attrs+0x38/0x44 [ 16.469596] virtqueue_map_single_attrs+0xc0/0x130 [ 16.490509] virtnet_rq_alloc.isra.0+0xa4/0x1fc [ 16.510355] try_fill_recv+0x2a4/0x584 [ 16.526989] virtnet_open+0xd4/0x238 [ 16.542775] __dev_open+0x110/0x24c [ 16.558280] __dev_change_flags+0x194/0x20c [ 16.576879] netif_change_flags+0x24/0x6c [ 16.594489] dev_change_flags+0x48/0x7c [ 16.611462] ip_auto_config+0x258/0x1114 [ 16.628727] do_one_initcall+0x80/0x1c8 [ 16.645590] kernel_init_freeable+0x208/0x2f0 [ 16.664917] kernel_init+0x24/0x1e0 [ 16.680295] ret_from_fork+0x10/0x20 [ 16.696369] Code: 927cec03 cb0e0021 8b0e0042 a9411c26 (a900340c) [ 16.723106] ---[ end trace 0000000000000000 ]--- [ 16.752866] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b [ 16.792556] Kernel Offset: 0x3396ea200000 from 0xffff800080000000 [ 16.818966] PHYS_OFFSET: 0xfff1000080000000 [ 16.837237] CPU features: 0x0000000,00060005,13e38581,957e772f [ 16.862904] Memory Limit: none [ 16.876526] ---[ end Kernel panic - not syncing: Attempted to kill init! 
exitcode=0x0000000b ]--- This panic occurs because the swiotlb memory was previously shared to the host (__set_memory_enc_dec()), which involves transitioning the (large) leaf mappings to invalid, sharing to the host, then marking the mappings valid again. But pageattr_p[mu]d_entry() would only update the entry if it is a section mapping, since otherwise it concluded it must be a table entry so shouldn't be modified. But p[mu]d_sect() only returns true if the entry is valid. So the result was that the large leaf entry was made invalid in the first pass then ignored in the second pass. It remains invalid until the above code tries to access it and blows up. The simple fix would be to update pageattr_pmd_entry() to use !pmd_table() instead of pmd_sect(). That would solve this problem. But the ptdump code also suffers from a similar issue. It checks pmd_leaf() and doesn't call into the arch-specific note_page() machinery if it returns false. As a result of this, ptdump wasn't even able to show the invalid large leaf mappings; it looked like they were valid which made this super fun to debug. The ptdump code is core-mm and pmd_table() is arm64-specific so we can't use the same trick to solve that. But we already support the concept of "present-invalid" for user space entries. And even better, pmd_leaf() will return true for a leaf mapping that is marked present-invalid. So let's just use that encoding for present-invalid kernel mappings too. Then we can use pmd_leaf() where we previously used pmd_sect() and everything is magically fixed. Additionally, from inspection kernel_page_present() was broken in a similar way, so I'm also updating that to use pmd_leaf(). The transitional page tables component was also similarly broken; it creates a copy of the kernel page tables, making RO leaf mappings RW in the process. It also makes invalid (but-not-none) pte mappings valid. But it was not doing this for large leaf mappings. 
This could have resulted in crashes at kexec- or hibernate-time. This code is fixed to flip "present-invalid" mappings back to "present-valid" at all levels. Finally, I have hardened split_pmd()/split_pud() so that if it is passed a "present-invalid" leaf, it will maintain that property in the split leaves, since I wasn't able to convince myself that it would only ever be called for "present-valid" leaves. Fixes: a166563e7e ("arm64: mm: support large block mapping when rodata=full") Cc: stable@vger.kernel.org Signed-off-by: Ryan Roberts <ryan.roberts@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
192 lines
8.3 KiB
C
192 lines
8.3 KiB
C
/* SPDX-License-Identifier: GPL-2.0-only */
|
|
/*
|
|
* Copyright (C) 2016 ARM Ltd.
|
|
*/
|
|
#ifndef __ASM_PGTABLE_PROT_H
|
|
#define __ASM_PGTABLE_PROT_H
|
|
|
|
#include <asm/memory.h>
|
|
#include <asm/pgtable-hwdef.h>
|
|
|
|
#include <linux/const.h>
|
|
|
|
/*
|
|
* Software defined PTE bits definition.
|
|
*/
|
|
/* Writable flag: an alias of PTE_DBM rather than a bit of its own. */
#define PTE_WRITE (PTE_DBM) /* same as DBM (51) */
|
|
/* Bit 2; per the trailing note it is meaningful only in swap ptes. */
#define PTE_SWP_EXCLUSIVE (_AT(pteval_t, 1) << 2) /* only for swp ptes */
|
|
/* Software-defined dirty flag, bit 55. */
#define PTE_DIRTY (_AT(pteval_t, 1) << 55)
|
|
/* Software-defined "special" flag, bit 56. */
#define PTE_SPECIAL (_AT(pteval_t, 1) << 56)
|
|
|
|
/*
|
|
* PTE_PRESENT_INVALID=1 & PTE_VALID=0 indicates that the pte's fields should be
|
|
* interpreted according to the HW layout by SW but any attempted HW access to
|
|
* the address will result in a fault. pte_present() returns true.
|
|
*/
|
|
/* Shares its bit with PTE_NG, hence the "only when !PTE_VALID" restriction. */
#define PTE_PRESENT_INVALID (PTE_NG) /* only when !PTE_VALID */
|
|
|
|
/*
 * Valid-state bits of a kernel mapping: PTE_VALID plus PTE_MAYBE_NG.
 * PTE_MAYBE_NG is defined later in this header, inside the !__ASSEMBLER__
 * region, as an expression on arm64_use_ng_mappings; this macro therefore
 * depends on that definition being visible and is not a literal constant.
 */
#define PTE_PRESENT_VALID_KERNEL (PTE_VALID | PTE_MAYBE_NG)
|
|
|
|
/*
 * userfaultfd write-protect marker bits. With
 * CONFIG_HAVE_ARCH_USERFAULTFD_WP they are bit 58 (PTE_UFFD_WP) and bit 3
 * (PTE_SWP_UFFD_WP, swap ptes only); without it both are defined as 0, so
 * the names stay usable in expressions but select no bit.
 */
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
|
|
#define PTE_UFFD_WP (_AT(pteval_t, 1) << 58) /* uffd-wp tracking */
|
|
#define PTE_SWP_UFFD_WP (_AT(pteval_t, 1) << 3) /* only for swp ptes */
|
|
#else
|
|
#define PTE_UFFD_WP (_AT(pteval_t, 0))
|
|
#define PTE_SWP_UFFD_WP (_AT(pteval_t, 0))
|
|
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
|
|
|
|
/*
 * Base page attributes with PTE_SHARED applied unconditionally and no nG bit;
 * unlike PROT_DEFAULT it uses none of the PTE_MAYBE_* selectors.
 */
#define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
|
|
|
|
/*
 * Base attributes for page (PROT_DEFAULT) and section (PROT_SECT_DEFAULT)
 * mappings. The nG and shareability bits come from the *_MAYBE_* selectors
 * defined further down in this header.
 */
#define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_MAYBE_NG | PTE_MAYBE_SHARED | PTE_AF)
|
|
#define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_MAYBE_NG | PMD_MAYBE_SHARED | PMD_SECT_AF)
|
|
|
|
/*
 * Page-level memory types. Each adds PTE_PXN, PTE_UXN and PTE_WRITE to
 * PROT_DEFAULT; they differ only in the PTE_ATTRINDX() memory-type index.
 */
#define PROT_DEVICE_nGnRnE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE))
|
|
#define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE))
|
|
#define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_NC))
|
|
#define PROT_NORMAL (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL))
|
|
#define PROT_NORMAL_TAGGED (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_TAGGED))
|
|
|
|
/*
 * Section-level counterparts built on PROT_SECT_DEFAULT.
 * PROT_SECT_DEVICE_nGnRE carries no write flag; PROT_SECT_NORMAL_EXEC carries
 * neither the write flag nor PMD_SECT_PXN. PROT_SECT_NORMAL takes its write
 * flag from the PTE_-prefixed PTE_WRITE name.
 */
#define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE))
|
|
#define PROT_SECT_NORMAL (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PTE_WRITE | PMD_ATTRINDX(MT_NORMAL))
|
|
#define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL))
|
|
|
|
/* _PROT_DEFAULT plus the MT_NORMAL memory-type index; base of the user types. */
#define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL))
|
|
|
|
/*
 * Kernel page protections, all derived from PROT_NORMAL with PTE_DIRTY set.
 * The RO and ROX variants drop PTE_WRITE and add PTE_RDONLY; the ROX and EXEC
 * variants drop PTE_PXN; _PAGE_KERNEL_EXEC_CONT additionally sets PTE_CONT.
 */
#define _PAGE_KERNEL (PROT_NORMAL | PTE_DIRTY)
|
|
#define _PAGE_KERNEL_RO ((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY | PTE_DIRTY)
|
|
#define _PAGE_KERNEL_ROX ((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY | PTE_DIRTY)
|
|
#define _PAGE_KERNEL_EXEC ((PROT_NORMAL & ~PTE_PXN) | PTE_DIRTY)
|
|
#define _PAGE_KERNEL_EXEC_CONT ((PROT_NORMAL & ~PTE_PXN) | PTE_CONT | PTE_DIRTY)
|
|
|
|
/*
 * User page protections built on _PAGE_DEFAULT. Every one sets PTE_RDONLY,
 * PTE_NG and PTE_PXN; _PAGE_EXECONLY is the only one without PTE_USER. The
 * *_EXEC and EXECONLY variants leave PTE_UXN clear, and only the SHARED
 * variants set PTE_WRITE.
 */
#define _PAGE_SHARED (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE)
|
|
#define _PAGE_SHARED_EXEC (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_WRITE)
|
|
#define _PAGE_READONLY (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
|
|
#define _PAGE_READONLY_EXEC (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN)
|
|
#define _PAGE_EXECONLY (_PAGE_DEFAULT | PTE_RDONLY | PTE_NG | PTE_PXN)
|
|
|
|
#ifndef __ASSEMBLER__
|
|
|
|
#include <asm/cpufeature.h>
|
|
#include <asm/pgtable-types.h>
|
|
#include <asm/rsi.h>
|
|
|
|
/* Declared here, defined elsewhere; read by the selector macros below. */
extern bool arm64_use_ng_mappings;
|
|
extern unsigned long prot_ns_shared;
|
|
|
|
/* prot_ns_shared when is_realm_world() is true, otherwise 0. */
#define PROT_NS_SHARED (is_realm_world() ? prot_ns_shared : 0)
|
|
|
|
/*
 * nG bit for page (PTE_NG) and section (PMD_SECT_NG) entries, applied only
 * when arm64_use_ng_mappings is true. Both expand to an expression that reads
 * the variable at each use.
 */
#define PTE_MAYBE_NG (arm64_use_ng_mappings ? PTE_NG : 0)
|
|
#define PMD_MAYBE_NG (arm64_use_ng_mappings ? PMD_SECT_NG : 0)
|
|
|
|
#ifndef CONFIG_ARM64_LPA2
|
|
/*
 * Definitions used when CONFIG_ARM64_LPA2 is not set: no run-time check is
 * involved. lpa2_is_enabled() is the constant false, the shareability bits
 * are always applied and the PA width is CONFIG_ARM64_PA_BITS.
 */
#define lpa2_is_enabled() false
|
|
#define PTE_MAYBE_SHARED PTE_SHARED
|
|
#define PMD_MAYBE_SHARED PMD_SECT_S
|
|
#define PHYS_MASK_SHIFT (CONFIG_ARM64_PA_BITS)
|
|
#else
|
|
/*
 * Report whether LPA2 is enabled, i.e. whether the TCR_EL1_DS bit is set in
 * the value returned by read_tcr().
 */
static inline bool __pure lpa2_is_enabled(void)
{
	return !!(read_tcr() & TCR_EL1_DS);
}
|
|
|
|
/*
 * With CONFIG_ARM64_LPA2 the choice is made at each use via
 * lpa2_is_enabled(): when it returns true the shareability bits are left
 * clear and the PA width is CONFIG_ARM64_PA_BITS; otherwise the bits are set
 * and the width is 48.
 */
#define PTE_MAYBE_SHARED (lpa2_is_enabled() ? 0 : PTE_SHARED)
|
|
#define PMD_MAYBE_SHARED (lpa2_is_enabled() ? 0 : PMD_SECT_S)
|
|
#define PHYS_MASK_SHIFT (lpa2_is_enabled() ? CONFIG_ARM64_PA_BITS : 48)
|
|
#endif
|
|
|
|
/*
 * Highest possible physical address supported: a mask of the low
 * PHYS_MASK_SHIFT bits. Under CONFIG_ARM64_LPA2, PHYS_MASK_SHIFT expands to
 * an expression that calls lpa2_is_enabled(), so do not assume this is a
 * compile-time constant.
 */
|
|
#define PHYS_MASK ((UL(1) << PHYS_MASK_SHIFT) - 1)
|
|
|
|
/*
|
|
* If we have userspace only BTI we don't want to mark kernel pages
|
|
* guarded even if the system does support BTI.
|
|
*/
|
|
#define PTE_MAYBE_GP (system_supports_bti_kernel() ? PTE_GP : 0)
|
|
|
|
/*
 * __pgprot()-wrapped forms of the _PAGE_KERNEL* bit sets. Those bit sets are
 * built on PROT_DEFAULT, which includes PTE_MAYBE_NG and PTE_MAYBE_SHARED, so
 * each use re-evaluates those selectors.
 */
#define PAGE_KERNEL __pgprot(_PAGE_KERNEL)
|
|
#define PAGE_KERNEL_RO __pgprot(_PAGE_KERNEL_RO)
|
|
#define PAGE_KERNEL_ROX __pgprot(_PAGE_KERNEL_ROX)
|
|
#define PAGE_KERNEL_EXEC __pgprot(_PAGE_KERNEL_EXEC)
|
|
#define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_KERNEL_EXEC_CONT)
|
|
|
|
/*
 * Build the stage-2 memory attribute field for memory type @attr.
 *
 * @attr is pasted onto MT_S2_FWB_ when the ARM64_HAS_STAGE2_FWB capability is
 * reported by cpus_have_final_cap(), and onto MT_S2_ otherwise. The chosen
 * constant is passed through PTE_S2_MEMATTR() and yielded as a u64.
 */
#define PAGE_S2_MEMATTR(attr)						\
	({								\
		u64 __memattr =						\
			cpus_have_final_cap(ARM64_HAS_STAGE2_FWB) ?	\
				PTE_S2_MEMATTR(MT_S2_FWB_ ## attr) :	\
				PTE_S2_MEMATTR(MT_S2_ ## attr);		\
		__memattr;						\
	})
|
|
|
|
/*
 * _PAGE_DEFAULT with PTE_VALID cleared and PTE_PRESENT_INVALID set (the
 * present-but-invalid encoding described above PTE_PRESENT_INVALID), plus
 * PTE_RDONLY, PTE_PXN and PTE_UXN. PTE_PRESENT_INVALID and PTE_NG name the
 * same bit, so listing both is for readability only.
 */
#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PRESENT_INVALID | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
|
|
/* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */
|
|
/* __pgprot()-wrapped forms of the user _PAGE_* bit sets. */
#define PAGE_SHARED __pgprot(_PAGE_SHARED)
|
|
#define PAGE_SHARED_EXEC __pgprot(_PAGE_SHARED_EXEC)
|
|
#define PAGE_READONLY __pgprot(_PAGE_READONLY)
|
|
#define PAGE_READONLY_EXEC __pgprot(_PAGE_READONLY_EXEC)
|
|
#define PAGE_EXECONLY __pgprot(_PAGE_EXECONLY)
|
|
|
|
#endif /* __ASSEMBLER__ */
|
|
|
|
/*
 * Extract the 4-bit permission-indirection index from a descriptor value.
 *
 * Bit PTE_PI_IDX_n of @pte becomes bit n of the result, for n = 0..3.
 *
 * This stays a plain expression macro because it sits outside the
 * !__ASSEMBLER__ region and feeds the PIE_E0/PIE_E1 definitions. @pte is
 * parenthesised so that an argument built from low-precedence operators
 * (e.g. "a | b") is masked as a whole. It is still evaluated four times, so
 * the argument must be free of side effects.
 */
#define pte_pi_index(pte) ( \
	(((pte) & BIT(PTE_PI_IDX_3)) >> (PTE_PI_IDX_3 - 3)) | \
	(((pte) & BIT(PTE_PI_IDX_2)) >> (PTE_PI_IDX_2 - 2)) | \
	(((pte) & BIT(PTE_PI_IDX_1)) >> (PTE_PI_IDX_1 - 1)) | \
	(((pte) & BIT(PTE_PI_IDX_0)) >> (PTE_PI_IDX_0 - 0)))
|
|
|
|
/*
|
|
* Page types used via Permission Indirection Extension (PIE). PIE uses
|
|
* the USER, DBM, PXN and UXN bits to generate an index which is used
|
|
* to look up the actual permission in PIR_ELx and PIRE0_EL1. We define
|
|
* combinations we use on non-PIE systems with the same encoding, for
|
|
* convenience these are listed here as comments as are the unallocated
|
|
* encodings.
|
|
*/
|
|
|
|
/* 0: PAGE_DEFAULT */
|
|
/* 1: PTE_USER */
|
|
/* 2: PTE_WRITE */
|
|
/* 3: PTE_WRITE | PTE_USER */
|
|
/* 4: PAGE_EXECONLY PTE_PXN */
|
|
/* 5: PAGE_READONLY_EXEC PTE_PXN | PTE_USER */
|
|
/* 6: PTE_PXN | PTE_WRITE */
|
|
/* 7: PAGE_SHARED_EXEC PTE_PXN | PTE_WRITE | PTE_USER */
|
|
/* 8: PAGE_KERNEL_ROX PTE_UXN */
|
|
/* 9: PAGE_GCS_RO PTE_UXN | PTE_USER */
|
|
/* a: PAGE_KERNEL_EXEC PTE_UXN | PTE_WRITE */
|
|
/* b: PAGE_GCS PTE_UXN | PTE_WRITE | PTE_USER */
|
|
/* c: PAGE_KERNEL_RO PTE_UXN | PTE_PXN */
|
|
/* d: PAGE_READONLY PTE_UXN | PTE_PXN | PTE_USER */
|
|
/* e: PAGE_KERNEL PTE_UXN | PTE_PXN | PTE_WRITE */
|
|
/* f: PAGE_SHARED PTE_UXN | PTE_PXN | PTE_WRITE | PTE_USER */
|
|
|
|
/*
 * GCS page protections: _PAGE_DEFAULT plus PTE_NG, PTE_UXN and PTE_USER, with
 * PTE_WRITE added for the writable form. They correspond to encodings b
 * (PAGE_GCS) and 9 (PAGE_GCS_RO) in the table above.
 */
#define _PAGE_GCS (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_WRITE | PTE_USER)
|
|
#define _PAGE_GCS_RO (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_USER)
|
|
|
|
/*
 * Permission table built from (index, permission) pairs: each term passes the
 * pte_pi_index() of one page-protection value and a PIE_* permission to
 * PIRx_ELx_PERM_PREP(), and the terms are OR-ed together. Only the user page
 * types (_PAGE_GCS* and the SHARED/READONLY/EXECONLY family) appear here.
 * NOTE(review): presumably the value for PIRE0_EL1 mentioned in the comment
 * above the encoding table - confirm against the users of this macro.
 */
#define PIE_E0 ( \
|
|
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_GCS) | \
|
|
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_R) | \
|
|
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \
|
|
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \
|
|
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \
|
|
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY), PIE_R_O) | \
|
|
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED), PIE_RW_O))
|
|
|
|
/*
 * Same construction as PIE_E0, but it also covers the _PAGE_KERNEL* types.
 * The user page types get different permissions here: the GCS, GCS_RO and
 * EXECONLY entries use PIE_NONE_O, and the remaining user entries use PIE_R
 * or PIE_RW, i.e. none of them is given an executable permission.
 * NOTE(review): presumably the value for PIR_EL1 - confirm against the users
 * of this macro.
 */
#define PIE_E1 ( \
|
|
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_NONE_O) | \
|
|
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_NONE_O) | \
|
|
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \
|
|
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R) | \
|
|
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RW) | \
|
|
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY), PIE_R) | \
|
|
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED), PIE_RW) | \
|
|
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_ROX), PIE_RX) | \
|
|
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_EXEC), PIE_RWX) | \
|
|
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_RO), PIE_R) | \
|
|
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL), PIE_RW))
|
|
|
|
#endif /* __ASM_PGTABLE_PROT_H */
|