Files
linux/arch/arm64/mm/trans_pgd.c
Ryan Roberts 15bfba1ad7 arm64: mm: Handle invalid large leaf mappings correctly
It has been possible for a long time to mark ptes in the linear map as
invalid. This is done for secretmem, kfence, realm dma memory un/share,
and others, by simply clearing the PTE_VALID bit. But until commit
a166563e7e ("arm64: mm: support large block mapping when
rodata=full") large leaf mappings were never made invalid in this way.

It turns out various parts of the code base are not equipped to handle
invalid large leaf mappings (in the way they are currently encoded) and
I've observed a kernel panic while booting a realm guest on a
BBML2_NOABORT system as a result:

[   15.432706] software IO TLB: Memory encryption is active and system is using DMA bounce buffers
[   15.476896] Unable to handle kernel paging request at virtual address ffff000019600000
[   15.513762] Mem abort info:
[   15.527245]   ESR = 0x0000000096000046
[   15.548553]   EC = 0x25: DABT (current EL), IL = 32 bits
[   15.572146]   SET = 0, FnV = 0
[   15.592141]   EA = 0, S1PTW = 0
[   15.612694]   FSC = 0x06: level 2 translation fault
[   15.640644] Data abort info:
[   15.661983]   ISV = 0, ISS = 0x00000046, ISS2 = 0x00000000
[   15.694875]   CM = 0, WnR = 1, TnD = 0, TagAccess = 0
[   15.723740]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[   15.755776] swapper pgtable: 4k pages, 48-bit VAs, pgdp=0000000081f3f000
[   15.800410] [ffff000019600000] pgd=0000000000000000, p4d=180000009ffff403, pud=180000009fffe403, pmd=00e8000199600704
[   15.855046] Internal error: Oops: 0000000096000046 [#1]  SMP
[   15.886394] Modules linked in:
[   15.900029] CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 7.0.0-rc4-dirty #4 PREEMPT
[   15.935258] Hardware name: linux,dummy-virt (DT)
[   15.955612] pstate: 21400005 (nzCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
[   15.986009] pc : __pi_memcpy_generic+0x128/0x22c
[   16.006163] lr : swiotlb_bounce+0xf4/0x158
[   16.024145] sp : ffff80008000b8f0
[   16.038896] x29: ffff80008000b8f0 x28: 0000000000000000 x27: 0000000000000000
[   16.069953] x26: ffffb3976d261ba8 x25: 0000000000000000 x24: ffff000019600000
[   16.100876] x23: 0000000000000001 x22: ffff0000043430d0 x21: 0000000000007ff0
[   16.131946] x20: 0000000084570010 x19: 0000000000000000 x18: ffff00001ffe3fcc
[   16.163073] x17: 0000000000000000 x16: 00000000003fffff x15: 646e612065766974
[   16.194131] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
[   16.225059] x11: 0000000000000000 x10: 0000000000000010 x9 : 0000000000000018
[   16.256113] x8 : 0000000000000018 x7 : 0000000000000000 x6 : 0000000000000000
[   16.287203] x5 : ffff000019607ff0 x4 : ffff000004578000 x3 : ffff000019600000
[   16.318145] x2 : 0000000000007ff0 x1 : ffff000004570010 x0 : ffff000019600000
[   16.349071] Call trace:
[   16.360143]  __pi_memcpy_generic+0x128/0x22c (P)
[   16.380310]  swiotlb_tbl_map_single+0x154/0x2b4
[   16.400282]  swiotlb_map+0x5c/0x228
[   16.415984]  dma_map_phys+0x244/0x2b8
[   16.432199]  dma_map_page_attrs+0x44/0x58
[   16.449782]  virtqueue_map_page_attrs+0x38/0x44
[   16.469596]  virtqueue_map_single_attrs+0xc0/0x130
[   16.490509]  virtnet_rq_alloc.isra.0+0xa4/0x1fc
[   16.510355]  try_fill_recv+0x2a4/0x584
[   16.526989]  virtnet_open+0xd4/0x238
[   16.542775]  __dev_open+0x110/0x24c
[   16.558280]  __dev_change_flags+0x194/0x20c
[   16.576879]  netif_change_flags+0x24/0x6c
[   16.594489]  dev_change_flags+0x48/0x7c
[   16.611462]  ip_auto_config+0x258/0x1114
[   16.628727]  do_one_initcall+0x80/0x1c8
[   16.645590]  kernel_init_freeable+0x208/0x2f0
[   16.664917]  kernel_init+0x24/0x1e0
[   16.680295]  ret_from_fork+0x10/0x20
[   16.696369] Code: 927cec03 cb0e0021 8b0e0042 a9411c26 (a900340c)
[   16.723106] ---[ end trace 0000000000000000 ]---
[   16.752866] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
[   16.792556] Kernel Offset: 0x3396ea200000 from 0xffff800080000000
[   16.818966] PHYS_OFFSET: 0xfff1000080000000
[   16.837237] CPU features: 0x0000000,00060005,13e38581,957e772f
[   16.862904] Memory Limit: none
[   16.876526] ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b ]---

This panic occurs because the swiotlb memory was previously shared to
the host (__set_memory_enc_dec()), which involves transitioning the
(large) leaf mappings to invalid, sharing to the host, then marking the
mappings valid again. But pageattr_p[mu]d_entry() would only update the
entry if it is a section mapping, since otherwise it concluded it must
be a table entry so shouldn't be modified. But p[mu]d_sect() only
returns true if the entry is valid. So the result was that the large
leaf entry was made invalid in the first pass then ignored in the second
pass. It remains invalid until the above code tries to access it and
blows up.

The simple fix would be to update pageattr_pmd_entry() to use
!pmd_table() instead of pmd_sect(). That would solve this problem.

But the ptdump code also suffers from a similar issue. It checks
pmd_leaf() and doesn't call into the arch-specific note_page() machinery
if it returns false. As a result of this, ptdump wasn't even able to
show the invalid large leaf mappings; it looked like they were valid
which made this super fun to debug. the ptdump code is core-mm and
pmd_table() is arm64-specific so we can't use the same trick to solve
that.

But we already support the concept of "present-invalid" for user space
entries. And even better, pmd_leaf() will return true for a leaf mapping
that is marked present-invalid. So let's just use that encoding for
present-invalid kernel mappings too. Then we can use pmd_leaf() where we
previously used pmd_sect() and everything is magically fixed.

Additionally, from inspection kernel_page_present() was broken in a
similar way, so I'm also updating that to use pmd_leaf().

The transitional page tables component was also similarly broken; it
creates a copy of the kernel page tables, making RO leaf mappings RW in
the process. It also makes invalid (but-not-none) pte mappings valid.
But it was not doing this for large leaf mappings. This could have
resulted in crashes at kexec- or hibernate-time. This code is fixed to
flip "present-invalid" mappings back to "present-valid" at all levels.

Finally, I have hardened split_pmd()/split_pud() so that if it is passed
a "present-invalid" leaf, it will maintain that property in the split
leaves, since I wasn't able to convince myself that it would only ever
be called for "present-valid" leaves.

Fixes: a166563e7e ("arm64: mm: support large block mapping when rodata=full")
Cc: stable@vger.kernel.org
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2026-04-02 20:49:16 +01:00

274 lines
7.1 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Transitional page tables for kexec and hibernate
*
* This file derived from: arch/arm64/kernel/hibernate.c
*
* Copyright (c) 2021, Microsoft Corporation.
* Pasha Tatashin <pasha.tatashin@soleen.com>
*
*/
/*
* Transitional tables are used during system transferring from one world to
* another: such as during hibernate restore, and kexec reboots. During these
* phases one cannot rely on page table not being overwritten. This is because
* hibernate and kexec can overwrite the current page tables during transition.
*/
#include <asm/trans_pgd.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <linux/suspend.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/kfence.h>
static void *trans_alloc(struct trans_pgd_info *info)
{
return info->trans_alloc_page(info->trans_alloc_arg);
}
static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp,
pmd_t *src_pmdp, unsigned long start, unsigned long end)
{
pte_t *src_ptep;
pte_t *dst_ptep;
unsigned long addr = start;
dst_ptep = trans_alloc(info);
if (!dst_ptep)
return -ENOMEM;
pmd_populate_kernel(NULL, dst_pmdp, dst_ptep);
dst_ptep = pte_offset_kernel(dst_pmdp, start);
src_ptep = pte_offset_kernel(src_pmdp, start);
do {
pte_t pte = __ptep_get(src_ptep);
if (pte_none(pte))
continue;
__set_pte(dst_ptep, pte_mkvalid_k(pte_mkwrite_novma(pte)));
} while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end);
return 0;
}
static int copy_pmd(struct trans_pgd_info *info, pud_t *dst_pudp,
pud_t *src_pudp, unsigned long start, unsigned long end)
{
pmd_t *src_pmdp;
pmd_t *dst_pmdp;
unsigned long next;
unsigned long addr = start;
if (pud_none(READ_ONCE(*dst_pudp))) {
dst_pmdp = trans_alloc(info);
if (!dst_pmdp)
return -ENOMEM;
pud_populate(NULL, dst_pudp, dst_pmdp);
}
dst_pmdp = pmd_offset(dst_pudp, start);
src_pmdp = pmd_offset(src_pudp, start);
do {
pmd_t pmd = READ_ONCE(*src_pmdp);
next = pmd_addr_end(addr, end);
if (pmd_none(pmd))
continue;
if (pmd_table(pmd)) {
if (copy_pte(info, dst_pmdp, src_pmdp, addr, next))
return -ENOMEM;
} else {
set_pmd(dst_pmdp, pmd_mkvalid_k(pmd_mkwrite_novma(pmd)));
}
} while (dst_pmdp++, src_pmdp++, addr = next, addr != end);
return 0;
}
static int copy_pud(struct trans_pgd_info *info, p4d_t *dst_p4dp,
p4d_t *src_p4dp, unsigned long start,
unsigned long end)
{
pud_t *dst_pudp;
pud_t *src_pudp;
unsigned long next;
unsigned long addr = start;
if (p4d_none(READ_ONCE(*dst_p4dp))) {
dst_pudp = trans_alloc(info);
if (!dst_pudp)
return -ENOMEM;
p4d_populate(NULL, dst_p4dp, dst_pudp);
}
dst_pudp = pud_offset(dst_p4dp, start);
src_pudp = pud_offset(src_p4dp, start);
do {
pud_t pud = READ_ONCE(*src_pudp);
next = pud_addr_end(addr, end);
if (pud_none(pud))
continue;
if (pud_table(pud)) {
if (copy_pmd(info, dst_pudp, src_pudp, addr, next))
return -ENOMEM;
} else {
set_pud(dst_pudp, pud_mkvalid_k(pud_mkwrite_novma(pud)));
}
} while (dst_pudp++, src_pudp++, addr = next, addr != end);
return 0;
}
static int copy_p4d(struct trans_pgd_info *info, pgd_t *dst_pgdp,
pgd_t *src_pgdp, unsigned long start,
unsigned long end)
{
p4d_t *dst_p4dp;
p4d_t *src_p4dp;
unsigned long next;
unsigned long addr = start;
if (pgd_none(READ_ONCE(*dst_pgdp))) {
dst_p4dp = trans_alloc(info);
if (!dst_p4dp)
return -ENOMEM;
pgd_populate(NULL, dst_pgdp, dst_p4dp);
}
dst_p4dp = p4d_offset(dst_pgdp, start);
src_p4dp = p4d_offset(src_pgdp, start);
do {
next = p4d_addr_end(addr, end);
if (p4d_none(READ_ONCE(*src_p4dp)))
continue;
if (copy_pud(info, dst_p4dp, src_p4dp, addr, next))
return -ENOMEM;
} while (dst_p4dp++, src_p4dp++, addr = next, addr != end);
return 0;
}
static int copy_page_tables(struct trans_pgd_info *info, pgd_t *dst_pgdp,
unsigned long start, unsigned long end)
{
unsigned long next;
unsigned long addr = start;
pgd_t *src_pgdp = pgd_offset_k(start);
dst_pgdp = pgd_offset_pgd(dst_pgdp, start);
do {
next = pgd_addr_end(addr, end);
if (pgd_none(READ_ONCE(*src_pgdp)))
continue;
if (copy_p4d(info, dst_pgdp, src_pgdp, addr, next))
return -ENOMEM;
} while (dst_pgdp++, src_pgdp++, addr = next, addr != end);
return 0;
}
/*
* Create trans_pgd and copy linear map.
* info: contains allocator and its argument
* dst_pgdp: new page table that is created, and to which map is copied.
* start: Start of the interval (inclusive).
* end: End of the interval (exclusive).
*
* Returns 0 on success, and -ENOMEM on failure.
*/
int trans_pgd_create_copy(struct trans_pgd_info *info, pgd_t **dst_pgdp,
unsigned long start, unsigned long end)
{
int rc;
pgd_t *trans_pgd = trans_alloc(info);
if (!trans_pgd) {
pr_err("Failed to allocate memory for temporary page tables.\n");
return -ENOMEM;
}
rc = copy_page_tables(info, trans_pgd, start, end);
if (!rc)
*dst_pgdp = trans_pgd;
return rc;
}
/*
* The page we want to idmap may be outside the range covered by VA_BITS that
* can be built using the kernel's p?d_populate() helpers. As a one off, for a
* single page, we build these page tables bottom up and just assume that will
* need the maximum T0SZ.
*
* Returns 0 on success, and -ENOMEM on failure.
* On success trans_ttbr0 contains page table with idmapped page, t0sz is set to
* maximum T0SZ for this page.
*/
int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0,
unsigned long *t0sz, void *page)
{
phys_addr_t dst_addr = virt_to_phys(page);
unsigned long pfn = __phys_to_pfn(dst_addr);
int max_msb = (dst_addr & GENMASK(52, 48)) ? 51 : 47;
int bits_mapped = PAGE_SHIFT - 4;
unsigned long level_mask, prev_level_entry, *levels[4];
int this_level, index, level_lsb, level_msb;
dst_addr &= PAGE_MASK;
prev_level_entry = pte_val(pfn_pte(pfn, PAGE_KERNEL_ROX));
for (this_level = 3; this_level >= 0; this_level--) {
levels[this_level] = trans_alloc(info);
if (!levels[this_level])
return -ENOMEM;
level_lsb = ARM64_HW_PGTABLE_LEVEL_SHIFT(this_level);
level_msb = min(level_lsb + bits_mapped, max_msb);
level_mask = GENMASK_ULL(level_msb, level_lsb);
index = (dst_addr & level_mask) >> level_lsb;
*(levels[this_level] + index) = prev_level_entry;
pfn = virt_to_pfn(levels[this_level]);
prev_level_entry = pte_val(pfn_pte(pfn,
__pgprot(PMD_TYPE_TABLE)));
if (level_msb == max_msb)
break;
}
*trans_ttbr0 = phys_to_ttbr(__pfn_to_phys(pfn));
*t0sz = TCR_T0SZ(max_msb + 1);
return 0;
}
/*
* Create a copy of the vector table so we can call HVC_SET_VECTORS or
* HVC_SOFT_RESTART from contexts where the table may be overwritten.
*/
int trans_pgd_copy_el2_vectors(struct trans_pgd_info *info,
phys_addr_t *el2_vectors)
{
void *hyp_stub = trans_alloc(info);
if (!hyp_stub)
return -ENOMEM;
*el2_vectors = virt_to_phys(hyp_stub);
memcpy(hyp_stub, &trans_pgd_stub_vectors, ARM64_VECTOR_TABLE_LEN);
caches_clean_inval_pou((unsigned long)hyp_stub,
(unsigned long)hyp_stub +
ARM64_VECTOR_TABLE_LEN);
dcache_clean_inval_poc((unsigned long)hyp_stub,
(unsigned long)hyp_stub +
ARM64_VECTOR_TABLE_LEN);
return 0;
}