mm/sparse: add vmemmap_*_hvo functions

Add a few functions to enable early HVO:

vmemmap_populate_hvo
vmemmap_undo_hvo
vmemmap_wrprotect_hvo

The populate and undo functions are expected to be used in early init,
from the sparse_init_nid_early() function.  The wrprotect function is
intended to be used later, once memmap initialization is done.
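
As a rough sketch of the intended flow (the call sites shown are
illustrative only, with made-up vmemmap_start / vmemmap_end / nid /
headsize values; the real callers are added by later patches in this
series):

        /*
         * Early boot, e.g. from sparse_init_nid_early(): map the
         * vmemmap of a memblock-allocated gigantic page HVO-style,
         * leaving the mirrored tail page structs read-write for now.
         */
        if (vmemmap_populate_hvo(vmemmap_start, vmemmap_end, nid, headsize))
                /* Fall back to a normal vmemmap mapping. */
                return vmemmap_populate(vmemmap_start, vmemmap_end, nid, NULL);

        /*
         * Memory init: if the page turns out to span multiple zones,
         * the HVO mapping must be undone again.
         */
        if (vmemmap_undo_hvo(vmemmap_start, vmemmap_end, nid, headsize))
                return -ENOMEM;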

To implement these functions, mostly re-use the existing compound pages
vmemmap logic used by DAX.  vmemmap_populate_address has its argument
changed a bit in this commit: the page structure passed in to be reused in
the mapping is replaced by a PFN and a flag.  The flag indicates whether
an extra ref should be taken on the vmemmap page containing the head page
structure.  Taking the ref is appropriate for DAX / ZONE_DEVICE, but
not for HugeTLB HVO.
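
Concretely, callers now pass a PFN (or -1 to have a new vmemmap page
allocated) plus a flags word, instead of a page structure pointer.  The
calls below are taken from this patch; the comments are added here for
illustration:

        /* No reuse: allocate a fresh vmemmap page (was reuse == NULL). */
        pte = vmemmap_populate_address(addr, node, NULL, -1, 0);

        /* DAX / ZONE_DEVICE: reuse the PFN and take a ref on its page. */
        rc = vmemmap_populate_range(next, last, node, NULL,
                                    pte_pfn(ptep_get(pte)),
                                    VMEMMAP_POPULATE_PAGEREF);

        /* HugeTLB HVO: reuse the PFN, but do not take a ref. */
        return vmemmap_populate_range(maddr, end, node, NULL,
                                      pte_pfn(ptep_get(pte)), 0);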

The HugeTLB vmemmap optimization maps tail page structure pages read-only.
The vmemmap_wrprotect_hvo function that does this is implemented
separately, because it cannot be guaranteed that reserved page structures
will not be written to during memory initialization.  Even with
CONFIG_DEFERRED_STRUCT_PAGE_INIT, they might still be written to (if they
are at the bottom of a zone).  So, vmemmap_populate_hvo leaves the tail
page structure pages RW initially; vmemmap_wrprotect_hvo must then be
called later, after memmap init is fully done, to finish the job.
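
For example (a hypothetical call site; the real one arrives in a later
patch of this series, and vmemmap_start / vmemmap_end / nid / headsize
are illustrative names), once the memmap for the page is fully
initialized:

        /* All page structures are initialized; make the mirrors RO. */
        vmemmap_wrprotect_hvo(vmemmap_start, vmemmap_end, nid, headsize);

        /* The caller is responsible for TLB flushing. */
        flush_tlb_kernel_range(vmemmap_start, vmemmap_end);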

Subsequent commits will use these functions for early HugeTLB HVO.

Link: https://lkml.kernel.org/r/20250228182928.2645936-15-fvdl@google.com
Signed-off-by: Frank van der Linden <fvdl@google.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Dan Carpenter <dan.carpenter@linaro.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Joao Martins <joao.m.martins@oracle.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Roman Gushchin (Cruise) <roman.gushchin@linux.dev>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

@@ -30,6 +30,13 @@
 #include <asm/dma.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
+
+/*
+ * Flags for vmemmap_populate_range and friends.
+ */
+/* Get a ref on the head page struct page, for ZONE_DEVICE compound pages */
+#define VMEMMAP_POPULATE_PAGEREF	0x0001
+
 #include "internal.h"
@@ -144,17 +151,18 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
 pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
                                        struct vmem_altmap *altmap,
-                                       struct page *reuse)
+                                       unsigned long ptpfn, unsigned long flags)
 {
         pte_t *pte = pte_offset_kernel(pmd, addr);
 
         if (pte_none(ptep_get(pte))) {
                 pte_t entry;
                 void *p;
 
-                if (!reuse) {
+                if (ptpfn == (unsigned long)-1) {
                         p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
                         if (!p)
                                 return NULL;
+                        ptpfn = PHYS_PFN(__pa(p));
                 } else {
                         /*
                          * When a PTE/PMD entry is freed from the init_mm
@@ -165,10 +173,10 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
                          * and through vmemmap_populate_compound_pages() when
                          * slab is available.
                          */
-                        get_page(reuse);
-                        p = page_to_virt(reuse);
+                        if (flags & VMEMMAP_POPULATE_PAGEREF)
+                                get_page(pfn_to_page(ptpfn));
                 }
-                entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+                entry = pfn_pte(ptpfn, PAGE_KERNEL);
                 set_pte_at(&init_mm, addr, pte, entry);
         }
         return pte;
@@ -238,7 +246,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
 static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
                                                   struct vmem_altmap *altmap,
-                                                  struct page *reuse)
+                                                  unsigned long ptpfn,
+                                                  unsigned long flags)
 {
         pgd_t *pgd;
         p4d_t *p4d;
@@ -258,7 +267,7 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
         pmd = vmemmap_pmd_populate(pud, addr, node);
         if (!pmd)
                 return NULL;
-        pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
+        pte = vmemmap_pte_populate(pmd, addr, node, altmap, ptpfn, flags);
         if (!pte)
                 return NULL;
         vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
@@ -269,13 +278,15 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
 static int __meminit vmemmap_populate_range(unsigned long start,
                                             unsigned long end, int node,
                                             struct vmem_altmap *altmap,
-                                            struct page *reuse)
+                                            unsigned long ptpfn,
+                                            unsigned long flags)
 {
         unsigned long addr = start;
         pte_t *pte;
 
         for (; addr < end; addr += PAGE_SIZE) {
-                pte = vmemmap_populate_address(addr, node, altmap, reuse);
+                pte = vmemmap_populate_address(addr, node, altmap,
+                                               ptpfn, flags);
                 if (!pte)
                         return -ENOMEM;
         }
@@ -286,7 +297,107 @@ static int __meminit vmemmap_populate_range(unsigned long start,
 int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
                                          int node, struct vmem_altmap *altmap)
 {
-        return vmemmap_populate_range(start, end, node, altmap, NULL);
+        return vmemmap_populate_range(start, end, node, altmap, -1, 0);
+}
+
+/*
+ * Undo populate_hvo, and replace it with a normal base page mapping.
+ * Used in memory init in case a HVO mapping needs to be undone.
+ *
+ * This can happen when it is discovered that a memblock allocated
+ * hugetlb page spans multiple zones, which can only be verified
+ * after zones have been initialized.
+ *
+ * We know that:
+ * 1) The first @headsize / PAGE_SIZE vmemmap pages were individually
+ *    allocated through memblock, and mapped.
+ *
+ * 2) The rest of the vmemmap pages are mirrors of the last head page.
+ */
+int __meminit vmemmap_undo_hvo(unsigned long addr, unsigned long end,
+                               int node, unsigned long headsize)
+{
+        unsigned long maddr, pfn;
+        pte_t *pte;
+        int headpages;
+
+        /*
+         * Should only be called early in boot, so nothing will
+         * be accessing these page structures.
+         */
+        WARN_ON(!early_boot_irqs_disabled);
+
+        headpages = headsize >> PAGE_SHIFT;
+
+        /*
+         * Clear mirrored mappings for tail page structs.
+         */
+        for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) {
+                pte = virt_to_kpte(maddr);
+                pte_clear(&init_mm, maddr, pte);
+        }
+
+        /*
+         * Clear and free mappings for head page and first tail page
+         * structs.
+         */
+        for (maddr = addr; headpages-- > 0; maddr += PAGE_SIZE) {
+                pte = virt_to_kpte(maddr);
+                pfn = pte_pfn(ptep_get(pte));
+                pte_clear(&init_mm, maddr, pte);
+                memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE);
+        }
+
+        flush_tlb_kernel_range(addr, end);
+
+        return vmemmap_populate(addr, end, node, NULL);
+}
+
+/*
+ * Write protect the mirrored tail page structs for HVO. This will be
+ * called from the hugetlb code when gathering and initializing the
+ * memblock allocated gigantic pages. The write protect can't be
+ * done earlier, since it can't be guaranteed that the reserved
+ * page structures will not be written to during initialization,
+ * even if CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled.
+ *
+ * The PTEs are known to exist, and nothing else should be touching
+ * these pages. The caller is responsible for any TLB flushing.
+ */
+void vmemmap_wrprotect_hvo(unsigned long addr, unsigned long end,
+                           int node, unsigned long headsize)
+{
+        unsigned long maddr;
+        pte_t *pte;
+
+        for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) {
+                pte = virt_to_kpte(maddr);
+                ptep_set_wrprotect(&init_mm, maddr, pte);
+        }
+}
+
+/*
+ * Populate vmemmap pages HVO-style. The first page contains the head
+ * page and needed tail pages, the other ones are mirrors of the first
+ * page.
+ */
+int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end,
+                                   int node, unsigned long headsize)
+{
+        pte_t *pte;
+        unsigned long maddr;
+
+        for (maddr = addr; maddr < addr + headsize; maddr += PAGE_SIZE) {
+                pte = vmemmap_populate_address(maddr, node, NULL, -1, 0);
+                if (!pte)
+                        return -ENOMEM;
+        }
+
+        /*
+         * Reuse the last page struct page mapped above for the rest.
+         */
+        return vmemmap_populate_range(maddr, end, node, NULL,
+                                      pte_pfn(ptep_get(pte)), 0);
 }
 
 void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
@@ -409,7 +520,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
                  * with just tail struct pages.
                  */
                 return vmemmap_populate_range(start, end, node, NULL,
-                                              pte_page(ptep_get(pte)));
+                                              pte_pfn(ptep_get(pte)),
+                                              VMEMMAP_POPULATE_PAGEREF);
         }
 
         size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
@@ -417,13 +529,13 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
                 unsigned long next, last = addr + size;
 
                 /* Populate the head page vmemmap page */
-                pte = vmemmap_populate_address(addr, node, NULL, NULL);
+                pte = vmemmap_populate_address(addr, node, NULL, -1, 0);
                 if (!pte)
                         return -ENOMEM;
 
                 /* Populate the tail pages vmemmap page */
                 next = addr + PAGE_SIZE;
-                pte = vmemmap_populate_address(next, node, NULL, NULL);
+                pte = vmemmap_populate_address(next, node, NULL, -1, 0);
                 if (!pte)
                         return -ENOMEM;
@@ -433,7 +545,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
                  */
                 next += PAGE_SIZE;
                 rc = vmemmap_populate_range(next, last, node, NULL,
-                                            pte_page(ptep_get(pte)));
+                                            pte_pfn(ptep_get(pte)),
+                                            VMEMMAP_POPULATE_PAGEREF);
                 if (rc)
                         return -ENOMEM;
         }