mm/sparse: add vmemmap_*_hvo functions
Add a few functions to enable early HVO:

	vmemmap_populate_hvo
	vmemmap_undo_hvo
	vmemmap_wrprotect_hvo

The populate and undo functions are expected to be used in early init, from the sparse_init_nid_early() function. The wrprotect function is to be used, potentially, later.

To implement these functions, mostly re-use the existing compound pages vmemmap logic used by DAX. vmemmap_populate_address has its arguments changed a bit in this commit: the page structure passed in to be reused in the mapping is replaced by a PFN and a flag. The flag indicates whether an extra ref should be taken on the vmemmap page containing the head page structure. Taking the ref is appropriate for DAX / ZONE_DEVICE, but not for HugeTLB HVO.

The HugeTLB vmemmap optimization maps tail page structure pages read-only. The vmemmap_wrprotect_hvo function that does this is implemented separately, because it cannot be guaranteed that reserved page structures will not be write accessed during memory initialization. Even with CONFIG_DEFERRED_STRUCT_PAGE_INIT, they might still be written to (if they are at the bottom of a zone). So, vmemmap_populate_hvo leaves the tail page structure pages RW initially, and then later during initialization, after memmap init is fully done, vmemmap_wrprotect_hvo must be called to finish the job.

Subsequent commits will use these functions for early HugeTLB HVO.

Link: https://lkml.kernel.org/r/20250228182928.2645936-15-fvdl@google.com
Signed-off-by: Frank van der Linden <fvdl@google.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Dan Carpenter <dan.carpenter@linaro.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Joao Martins <joao.m.martins@oracle.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Roman Gushchin (Cruise) <roman.gushchin@linux.dev>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Committed by: Andrew Morton
Parent: 14ed3a595f
Commit: 9eb6207b78
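For context, a minimal sketch (not part of this patch) of the call order described in the commit message: populate the vmemmap HVO-style during early sparse init, leave the mirrored tail page struct pages read-write while memmap initialization may still write to them, then write-protect them once memmap init is fully done. Only the vmemmap_*_hvo() prototypes are taken from the diff below; the wrapper functions, their arguments, and the headsize value are illustrative assumptions.

	/*
	 * Illustrative sketch only -- not part of this patch. Only the
	 * vmemmap_*_hvo() prototypes match the functions added below; the
	 * wrappers, their arguments and the headsize value are assumptions.
	 */

	/* Step 1: early sparse init (the commit message points at
	 * sparse_init_nid_early()): map the head page structs individually
	 * and mirror the tail page structs, leaving everything read-write. */
	static int __init hvo_populate_early(unsigned long vmemmap_start,
					     unsigned long vmemmap_end, int nid,
					     unsigned long headsize)
	{
		return vmemmap_populate_hvo(vmemmap_start, vmemmap_end, nid, headsize);
	}

	/* Step 2: after memmap init is fully done, write-protect the mirrored
	 * tail page structs; the caller is responsible for TLB flushing. */
	static void __init hvo_finish(unsigned long vmemmap_start,
				      unsigned long vmemmap_end, int nid,
				      unsigned long headsize)
	{
		vmemmap_wrprotect_hvo(vmemmap_start, vmemmap_end, nid, headsize);
		flush_tlb_kernel_range(vmemmap_start, vmemmap_end);
	}

	/* If a memblock-allocated huge page is later found to span multiple
	 * zones, the HVO mapping can instead be replaced by a normal base
	 * page mapping:
	 *
	 *	vmemmap_undo_hvo(vmemmap_start, vmemmap_end, nid, headsize);
	 */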
@@ -30,6 +30,13 @@
 #include <asm/dma.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 
+/*
+ * Flags for vmemmap_populate_range and friends.
+ */
+
+/* Get a ref on the head page struct page, for ZONE_DEVICE compound pages */
+#define VMEMMAP_POPULATE_PAGEREF	0x0001
+
 #include "internal.h"
 
@@ -144,17 +151,18 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
 
 pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
 				       struct vmem_altmap *altmap,
-				       struct page *reuse)
+				       unsigned long ptpfn, unsigned long flags)
 {
 	pte_t *pte = pte_offset_kernel(pmd, addr);
 	if (pte_none(ptep_get(pte))) {
 		pte_t entry;
 		void *p;
 
-		if (!reuse) {
+		if (ptpfn == (unsigned long)-1) {
 			p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
 			if (!p)
 				return NULL;
+			ptpfn = PHYS_PFN(__pa(p));
 		} else {
 			/*
 			 * When a PTE/PMD entry is freed from the init_mm
@@ -165,10 +173,10 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
 			 * and through vmemmap_populate_compound_pages() when
 			 * slab is available.
 			 */
-			get_page(reuse);
-			p = page_to_virt(reuse);
+			if (flags & VMEMMAP_POPULATE_PAGEREF)
+				get_page(pfn_to_page(ptpfn));
 		}
-		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+		entry = pfn_pte(ptpfn, PAGE_KERNEL);
 		set_pte_at(&init_mm, addr, pte, entry);
 	}
 	return pte;
@@ -238,7 +246,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
 
 static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
 						  struct vmem_altmap *altmap,
-						  struct page *reuse)
+						  unsigned long ptpfn,
+						  unsigned long flags)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
@@ -258,7 +267,7 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
 	pmd = vmemmap_pmd_populate(pud, addr, node);
 	if (!pmd)
 		return NULL;
-	pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
+	pte = vmemmap_pte_populate(pmd, addr, node, altmap, ptpfn, flags);
 	if (!pte)
 		return NULL;
 	vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
@@ -269,13 +278,15 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
 static int __meminit vmemmap_populate_range(unsigned long start,
 					    unsigned long end, int node,
 					    struct vmem_altmap *altmap,
-					    struct page *reuse)
+					    unsigned long ptpfn,
+					    unsigned long flags)
 {
 	unsigned long addr = start;
 	pte_t *pte;
 
 	for (; addr < end; addr += PAGE_SIZE) {
-		pte = vmemmap_populate_address(addr, node, altmap, reuse);
+		pte = vmemmap_populate_address(addr, node, altmap,
+					       ptpfn, flags);
 		if (!pte)
 			return -ENOMEM;
 	}
@@ -286,7 +297,107 @@ static int __meminit vmemmap_populate_range(unsigned long start,
 int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
 					 int node, struct vmem_altmap *altmap)
 {
-	return vmemmap_populate_range(start, end, node, altmap, NULL);
+	return vmemmap_populate_range(start, end, node, altmap, -1, 0);
+}
+
+/*
+ * Undo populate_hvo, and replace it with a normal base page mapping.
+ * Used in memory init in case a HVO mapping needs to be undone.
+ *
+ * This can happen when it is discovered that a memblock allocated
+ * hugetlb page spans multiple zones, which can only be verified
+ * after zones have been initialized.
+ *
+ * We know that:
+ * 1) The first @headsize / PAGE_SIZE vmemmap pages were individually
+ *    allocated through memblock, and mapped.
+ *
+ * 2) The rest of the vmemmap pages are mirrors of the last head page.
+ */
+int __meminit vmemmap_undo_hvo(unsigned long addr, unsigned long end,
+			       int node, unsigned long headsize)
+{
+	unsigned long maddr, pfn;
+	pte_t *pte;
+	int headpages;
+
+	/*
+	 * Should only be called early in boot, so nothing will
+	 * be accessing these page structures.
+	 */
+	WARN_ON(!early_boot_irqs_disabled);
+
+	headpages = headsize >> PAGE_SHIFT;
+
+	/*
+	 * Clear mirrored mappings for tail page structs.
+	 */
+	for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) {
+		pte = virt_to_kpte(maddr);
+		pte_clear(&init_mm, maddr, pte);
+	}
+
+	/*
+	 * Clear and free mappings for head page and first tail page
+	 * structs.
+	 */
+	for (maddr = addr; headpages-- > 0; maddr += PAGE_SIZE) {
+		pte = virt_to_kpte(maddr);
+		pfn = pte_pfn(ptep_get(pte));
+		pte_clear(&init_mm, maddr, pte);
+		memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE);
+	}
+
+	flush_tlb_kernel_range(addr, end);
+
+	return vmemmap_populate(addr, end, node, NULL);
+}
+
+/*
+ * Write protect the mirrored tail page structs for HVO. This will be
+ * called from the hugetlb code when gathering and initializing the
+ * memblock allocated gigantic pages. The write protect can't be
+ * done earlier, since it can't be guaranteed that the reserved
+ * page structures will not be written to during initialization,
+ * even if CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled.
+ *
+ * The PTEs are known to exist, and nothing else should be touching
+ * these pages. The caller is responsible for any TLB flushing.
+ */
+void vmemmap_wrprotect_hvo(unsigned long addr, unsigned long end,
+			   int node, unsigned long headsize)
+{
+	unsigned long maddr;
+	pte_t *pte;
+
+	for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) {
+		pte = virt_to_kpte(maddr);
+		ptep_set_wrprotect(&init_mm, maddr, pte);
+	}
+}
+
+/*
+ * Populate vmemmap pages HVO-style. The first page contains the head
+ * page and needed tail pages, the other ones are mirrors of the first
+ * page.
+ */
+int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end,
+				   int node, unsigned long headsize)
+{
+	pte_t *pte;
+	unsigned long maddr;
+
+	for (maddr = addr; maddr < addr + headsize; maddr += PAGE_SIZE) {
+		pte = vmemmap_populate_address(maddr, node, NULL, -1, 0);
+		if (!pte)
+			return -ENOMEM;
+	}
+
+	/*
+	 * Reuse the last page struct page mapped above for the rest.
+	 */
+	return vmemmap_populate_range(maddr, end, node, NULL,
+				      pte_pfn(ptep_get(pte)), 0);
 }
 
 void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
@@ -409,7 +520,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 		 * with just tail struct pages.
 		 */
 		return vmemmap_populate_range(start, end, node, NULL,
-					      pte_page(ptep_get(pte)));
+					      pte_pfn(ptep_get(pte)),
+					      VMEMMAP_POPULATE_PAGEREF);
 	}
 
 	size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
@@ -417,13 +529,13 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 		unsigned long next, last = addr + size;
 
 		/* Populate the head page vmemmap page */
-		pte = vmemmap_populate_address(addr, node, NULL, NULL);
+		pte = vmemmap_populate_address(addr, node, NULL, -1, 0);
 		if (!pte)
 			return -ENOMEM;
 
 		/* Populate the tail pages vmemmap page */
 		next = addr + PAGE_SIZE;
-		pte = vmemmap_populate_address(next, node, NULL, NULL);
+		pte = vmemmap_populate_address(next, node, NULL, -1, 0);
 		if (!pte)
 			return -ENOMEM;
 
@@ -433,7 +545,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 		 */
 		next += PAGE_SIZE;
 		rc = vmemmap_populate_range(next, last, node, NULL,
-					    pte_page(ptep_get(pte)));
+					    pte_pfn(ptep_get(pte)),
+					    VMEMMAP_POPULATE_PAGEREF);
 		if (rc)
 			return -ENOMEM;
 	}