Mirror of https://github.com/torvalds/linux.git (synced 2026-04-18 06:44:00 -04:00)
Pull MM updates from Andrew Morton:

 - "maple_tree: Replace big node with maple copy" (Liam Howlett)
   Mainly preparatory work for ongoing development, but it does reduce
   stack usage and is an improvement.

 - "mm, swap: swap table phase III: remove swap_map" (Kairui Song)
   Offers memory savings by removing the static swap_map. It also yields
   some CPU savings and implements several cleanups.

 - "mm: memfd_luo: preserve file seals" (Pratyush Yadav)
   File seal preservation for LUO's memfd code.

 - "mm: zswap: add per-memcg stat for incompressible pages" (Jiayuan Chen)
   Additional userspace stats reporting for zswap.

 - "arch, mm: consolidate empty_zero_page" (Mike Rapoport)
   Some cleanups in our handling of ZERO_PAGE() and zero_pfn.

 - "mm/kmemleak: Improve scan_should_stop() implementation" (Zhongqiu Han)
   A robustness improvement and some cleanups in the kmemleak code.

 - "Improve khugepaged scan logic" (Vernon Yang)
   Improve khugepaged scan logic and reduce CPU consumption by
   prioritizing the scanning of tasks that access memory frequently.

 - "Make KHO Stateless" (Jason Miu)
   Simplify Kexec Handover by transitioning KHO from an xarray-based
   metadata tracking system with serialization to a radix tree data
   structure that can be passed directly to the next kernel.

 - "mm: vmscan: add PID and cgroup ID to vmscan tracepoints" (Thomas
   Ballasi and Steven Rostedt)
   Enhance vmscan's tracepoints.

 - "mm: arch/shstk: Common shadow stack mapping helper and VM_NOHUGEPAGE"
   (Catalin Marinas)
   Cleanup of the shadow stack code: remove per-arch code in favour of a
   generic implementation.

 - "Fix KASAN support for KHO restored vmalloc regions" (Pasha Tatashin)
   Fix a WARN() which can be emitted when KHO restores a vmalloc area.

 - "mm: Remove stray references to pagevec" (Tal Zussman)
   Several cleanups, mainly updating references to "struct pagevec",
   which became folio_batch three years ago.

 - "mm: Eliminate fake head pages from vmemmap optimization" (Kiryl
   Shutsemau)
   Simplify the HugeTLB vmemmap optimization (HVO) by changing how tail
   pages encode their relationship to the head page.

 - "mm/damon/core: improve DAMOS quota efficiency for core layer filters"
   (SeongJae Park)
   Improve two problematic behaviors of DAMOS that make it less efficient
   when core layer filters are used.

 - "mm/damon: strictly respect min_nr_regions" (SeongJae Park)
   Improve DAMON usability by extending the treatment of the
   min_nr_regions user-settable parameter.

 - "mm/page_alloc: pcp locking cleanup" (Vlastimil Babka)
   The proper fix for a previously hotfixed SMP=n issue. Code
   simplifications and cleanups ensued.

 - "mm: cleanups around unmapping / zapping" (David Hildenbrand)
   A bunch of cleanups around unmapping and zapping: mostly
   simplifications, code movement, documentation and renaming of zapping
   functions.

 - "support batched checking of the young flag for MGLRU" (Baolin Wang)
   Batched checking of the young flag for MGLRU. It is partly cleanups;
   one benchmark shows large performance benefits on arm64.

 - "memcg: obj stock and slab stat caching cleanups" (Johannes Weiner)
   memcg cleanups and robustness improvements.

 - "Allow order zero pages in page reporting" (Yuvraj Sakshith)
   Enhance free page reporting, which presently and undesirably omits
   order-0 pages when reporting free memory.
- "mm: vma flag tweaks" (Lorenzo Stoakes) Cleanup work following from the recent conversion of the VMA flags to a bitmap - "mm/damon: add optional debugging-purpose sanity checks" (SeongJae Park) Add some more developer-facing debug checks into DAMON core - "mm/damon: test and document power-of-2 min_region_sz requirement" (SeongJae Park) An additional DAMON kunit test and makes some adjustments to the addr_unit parameter handling - "mm/damon/core: make passed_sample_intervals comparisons overflow-safe" (SeongJae Park) Fix a hard-to-hit time overflow issue in DAMON core - "mm/damon: improve/fixup/update ratio calculation, test and documentation" (SeongJae Park) A batch of misc/minor improvements and fixups for DAMON - "mm: move vma_(kernel|mmu)_pagesize() out of hugetlb.c" (David Hildenbrand) Fix a possible issue with dax-device when CONFIG_HUGETLB=n. Some code movement was required. - "zram: recompression cleanups and tweaks" (Sergey Senozhatsky) A somewhat random mix of fixups, recompression cleanups and improvements in the zram code - "mm/damon: support multiple goal-based quota tuning algorithms" (SeongJae Park) Extend DAMOS quotas goal auto-tuning to support multiple tuning algorithms that users can select - "mm: thp: reduce unnecessary start_stop_khugepaged()" (Breno Leitao) Fix the khugpaged sysfs handling so we no longer spam the logs with reams of junk when starting/stopping khugepaged - "mm: improve map count checks" (Lorenzo Stoakes) Provide some cleanups and slight fixes in the mremap, mmap and vma code - "mm/damon: support addr_unit on default monitoring targets for modules" (SeongJae Park) Extend the use of DAMON core's addr_unit tunable - "mm: khugepaged cleanups and mTHP prerequisites" (Nico Pache) Cleanups to khugepaged and is a base for Nico's planned khugepaged mTHP support - "mm: memory hot(un)plug and SPARSEMEM cleanups" (David Hildenbrand) Code movement and cleanups in the memhotplug and sparsemem code - "mm: remove CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE and cleanup CONFIG_MIGRATION" (David Hildenbrand) Rationalize some memhotplug Kconfig support - "change young flag check functions to return bool" (Baolin Wang) Cleanups to change all young flag check functions to return bool - "mm/damon/sysfs: fix memory leak and NULL dereference issues" (Josh Law and SeongJae Park) Fix a few potential DAMON bugs - "mm/vma: convert vm_flags_t to vma_flags_t in vma code" (Lorenzo Stoakes) Convert a lot of the existing use of the legacy vm_flags_t data type to the new vma_flags_t type which replaces it. Mainly in the vma code. - "mm: expand mmap_prepare functionality and usage" (Lorenzo Stoakes) Expand the mmap_prepare functionality, which is intended to replace the deprecated f_op->mmap hook which has been the source of bugs and security issues for some time. Cleanups, documentation, extension of mmap_prepare into filesystem drivers - "mm/huge_memory: refactor zap_huge_pmd()" (Lorenzo Stoakes) Simplify and clean up zap_huge_pmd(). Additional cleanups around vm_normal_folio_pmd() and the softleaf functionality are performed. 
* tag 'mm-stable-2026-04-13-21-45' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (369 commits)
  mm: fix deferred split queue races during migration
  mm/khugepaged: fix issue with tracking lock
  mm/huge_memory: add and use has_deposited_pgtable()
  mm/huge_memory: add and use normal_or_softleaf_folio_pmd()
  mm: add softleaf_is_valid_pmd_entry(), pmd_to_softleaf_folio()
  mm/huge_memory: separate out the folio part of zap_huge_pmd()
  mm/huge_memory: use mm instead of tlb->mm
  mm/huge_memory: remove unnecessary sanity checks
  mm/huge_memory: deduplicate zap deposited table call
  mm/huge_memory: remove unnecessary VM_BUG_ON_PAGE()
  mm/huge_memory: add a common exit path to zap_huge_pmd()
  mm/huge_memory: handle buggy PMD entry in zap_huge_pmd()
  mm/huge_memory: have zap_huge_pmd return a boolean, add kdoc
  mm/huge: avoid big else branch in zap_huge_pmd()
  mm/huge_memory: simplify vma_is_specal_huge()
  mm: on remap assert that input range within the proposed VMA
  mm: add mmap_action_map_kernel_pages[_full]()
  uio: replace deprecated mmap hook with mmap_prepare in uio_info
  drivers: hv: vmbus: replace deprecated mmap hook with mmap_prepare
  mm: allow handling of stacked mmap_prepare hooks in more drivers
  ...
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 */

#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/memblock.h>
#include <linux/memremap.h>
#include <linux/pkeys.h>
#include <linux/debugfs.h>
#include <linux/proc_fs.h>
#include <linux/page_table_check.h>

#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/trace.h>
#include <asm/powernv.h>
#include <asm/firmware.h>
#include <asm/ultravisor.h>
#include <asm/kexec.h>

#include <mm/mmu_decl.h>
#include <trace/events/thp.h>

struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
EXPORT_SYMBOL_GPL(mmu_psize_defs);

#ifdef CONFIG_SPARSEMEM_VMEMMAP
int mmu_vmemmap_psize = MMU_PAGE_4K;
#endif

unsigned long __pmd_frag_nr;
EXPORT_SYMBOL(__pmd_frag_nr);
unsigned long __pmd_frag_size_shift;
EXPORT_SYMBOL(__pmd_frag_size_shift);

#ifdef CONFIG_KFENCE
extern bool kfence_early_init;
static int __init parse_kfence_early_init(char *arg)
{
        int val;

        if (get_option(&arg, &val))
                kfence_early_init = !!val;
        return 0;
}
early_param("kfence.sample_interval", parse_kfence_early_init);
#endif
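
/*
 * The helpers below provide the huge (PMD/PUD level) page table primitives
 * that the generic transparent hugepage code expects from the architecture.
 */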
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * This is called when relaxing access to a hugepage. It's also called in the page
 * fault path when we don't hit any of the major fault cases, i.e., a minor
 * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
 * handled those two for us, we additionally deal with missing execute
 * permission here on some processors
 */
int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp, pmd_t entry, int dirty)
{
        int changed;
#ifdef CONFIG_DEBUG_VM
        WARN_ON(!pmd_trans_huge(*pmdp));
        assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp));
#endif
        changed = !pmd_same(*(pmdp), entry);
        if (changed) {
                /*
                 * We can use MMU_PAGE_2M here, because only the radix
                 * path looks at the psize.
                 */
                __ptep_set_access_flags(vma, pmdp_ptep(pmdp),
                                        pmd_pte(entry), address, MMU_PAGE_2M);
        }
        return changed;
}

int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
                          pud_t *pudp, pud_t entry, int dirty)
{
        int changed;
#ifdef CONFIG_DEBUG_VM
        assert_spin_locked(pud_lockptr(vma->vm_mm, pudp));
#endif
        changed = !pud_same(*(pudp), entry);
        if (changed) {
                /*
                 * We can use MMU_PAGE_1G here, because only the radix
                 * path looks at the psize.
                 */
                __ptep_set_access_flags(vma, pudp_ptep(pudp),
                                        pud_pte(entry), address, MMU_PAGE_1G);
        }
        return changed;
}
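
/*
 * Test-and-clear the accessed (young) bit in a huge PMD/PUD entry and
 * report whether it was set; the core MM uses this for page aging.
 */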
bool pmdp_test_and_clear_young(struct vm_area_struct *vma,
                               unsigned long address, pmd_t *pmdp)
{
        return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}

bool pudp_test_and_clear_young(struct vm_area_struct *vma,
                               unsigned long address, pud_t *pudp)
{
        return __pudp_test_and_clear_young(vma->vm_mm, address, pudp);
}

/*
 * set a new huge pmd. We should not be called for updating
 * an existing pmd entry. That should go via pmd_hugepage_update.
 */
void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                pmd_t *pmdp, pmd_t pmd)
{
#ifdef CONFIG_DEBUG_VM
        /*
         * Make sure hardware valid bit is not set. We don't do
         * tlb flush for this update.
         */

        WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
        assert_spin_locked(pmd_lockptr(mm, pmdp));
        WARN_ON(!(pmd_leaf(pmd)));
#endif
        trace_hugepage_set_pmd(addr, pmd_val(pmd));
        page_table_check_pmd_set(mm, addr, pmdp, pmd);
        return set_pte_at_unchecked(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}

void set_pud_at(struct mm_struct *mm, unsigned long addr,
                pud_t *pudp, pud_t pud)
{
#ifdef CONFIG_DEBUG_VM
        /*
         * Make sure hardware valid bit is not set. We don't do
         * tlb flush for this update.
         */

        WARN_ON(pte_hw_valid(pud_pte(*pudp)));
        assert_spin_locked(pud_lockptr(mm, pudp));
        WARN_ON(!(pud_leaf(pud)));
#endif
        trace_hugepage_set_pud(addr, pud_val(pud));
        page_table_check_pud_set(mm, addr, pudp, pud);
        return set_pte_at_unchecked(mm, addr, pudp_ptep(pudp), pud_pte(pud));
}

/*
 * We use this to invalidate a pmdp entry before switching from a
 * hugepte to regular pmd entry.
 */
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                      pmd_t *pmdp)
{
        pmd_t old_pmd;

        VM_WARN_ON_ONCE(!pmd_present(*pmdp));
        old_pmd = __pmd(pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID));
        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        page_table_check_pmd_clear(vma->vm_mm, address, old_pmd);

        return old_pmd;
}

pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
                      pud_t *pudp)
{
        pud_t old_pud;

        VM_WARN_ON_ONCE(!pud_present(*pudp));
        old_pud = __pud(pud_hugepage_update(vma->vm_mm, address, pudp, _PAGE_PRESENT, _PAGE_INVALID));
        flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
        page_table_check_pud_clear(vma->vm_mm, address, old_pud);

        return old_pud;
}
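
/*
 * Clear a huge PMD/PUD and return the old value. When "full" is set the
 * whole address space is being torn down, so the deferred fullmm TLB flush
 * makes an immediate range flush unnecessary.
 */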
pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
                                   unsigned long addr, pmd_t *pmdp, int full)
{
        pmd_t pmd;
        bool was_present = pmd_present(*pmdp);

        VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
        VM_BUG_ON(was_present && !pmd_trans_huge(*pmdp));
        /*
         * Check pmdp_huge_get_and_clear() for non-present pmd case.
         */
        pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
        /*
         * If it is not a fullmm flush, then we can possibly end up converting
         * this PMD pte entry to a regular level 0 PTE by a parallel page fault.
         * Make sure we flush the tlb in this case. TLB flush not needed for
         * non-present case.
         */
        if (was_present && !full)
                flush_pmd_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
        return pmd;
}

pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
                                   unsigned long addr, pud_t *pudp, int full)
{
        pud_t pud;

        VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
        VM_BUG_ON(!pud_present(*pudp));
        pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp);
        /*
         * If it is not a fullmm flush, then we can possibly end up converting
         * this PUD pte entry to a regular level 0 PTE by a parallel page fault.
         * Make sure we flush the tlb in this case.
         */
        if (!full)
                flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE);
        return pud;
}

static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
{
        return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
}

static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot)
{
        return __pud(pud_val(pud) | pgprot_val(pgprot));
}

/*
 * At some point we should be able to get rid of
 * pmd_mkhuge() and mk_huge_pmd() when we update all the
 * other archs to mark the pmd huge in pfn_pmd()
 */
pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
{
        unsigned long pmdv;

        pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;

        return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot));
}

pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot)
{
        unsigned long pudv;

        pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;

        return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot));
}

pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
        unsigned long pmdv;

        pmdv = pmd_val(pmd);
        pmdv &= _HPAGE_CHG_MASK;
        return pmd_set_protbits(__pmd(pmdv), newprot);
}

pud_t pud_modify(pud_t pud, pgprot_t newprot)
{
        unsigned long pudv;

        pudv = pud_val(pud);
        pudv &= _HPAGE_CHG_MASK;
        return pud_set_protbits(__pud(pudv), newprot);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/* For use by kexec, called with MMU off */
notrace void mmu_cleanup_all(void)
{
        if (radix_enabled())
                radix__mmu_cleanup_all();
        else if (mmu_hash_ops.hpte_clear_all)
                mmu_hash_ops.hpte_clear_all();

        reset_sprs();
}
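
/*
 * Memory hotplug: extend or shrink the kernel linear mapping when a memory
 * section is added or removed, dispatching to the radix or hash variant.
 */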
#ifdef CONFIG_MEMORY_HOTPLUG
int __meminit create_section_mapping(unsigned long start, unsigned long end,
                                     int nid, pgprot_t prot)
{
        if (radix_enabled())
                return radix__create_section_mapping(start, end, nid, prot);

        return hash__create_section_mapping(start, end, nid, prot);
}

int __meminit remove_section_mapping(unsigned long start, unsigned long end)
{
        if (radix_enabled())
                return radix__remove_section_mapping(start, end);

        return hash__remove_section_mapping(start, end);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
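
/*
 * The partition table is consumed by the host MMU and the nest MMU; when an
 * ultravisor is present the authoritative copy lives in secure memory and is
 * updated via ultravisor calls. The low bits of the PTCR encode the table
 * size as (PATB_SIZE_SHIFT - 12).
 */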
void __init mmu_partition_table_init(void)
{
        unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
        unsigned long ptcr;

        /* Initialize the Partition Table with no entries */
        partition_tb = memblock_alloc_or_panic(patb_size, patb_size);
        ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
        set_ptcr_when_no_uv(ptcr);
        powernv_set_nmmu_ptcr(ptcr);
}

static void flush_partition(unsigned int lpid, bool radix)
{
        if (radix) {
                radix__flush_all_lpid(lpid);
                radix__flush_all_lpid_guest(lpid);
        } else {
                asm volatile("ptesync" : : : "memory");
                asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
                             "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
                /* do we need fixup here ?*/
                asm volatile("eieio; tlbsync; ptesync" : : : "memory");
                trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0);
        }
}

void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
                                   unsigned long dw1, bool flush)
{
        unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);

        /*
         * When ultravisor is enabled, the partition table is stored in secure
         * memory and can only be accessed doing an ultravisor call. However, we
         * maintain a copy of the partition table in normal memory to allow Nest
         * MMU translations to occur (for normal VMs).
         *
         * Therefore, here we always update partition_tb, regardless of whether
         * we are running under an ultravisor or not.
         */
        partition_tb[lpid].patb0 = cpu_to_be64(dw0);
        partition_tb[lpid].patb1 = cpu_to_be64(dw1);

        /*
         * If ultravisor is enabled, we do an ultravisor call to register the
         * partition table entry (PATE), which also does a global flush of TLBs
         * and partition table caches for the lpid. Otherwise, just do the
         * flush. The type of flush (hash or radix) depends on what the previous
         * use of the partition ID was, not the new use.
         */
        if (firmware_has_feature(FW_FEATURE_ULTRAVISOR)) {
                uv_register_pate(lpid, dw0, dw1);
                pr_info("PATE registered by ultravisor: dw0 = 0x%lx, dw1 = 0x%lx\n",
                        dw0, dw1);
        } else if (flush) {
                /*
                 * Boot does not need to flush, because MMU is off and each
                 * CPU does a tlbiel_all() before switching them on, which
                 * flushes everything.
                 */
                flush_partition(lpid, (old & PATB_HR));
        }
}
EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
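
/*
 * PMD fragments: one backing page can hold PMD_FRAG_NR page tables of
 * PMD_FRAG_SIZE bytes each. mm->context.pmd_frag caches the next unused
 * fragment of the most recently allocated page and pt_frag_refcount tracks
 * how many fragments of that page are still in use.
 */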
static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
{
        void *pmd_frag, *ret;

        if (PMD_FRAG_NR == 1)
                return NULL;

        spin_lock(&mm->page_table_lock);
        ret = mm->context.pmd_frag;
        if (ret) {
                pmd_frag = ret + PMD_FRAG_SIZE;
                /*
                 * If we have taken up all the fragments, mark the cached
                 * page NULL.
                 */
                if (((unsigned long)pmd_frag & ~PAGE_MASK) == 0)
                        pmd_frag = NULL;
                mm->context.pmd_frag = pmd_frag;
        }
        spin_unlock(&mm->page_table_lock);
        return (pmd_t *)ret;
}

static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
{
        void *ret = NULL;
        struct ptdesc *ptdesc;
        gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;

        if (mm == &init_mm)
                gfp &= ~__GFP_ACCOUNT;
        ptdesc = pagetable_alloc(gfp, 0);
        if (!ptdesc)
                return NULL;
        if (!pagetable_pmd_ctor(mm, ptdesc)) {
                pagetable_free(ptdesc);
                return NULL;
        }

        atomic_set(&ptdesc->pt_frag_refcount, 1);

        ret = ptdesc_address(ptdesc);
        /*
         * if we support only one fragment just return the
         * allocated page.
         */
        if (PMD_FRAG_NR == 1)
                return ret;

        spin_lock(&mm->page_table_lock);
        /*
         * If no fragment page is cached, publish this page and its remaining
         * fragments for later allocations; otherwise return the page with a
         * single-fragment refcount.
         */
        if (likely(!mm->context.pmd_frag)) {
                atomic_set(&ptdesc->pt_frag_refcount, PMD_FRAG_NR);
                mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
        }
        spin_unlock(&mm->page_table_lock);

        return (pmd_t *)ret;
}

pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
        pmd_t *pmd;

        pmd = get_pmd_from_cache(mm);
        if (pmd)
                return pmd;

        return __alloc_for_pmdcache(mm);
}

void pmd_fragment_free(unsigned long *pmd)
{
        struct ptdesc *ptdesc = virt_to_ptdesc(pmd);

        if (pagetable_is_reserved(ptdesc))
                return free_reserved_ptdesc(ptdesc);

        BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
        if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
                pagetable_dtor(ptdesc);
                pagetable_free(ptdesc);
        }
}

static inline void pgtable_free(void *table, int index)
{
        switch (index) {
        case PTE_INDEX:
                pte_fragment_free(table, 0);
                break;
        case PMD_INDEX:
                pmd_fragment_free(table);
                break;
        case PUD_INDEX:
                __pud_free(table);
                break;
        /* We don't free pgd table via RCU callback */
        default:
                BUG();
        }
}
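
/*
 * Page table pages are naturally aligned, so the low bits of the table
 * pointer are free: pgtable_free_tlb() stashes the level index there and
 * __tlb_remove_table() unpacks it once the table can safely be freed.
 */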
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
{
        unsigned long pgf = (unsigned long)table;

        BUG_ON(index > MAX_PGTABLE_INDEX_SIZE);
        pgf |= index;
        tlb_remove_table(tlb, (void *)pgf);
}

void __tlb_remove_table(void *_table)
{
        void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
        unsigned int index = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;

        return pgtable_free(table, index);
}
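
/*
 * direct_pages_count[] tracks how many linear-mapping ("direct map") pages
 * of each MMU page size are in use; arch_report_meminfo() converts the
 * counts to kB, e.g. a 2M page is 2^11 kB, hence the "<< 11".
 */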
#ifdef CONFIG_PROC_FS
atomic_long_t direct_pages_count[MMU_PAGE_COUNT];

void arch_report_meminfo(struct seq_file *m)
{
        seq_printf(m, "DirectMap4k: %8lu kB\n",
                   atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2);
        seq_printf(m, "DirectMap64k: %8lu kB\n",
                   atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6);
        if (radix_enabled()) {
                seq_printf(m, "DirectMap2M: %8lu kB\n",
                           atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11);
                seq_printf(m, "DirectMap1G: %8lu kB\n",
                           atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20);
        } else {
                seq_printf(m, "DirectMap16M: %8lu kB\n",
                           atomic_long_read(&direct_pages_count[MMU_PAGE_16M]) << 14);
                seq_printf(m, "DirectMap16G: %8lu kB\n",
                           atomic_long_read(&direct_pages_count[MMU_PAGE_16G]) << 24);
        }
}
#endif /* CONFIG_PROC_FS */
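
/*
 * Transactional PTE protection update: ptep_modify_prot_start() invalidates
 * the entry so hardware cannot update it concurrently, and
 * ptep_modify_prot_commit() installs the new PTE.
 */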
pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
                             pte_t *ptep)
{
        unsigned long pte_val;

        /*
         * Clear the _PAGE_PRESENT so that no hardware parallel update is
         * possible. Also keep the pte_present true so that we don't take
         * a wrong fault.
         */
        pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0);

        return __pte(pte_val);

}

void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
                             pte_t *ptep, pte_t old_pte, pte_t pte)
{
        if (radix_enabled())
                return radix__ptep_modify_prot_commit(vma, addr,
                                                      ptep, old_pte, pte);
        set_pte_at_unchecked(vma->vm_mm, addr, ptep, pte);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * For hash translation mode, we use the deposited table to store hash slot
 * information and they are stored at PTRS_PER_PMD offset from related pmd
 * location. Hence a pmd move requires deposit and withdraw.
 *
 * For radix translation with split pmd ptl, we store the deposited table in the
 * pmd page. Hence if we have different pmd page we need to withdraw during pmd
 * move.
 *
 * With hash we use deposited table always irrespective of anon or not.
 * With radix we use deposited table only for anonymous mapping.
 */
int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
                           struct spinlock *old_pmd_ptl,
                           struct vm_area_struct *vma)
{
        if (radix_enabled())
                return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);

        return true;
}
#endif

/*
 * Does the CPU support tlbie?
 */
bool tlbie_capable __read_mostly = IS_ENABLED(CONFIG_PPC_RADIX_BROADCAST_TLBIE);
EXPORT_SYMBOL(tlbie_capable);

/*
 * Should tlbie be used for management of CPU TLBs, for kernel and process
 * address spaces? tlbie may still be used for nMMU accelerators, and for KVM
 * guest address spaces.
 */
bool tlbie_enabled __read_mostly = IS_ENABLED(CONFIG_PPC_RADIX_BROADCAST_TLBIE);
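
/*
 * The "disable_tlbie" kernel parameter turns off use of broadcast tlbie for
 * TLB management; it is only honoured with the Radix MMU.
 */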
static int __init setup_disable_tlbie(char *str)
{
        if (!radix_enabled()) {
                pr_err("disable_tlbie: Unable to disable TLBIE with Hash MMU.\n");
                return 1;
        }

        tlbie_capable = false;
        tlbie_enabled = false;

        return 1;
}
__setup("disable_tlbie", setup_disable_tlbie);

static int __init pgtable_debugfs_setup(void)
{
        if (!tlbie_capable)
                return 0;

        /*
         * There is no locking vs tlb flushing when changing this value.
         * The tlb flushers will see one value or another, and use either
         * tlbie or tlbiel with IPIs. In both cases the TLBs will be
         * invalidated as expected.
         */
        debugfs_create_bool("tlbie_enabled", 0600,
                            arch_debugfs_dir,
                            &tlbie_enabled);

        return 0;
}
arch_initcall(pgtable_debugfs_setup);

#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN)
/*
 * Override the generic version in mm/memremap.c.
 *
 * With hash translation, the direct-map range is mapped with just one
 * page size selected by htab_init_page_sizes(). Consult
 * mmu_psize_defs[] to determine the minimum page size alignment.
 */
unsigned long memremap_compat_align(void)
{
        if (!radix_enabled()) {
                unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift;
                return max(SUBSECTION_SIZE, 1UL << shift);
        }

        return SUBSECTION_SIZE;
}
EXPORT_SYMBOL_GPL(memremap_compat_align);
#endif
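
/*
 * Translate VMA flags into page protection bits, layering the powerpc
 * specific bits (SAO, protection keys) on top of the generic
 * protection_map[] entry.
 */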
pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
{
        unsigned long prot;

        /* Radix supports execute-only, but protection_map maps X -> RX */
        if (!radix_enabled() && ((vm_flags & VM_ACCESS_FLAGS) == VM_EXEC))
                vm_flags |= VM_READ;

        prot = pgprot_val(protection_map[vm_flags & (VM_ACCESS_FLAGS | VM_SHARED)]);

        if (vm_flags & VM_SAO)
                prot |= _PAGE_SAO;

#ifdef CONFIG_PPC_MEM_KEYS
        prot |= vmflag_to_pte_pkey_bits(vm_flags);
#endif

        return __pgprot(prot);
}
EXPORT_SYMBOL(vm_get_page_prot);