From 396f57b5720024638dbb503f6a4abd988a49d815 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 16 Feb 2026 22:58:02 +0800 Subject: [PATCH 001/369] mm, swap: speed up hibernation allocation and writeout Since commit 0ff67f990bd4 ("mm, swap: remove swap slot cache"), hibernation has been using the swap slot slow allocation path for simplification, which turns out might cause regression for some devices because the allocator now rotates clusters too often, leading to slower allocation and more random distribution of data. Fast allocation is not complex, so implement hibernation support as well. Test result with Samsung SSD 830 Series (SATA II, 3.0 Gbps) shows the performance is several times better [1]: 6.19: 324 seconds After this series: 35 seconds Link: https://lkml.kernel.org/r/20260216-hibernate-perf-v4-1-1ba9f0bf1ec9@tencent.com Link: https://lore.kernel.org/linux-mm/8b4bdcfa-ce3f-4e23-839f-31367df7c18f@gmx.de/ [1] Signed-off-by: Kairui Song Fixes: 0ff67f990bd4 ("mm, swap: remove swap slot cache") Reported-by: Carsten Grohmann Closes: https://lore.kernel.org/linux-mm/20260206121151.dea3633d1f0ded7bbf49c22e@linux-foundation.org/ Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Kemeng Shi Cc: Nhat Pham Cc: Signed-off-by: Andrew Morton --- mm/swapfile.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 94af29d1de88..90132b74d6a0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1926,8 +1926,9 @@ out: /* Allocate a slot for hibernation */ swp_entry_t swap_alloc_hibernation_slot(int type) { - struct swap_info_struct *si = swap_type_to_info(type); - unsigned long offset; + struct swap_info_struct *pcp_si, *si = swap_type_to_info(type); + unsigned long pcp_offset, offset = SWAP_ENTRY_INVALID; + struct swap_cluster_info *ci; swp_entry_t entry = {0}; if (!si) @@ -1937,11 +1938,21 @@ swp_entry_t swap_alloc_hibernation_slot(int type) if (get_swap_device_info(si)) { if (si->flags & SWP_WRITEOK) { 
/* - * Grab the local lock to be compliant - * with swap table allocation. + * Try the local cluster first if it matches the device. If + * not, try grab a new cluster and override local cluster. */ local_lock(&percpu_swap_cluster.lock); - offset = cluster_alloc_swap_entry(si, NULL); + pcp_si = this_cpu_read(percpu_swap_cluster.si[0]); + pcp_offset = this_cpu_read(percpu_swap_cluster.offset[0]); + if (pcp_si == si && pcp_offset) { + ci = swap_cluster_lock(si, pcp_offset); + if (cluster_is_usable(ci, 0)) + offset = alloc_swap_scan_cluster(si, ci, NULL, pcp_offset); + else + swap_cluster_unlock(ci); + } + if (!offset) + offset = cluster_alloc_swap_entry(si, NULL); local_unlock(&percpu_swap_cluster.lock); if (offset) entry = swp_entry(si->type, offset); From 7e74dd031620549174eb287649aa12970ef56589 Mon Sep 17 00:00:00 2001 From: Shengming Hu Date: Thu, 29 Jan 2026 22:38:14 +0800 Subject: [PATCH 002/369] mm/page_alloc: avoid overcounting bulk alloc in watermark check alloc_pages_bulk_noprof() only fills NULL slots and already tracks how many entries are pre-populated via nr_populated. The fast watermark check was adding nr_pages unconditionally, which can overestimate the demand. Use (nr_pages - nr_populated) instead, as an upper bound on the remaining pages this call can still allocate without scanning the whole array. 
Link: https://lkml.kernel.org/r/tencent_F36C5B5FB4DED98C79D9BDEE1210CD338C06@qq.com Signed-off-by: Shengming Hu Reviewed-by: Vlastimil Babka Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2d4b6f1a554e..d88c8c67ac0b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5136,7 +5136,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, cond_accept_memory(zone, 0, alloc_flags); retry_this_zone: - mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; + mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages - nr_populated; if (zone_watermark_fast(zone, 0, mark, zonelist_zone_idx(ac.preferred_zoneref), alloc_flags, gfp)) { From b8a4b088381875ff1a93c0c2d5a926b30b1bfe31 Mon Sep 17 00:00:00 2001 From: qinyu Date: Tue, 3 Feb 2026 17:54:00 +0800 Subject: [PATCH 003/369] mm/damon/ops-common: remove redundant mmu notifier call in pmdp mkold Currently, mmu_notifier_clear_young() is called immediately after pmdp_clear_young_notify(), which already calls mmu_notifier_clear_young() internally. This results in a redundant notifier call. Replace pmdp_clear_young_notify() with the non-notify variant to avoid the duplicate call and make the pmdp path consistent with the corresponding ptep_mkold() code. 
Link: https://lkml.kernel.org/r/20260203095400.2465255-1-qin.yuA@h3c.com Signed-off-by: qinyu Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index a218d9922234..8c6d613425c1 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -90,7 +90,7 @@ void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr return; if (likely(pmd_present(pmdval))) - young |= pmdp_clear_young_notify(vma, addr, pmd); + young |= pmdp_test_and_clear_young(vma, addr, pmd); young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + HPAGE_PMD_SIZE); if (young) folio_set_young(folio); From 260d70819c37cf59383286f8dc6566eaa32776bb Mon Sep 17 00:00:00 2001 From: Altan Hacigumus Date: Tue, 3 Feb 2026 19:35:53 -0800 Subject: [PATCH 004/369] mm/shrinker: fix refcount leak in shrink_slab_memcg() When kmem is disabled for memcg, slab-backed shrinkers are skipped. However, shrink_slab_memcg() doesn't drop the reference acquired via shrinker_try_get() before continuing. Add the missing shrinker_put(). Also, since memcg_kmem_online() and shrinker flags cannot change dynamically, remove the shrinker from the bitmap to avoid unnecessary future scans. 
Link: https://lkml.kernel.org/r/20260204033553.50039-1-ahacigu.linux@gmail.com Fixes: 50d09da8e119 ("mm: shrinker: make memcg slab shrink lockless") Signed-off-by: Altan Hacigumus Acked-by: Qi Zheng Link: https://lore.kernel.org/r/20260203073757.135088-1-ahacigu.linux@gmail.com Reviewed-by: Muchun Song Cc: Dave Chinner Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/shrinker.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/shrinker.c b/mm/shrinker.c index 7b61fc0ee78f..94646ee0af63 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -544,8 +544,11 @@ again: /* Call non-slab shrinkers even though kmem is disabled */ if (!memcg_kmem_online() && - !(shrinker->flags & SHRINKER_NONSLAB)) + !(shrinker->flags & SHRINKER_NONSLAB)) { + clear_bit(offset, unit->map); + shrinker_put(shrinker); continue; + } ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) { From 34ca46cc6fc954782f859f0fc66e8fe9ce2c24f0 Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Wed, 4 Feb 2026 21:47:41 +0000 Subject: [PATCH 005/369] fs: hugetlb: simplify remove_inode_single_folio() return type When remove_inode_single_folio() was introduced in commit c86272287bc6 ("hugetlb: create remove_inode_single_folio to remove single file folio") it used to return a boolean to indicate if it bailed out due to race with page faults. However, since the race is already solved by [1], remove_inode_single_folio() doesn't have any path to return false anymore. Simplify remove_inode_single_folio() return type to void, remove the unnecessary ret variable, and adjust the call site in remove_inode_hugepages(). No functional change in this commit. 
Link: https://lkml.kernel.org/r/20260204214741.3161520-1-jiaqiyan@google.com Link: https://lore.kernel.org/all/20220914221810.95771-10-mike.kravetz@oracle.com [1] Signed-off-by: Jiaqi Yan Suggested-by: Jane Chu Reviewed-by: Jane Chu Reviewed-by: Muchun Song Acked-by: David Hildenbrand (arm) Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3f70c47981de..22c799000edb 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -513,15 +513,11 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, /* * Called with hugetlb fault mutex held. - * Returns true if page was actually removed, false otherwise. */ -static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, - struct address_space *mapping, - struct folio *folio, pgoff_t index, - bool truncate_op) +static void remove_inode_single_folio(struct hstate *h, struct inode *inode, + struct address_space *mapping, struct folio *folio, + pgoff_t index, bool truncate_op) { - bool ret = false; - /* * If folio is mapped, it was faulted in after being * unmapped in caller or hugetlb_vmdelete_list() skips @@ -543,7 +539,6 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, */ VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio); hugetlb_delete_from_page_cache(folio); - ret = true; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1))) @@ -551,7 +546,6 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, } folio_unlock(folio); - return ret; } /* @@ -599,9 +593,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, /* * Remove folio that was part of folio_batch. 
*/ - if (remove_inode_single_folio(h, inode, mapping, folio, - index, truncate_op)) - freed++; + remove_inode_single_folio(h, inode, mapping, folio, + index, truncate_op); + freed++; mutex_unlock(&hugetlb_fault_mutex_table[hash]); } From 318d87b8fa733bebdc9c803657671df0b7b8b006 Mon Sep 17 00:00:00 2001 From: xu xin Date: Thu, 12 Feb 2026 19:29:32 +0800 Subject: [PATCH 006/369] ksm: initialize the addr only once in rmap_walk_ksm This is a minor performance optimization, especially when there are many for-loop iterations, because the addr variable doesn't change across iterations. Therefore, it only needs to be initialized once before the loop. Link: https://lkml.kernel.org/r/20260212192820223O_r2NQzSEPG_C56cs-z4l@zte.com.cn Link: https://lkml.kernel.org/r/20260212192932941MSsJEAyoRW4YdLBN7_myn@zte.com.cn Signed-off-by: xu xin Acked-by: David Hildenbrand (Arm) Cc: Chengming Zhou Cc: Hugh Dickins Cc: Wang Yaxin Cc: Yang Yang Signed-off-by: Andrew Morton --- mm/ksm.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index bda71ae609ff..2a2f2f005fc3 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -3168,6 +3168,8 @@ void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) return; again: hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { + /* Ignore the stable/unstable/sqnr flags */ + const unsigned long addr = rmap_item->address & PAGE_MASK; struct anon_vma *anon_vma = rmap_item->anon_vma; struct anon_vma_chain *vmac; struct vm_area_struct *vma; @@ -3180,16 +3182,13 @@ again: } anon_vma_lock_read(anon_vma); } + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { - unsigned long addr; cond_resched(); vma = vmac->vma; - /* Ignore the stable/unstable/sqnr flags */ - addr = rmap_item->address & PAGE_MASK; - if (addr < vma->vm_start || addr >= vma->vm_end) continue; /* From 0fd66c343ce7fb9bfc2a8ae9f4461e7c610652ad Mon Sep 17 00:00:00 2001 From: Kevin Lourenco Date: Mon, 22 Dec 2025 15:18:17 +0100 
Subject: [PATCH 007/369] mm/fadvise: validate offset in generic_fadvise When converted to (u64) for page calculations, a negative offset can produce extremely large page indices. This may lead to issues in certain advice modes (excessive readahead or cache invalidation). Reject negative offsets with -EINVAL for consistent argument validation and to avoid silent misbehavior. POSIX and the man page do not clearly define behavior for negative offset/len. FreeBSD rejects negative offsets as well, so failing with -EINVAL is consistent with existing practice. The man page can be updated separately to document the Linux behavior. Link: https://lkml.kernel.org/r/20260208135738.18992-1-klourencodev@gmail.com Link: https://lkml.kernel.org/r/20251222141817.13335-1-klourencodev@gmail.com Signed-off-by: Kevin Lourenco Acked-by: David Hildenbrand (Arm) Reviewed-by: Jan Kara Cc: Christian Brauner Cc: David Hildenbrand Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/fadvise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/fadvise.c b/mm/fadvise.c index 67028e30aa91..b63fe21416ff 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -43,7 +43,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) return -ESPIPE; mapping = file->f_mapping; - if (!mapping || len < 0) + if (!mapping || len < 0 || offset < 0) return -EINVAL; bdi = inode_to_bdi(mapping->host); From 6884832472730e286d476b64882a5f6856625320 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:06 -0500 Subject: [PATCH 008/369] maple_tree: fix mas_dup_alloc() sparse warning Patch series "maple_tree: Replace big node with maple copy", v3. The big node struct was created for simplicity of splitting, rebalancing, and spanning store operations by using a copy buffer to create the data necessary prior to breaking it up into 256B nodes. 
Certain operations were rather tricky due to the restriction of keeping NULL entries together and never at the end of a node (except the right-most node). The big node struct is incompatible with future features that are currently in development. Specifically different node types and different data type sizes for pivots. The big node struct was also a stack variable, which caused issues with certain configurations of kernel build. This series removes big node by introducing another node type which will never be written to the tree: maple_copy. The maple copy node operates more like a scatter/gather operation with a number of sources and destinations of allocated nodes. The sources are copied to the destinations, in turn, until the sources are exhausted. The destination is changed if it is filled or the split location is reached prior to the source data end. New data is inserted by using the maple copy node itself as a source with up to 3 slots and pivots. The data in the maple copy node is the data being written to the tree along with any fragment of the range(s) being overwritten. As with all nodes, the maple copy node is of size 256B. Using a node type allows for the copy operation to treat the new data stored in the maple copy node the same as any other source node. Analysis of the runtime shows no regression or benefit of removing the larger stack structure. The motivation is the ground work to use new node types and to help those with odd configurations that have had issues. The change was tested by myself using mm_tests on amd64 and by Suren on android (arm64). Limited testing on s390 qemu was also performed using stress-ng on the virtual memory, which should cover many corner cases. This patch (of 30): Use RCU_INIT_POINTER to initialize an rcu pointer to an initial value since there are no readers within the tree being created during duplication. 
There is no risk of readers seeing the initialized or uninitialized value until after the synchronization call in mas_dup_build(). Link: https://lkml.kernel.org/r/20260130205935.2559335-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20260130205935.2559335-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 5aa4c9500018..0e0158ee7ba5 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6260,8 +6260,15 @@ static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas, for (i = 0; i < count; i++) { val = (unsigned long)mt_slot_locked(mas->tree, slots, i); val &= MAPLE_NODE_MASK; - new_slots[i] = ma_mnode_ptr((unsigned long)mas_pop_node(mas) | - val); + /* + * Warning, see rcu_assign_pointer() documentation. Since this + * is a duplication of a tree, there are no readers walking the + * tree until after the rcu_assign_pointer() call in + * mas_dup_build(). + */ + RCU_INIT_POINTER(new_slots[i], + ma_mnode_ptr((unsigned long)mas_pop_node(mas) | + val)); } } From 3e302560b9b49f76d97f431d7b9de6829344ba87 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:07 -0500 Subject: [PATCH 009/369] maple_tree: move mas_spanning_rebalance loop to function Move the loop over the tree levels to its own function. No intended functional changes. Link: https://lkml.kernel.org/r/20260130205935.2559335-3-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 108 +++++++++++++++++++++++++---------------------- 1 file changed, 58 insertions(+), 50 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 0e0158ee7ba5..70ad474e6ed1 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2595,49 +2595,16 @@ dead_node: return NULL; } -/* - * mas_spanning_rebalance() - Rebalance across two nodes which may not be peers. - * @mas: The starting maple state - * @mast: The maple_subtree_state, keeps track of 4 maple states. - * @count: The estimated count of iterations needed. - * - * Follow the tree upwards from @l_mas and @r_mas for @count, or until the root - * is hit. First @b_node is split into two entries which are inserted into the - * next iteration of the loop. @b_node is returned populated with the final - * iteration. @mas is used to obtain allocations. orig_l_mas keeps track of the - * nodes that will remain active by using orig_l_mas->index and orig_l_mas->last - * to account of what has been copied into the new sub-tree. The update of - * orig_l_mas->last is used in mas_consume to find the slots that will need to - * be either freed or destroyed. orig_l_mas->depth keeps track of the height of - * the new sub-tree in case the sub-tree becomes the full tree. 
- */ -static void mas_spanning_rebalance(struct ma_state *mas, +static void mas_spanning_rebalance_loop(struct ma_state *mas, struct maple_subtree_state *mast, unsigned char count) { + unsigned char split, mid_split; unsigned char slot = 0; unsigned char new_height = 0; /* used if node is a new root */ struct maple_enode *left = NULL, *middle = NULL, *right = NULL; struct maple_enode *old_enode; - MA_STATE(l_mas, mas->tree, mas->index, mas->index); - MA_STATE(r_mas, mas->tree, mas->index, mas->last); - MA_STATE(m_mas, mas->tree, mas->index, mas->index); - - /* - * The tree needs to be rebalanced and leaves need to be kept at the same level. - * Rebalancing is done by use of the ``struct maple_topiary``. - */ - mast->l = &l_mas; - mast->m = &m_mas; - mast->r = &r_mas; - l_mas.status = r_mas.status = m_mas.status = ma_none; - - /* Check if this is not root and has sufficient data. */ - if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) && - unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) - mast_spanning_rebalance(mast); - /* * Each level of the tree is examined and balanced, pushing data to the left or * right, or rebalancing against left or right nodes is employed to avoid @@ -2672,10 +2639,10 @@ static void mas_spanning_rebalance(struct ma_state *mas, mast_ascend(mast); mast_combine_cp_left(mast); - l_mas.offset = mast->bn->b_end; - mab_set_b_end(mast->bn, &l_mas, left); - mab_set_b_end(mast->bn, &m_mas, middle); - mab_set_b_end(mast->bn, &r_mas, right); + mast->l->offset = mast->bn->b_end; + mab_set_b_end(mast->bn, mast->l, left); + mab_set_b_end(mast->bn, mast->m, middle); + mab_set_b_end(mast->bn, mast->r, right); /* Copy anything necessary out of the right node. 
*/ mast_combine_cp_right(mast); @@ -2708,17 +2675,17 @@ static void mas_spanning_rebalance(struct ma_state *mas, count++; } - l_mas.node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), + mast->l->node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), mte_node_type(mast->orig_l->node)); - mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true); + mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, mast->l, true); new_height++; - mas_set_parent(mas, left, l_mas.node, slot); + mas_set_parent(mas, left, mast->l->node, slot); if (middle) - mas_set_parent(mas, middle, l_mas.node, ++slot); + mas_set_parent(mas, middle, mast->l->node, ++slot); if (right) - mas_set_parent(mas, right, l_mas.node, ++slot); + mas_set_parent(mas, right, mast->l->node, ++slot); if (mas_is_root_limits(mast->l)) { new_root: @@ -2726,20 +2693,61 @@ new_root: while (!mte_is_root(mast->orig_l->node)) mast_ascend(mast); } else { - mas_mn(&l_mas)->parent = mas_mn(mast->orig_l)->parent; + mas_mn(mast->l)->parent = mas_mn(mast->orig_l)->parent; } old_enode = mast->orig_l->node; - mas->depth = l_mas.depth; - mas->node = l_mas.node; - mas->min = l_mas.min; - mas->max = l_mas.max; - mas->offset = l_mas.offset; + mas->depth = mast->l->depth; + mas->node = mast->l->node; + mas->min = mast->l->min; + mas->max = mast->l->max; + mas->offset = mast->l->offset; mas_wmb_replace(mas, old_enode, new_height); mtree_range_walk(mas); return; } +/* + * mas_spanning_rebalance() - Rebalance across two nodes which may not be peers. + * @mas: The starting maple state + * @mast: The maple_subtree_state, keeps track of 4 maple states. + * @count: The estimated count of iterations needed. + * + * Follow the tree upwards from @l_mas and @r_mas for @count, or until the root + * is hit. First @b_node is split into two entries which are inserted into the + * next iteration of the loop. @b_node is returned populated with the final + * iteration. @mas is used to obtain allocations. 
orig_l_mas keeps track of the + * nodes that will remain active by using orig_l_mas->index and orig_l_mas->last + * to account of what has been copied into the new sub-tree. The update of + * orig_l_mas->last is used in mas_consume to find the slots that will need to + * be either freed or destroyed. orig_l_mas->depth keeps track of the height of + * the new sub-tree in case the sub-tree becomes the full tree. + */ +static void mas_spanning_rebalance(struct ma_state *mas, + struct maple_subtree_state *mast, unsigned char count) +{ + + MA_STATE(l_mas, mas->tree, mas->index, mas->index); + MA_STATE(r_mas, mas->tree, mas->index, mas->last); + MA_STATE(m_mas, mas->tree, mas->index, mas->index); + + /* + * The tree needs to be rebalanced and leaves need to be kept at the same level. + * Rebalancing is done by use of the ``struct maple_topiary``. + */ + mast->l = &l_mas; + mast->m = &m_mas; + mast->r = &r_mas; + l_mas.status = r_mas.status = m_mas.status = ma_none; + + /* Check if this is not root and has sufficient data. */ + if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) && + unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) + mast_spanning_rebalance(mast); + + mas_spanning_rebalance_loop(mas, mast, count); +} + /* * mas_rebalance() - Rebalance a given node. * @mas: The maple state From df11f9ee8f5dbe4d8a0a61c70eb5a5d37d77c8a6 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:08 -0500 Subject: [PATCH 010/369] maple_tree: extract use of big node from mas_wr_spanning_store() Isolate big node to use in its own function. No functional changes intended. Link: https://lkml.kernel.org/r/20260130205935.2559335-4-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 70ad474e6ed1..9ab42821ee2d 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2748,6 +2748,30 @@ static void mas_spanning_rebalance(struct ma_state *mas, mas_spanning_rebalance_loop(mas, mast, count); } + +static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, + struct maple_subtree_state *mast, unsigned char height, + struct ma_wr_state *l_wr_mas) +{ + struct maple_big_node b_node; + + memset(&b_node, 0, sizeof(struct maple_big_node)); + /* Copy l_mas and store the value in b_node. */ + mas_store_b_node(l_wr_mas, &b_node, mast->orig_l->end); + /* Copy r_mas into b_node if there is anything to copy. */ + if (mast->orig_r->max > mast->orig_r->last) + mas_mab_cp(mast->orig_r, mast->orig_r->offset, + mast->orig_r->end, &b_node, b_node.b_end + 1); + else + b_node.b_end++; + + /* Stop spanning searches by searching for just index. */ + mast->orig_l->index = mast->orig_l->last = mas->index; + + mast->bn = &b_node; + /* Combine l_mas and r_mas and split them up evenly again. */ + return mas_spanning_rebalance(mas, mast, height); +} /* * mas_rebalance() - Rebalance a given node. * @mas: The maple state @@ -3400,10 +3424,9 @@ done: * span. 
* @wr_mas: The maple write state */ -static noinline void mas_wr_spanning_store(struct ma_wr_state *wr_mas) +static void mas_wr_spanning_store(struct ma_wr_state *wr_mas) { struct maple_subtree_state mast; - struct maple_big_node b_node; struct ma_state *mas; unsigned char height; @@ -3467,24 +3490,9 @@ static noinline void mas_wr_spanning_store(struct ma_wr_state *wr_mas) return mas_new_root(mas, wr_mas->entry); } - memset(&b_node, 0, sizeof(struct maple_big_node)); - /* Copy l_mas and store the value in b_node. */ - mas_store_b_node(&l_wr_mas, &b_node, l_mas.end); - /* Copy r_mas into b_node if there is anything to copy. */ - if (r_mas.max > r_mas.last) - mas_mab_cp(&r_mas, r_mas.offset, r_mas.end, - &b_node, b_node.b_end + 1); - else - b_node.b_end++; - - /* Stop spanning searches by searching for just index. */ - l_mas.index = l_mas.last = mas->index; - - mast.bn = &b_node; mast.orig_l = &l_mas; mast.orig_r = &r_mas; - /* Combine l_mas and r_mas and split them up evenly again. */ - return mas_spanning_rebalance(mas, &mast, height + 1); + mas_wr_spanning_rebalance(mas, &mast, height + 1, &l_wr_mas); } /* From a2ac9935d3305a558cd16f7fb1521146fe86fe74 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:09 -0500 Subject: [PATCH 011/369] maple_tree: remove unnecessary assignment of orig_l index The index value is already a copy of the maple state so there is no need to set it again. Link: https://lkml.kernel.org/r/20260130205935.2559335-5-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 9ab42821ee2d..1e780427c04a 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2766,7 +2766,7 @@ static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, b_node.b_end++; /* Stop spanning searches by searching for just index. */ - mast->orig_l->index = mast->orig_l->last = mas->index; + mast->orig_l->last = mas->index; mast->bn = &b_node; /* Combine l_mas and r_mas and split them up evenly again. */ From 6f2e522186cbd06fef61921a55375dfeb7885f2a Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:10 -0500 Subject: [PATCH 012/369] maple_tree: inline mas_spanning_rebalance() into mas_wr_spanning_rebalance() Copy the contents of mas_spanning_rebalance() into mas_wr_spanning_rebalance(), in preparation of removing initial big node use. No functional changes intended. Link: https://lkml.kernel.org/r/20260130205935.2559335-6-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 1e780427c04a..fb14ce4a49c3 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2754,6 +2754,9 @@ static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, struct ma_wr_state *l_wr_mas) { struct maple_big_node b_node; + MA_STATE(l_mas, mas->tree, mas->index, mas->index); + MA_STATE(r_mas, mas->tree, mas->index, mas->last); + MA_STATE(m_mas, mas->tree, mas->index, mas->index); memset(&b_node, 0, sizeof(struct maple_big_node)); /* Copy l_mas and store the value in b_node. */ @@ -2770,7 +2773,22 @@ static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, mast->bn = &b_node; /* Combine l_mas and r_mas and split them up evenly again. */ - return mas_spanning_rebalance(mas, mast, height); + + /* + * The tree needs to be rebalanced and leaves need to be kept at the same level. + * Rebalancing is done by use of the ``struct maple_topiary``. + */ + mast->l = &l_mas; + mast->m = &m_mas; + mast->r = &r_mas; + l_mas.status = r_mas.status = m_mas.status = ma_none; + + /* Check if this is not root and has sufficient data. */ + if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) && + unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) + mast_spanning_rebalance(mast); + + mas_spanning_rebalance_loop(mas, mast, height); } /* * mas_rebalance() - Rebalance a given node. From 3dd3dbaac179e36861ca065cbbd7b89860ef883a Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Fri, 30 Jan 2026 15:59:11 -0500 Subject: [PATCH 013/369] maple_tree: make ma_wr_states reliable for reuse in spanning store mas_extend_spanning_null() was not modifying the range min and range max of the resulting store operation. The result was that the maple write state no longer matched what the write was doing. This was not an issue as the values were previously not used, but to make the ma_wr_state usable in future changes, the range min/max stored in the ma_wr_state for left and right need to be consistent with the operation. Link: https://lkml.kernel.org/r/20260130205935.2559335-7-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index fb14ce4a49c3..ab14876bebf7 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3319,6 +3319,7 @@ static inline void mas_extend_spanning_null(struct ma_wr_state *l_wr_mas, l_mas->index = l_mas->min; l_mas->offset = l_slot - 1; + l_wr_mas->r_min = l_mas->index; } if (!r_wr_mas->content) { @@ -3331,6 +3332,7 @@ static inline void mas_extend_spanning_null(struct ma_wr_state *l_wr_mas, r_mas->last = mas_safe_pivot(r_mas, r_wr_mas->pivots, r_wr_mas->type, r_mas->offset + 1); r_mas->offset++; + r_wr_mas->r_max = r_mas->last; } } From 41bcc348f23feacb2b6c16dd0b7aa7d024ce93cc Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:12 -0500 Subject: [PATCH 014/369] maple_tree: remove l_wr_mas from mas_wr_spanning_rebalance Use the wr_mas instead of creating another variable on the stack. Take the opportunity to remove l_mas from being used anywhere but in the maple_subtree_state. 
Link: https://lkml.kernel.org/r/20260130205935.2559335-8-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index ab14876bebf7..afa39bbd687c 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2751,7 +2751,7 @@ static void mas_spanning_rebalance(struct ma_state *mas, static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, struct maple_subtree_state *mast, unsigned char height, - struct ma_wr_state *l_wr_mas) + struct ma_wr_state *wr_mas) { struct maple_big_node b_node; MA_STATE(l_mas, mas->tree, mas->index, mas->index); @@ -2760,7 +2760,7 @@ static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, memset(&b_node, 0, sizeof(struct maple_big_node)); /* Copy l_mas and store the value in b_node. */ - mas_store_b_node(l_wr_mas, &b_node, mast->orig_l->end); + mas_store_b_node(wr_mas, &b_node, mast->orig_l->end); /* Copy r_mas into b_node if there is anything to copy. */ if (mast->orig_r->max > mast->orig_r->last) mas_mab_cp(mast->orig_r, mast->orig_r->offset, @@ -3454,7 +3454,6 @@ static void mas_wr_spanning_store(struct ma_wr_state *wr_mas) MA_STATE(l_mas, NULL, 0, 0); MA_STATE(r_mas, NULL, 0, 0); MA_WR_STATE(r_wr_mas, &r_mas, wr_mas->entry); - MA_WR_STATE(l_wr_mas, &l_mas, wr_mas->entry); /* * A store operation that spans multiple nodes is called a spanning @@ -3494,25 +3493,23 @@ static void mas_wr_spanning_store(struct ma_wr_state *wr_mas) r_mas.last = r_mas.index = mas->last; /* Set up left side. 
*/ - l_mas = *mas; - mas_wr_walk_index(&l_wr_mas); + mas_wr_walk_index(wr_mas); if (!wr_mas->entry) { - mas_extend_spanning_null(&l_wr_mas, &r_wr_mas); - mas->offset = l_mas.offset; - mas->index = l_mas.index; - mas->last = l_mas.last = r_mas.last; + mas_extend_spanning_null(wr_mas, &r_wr_mas); + mas->last = r_mas.last; } /* expanding NULLs may make this cover the entire range */ - if (!l_mas.index && r_mas.last == ULONG_MAX) { + if (!mas->index && r_mas.last == ULONG_MAX) { mas_set_range(mas, 0, ULONG_MAX); return mas_new_root(mas, wr_mas->entry); } + l_mas = *mas; mast.orig_l = &l_mas; mast.orig_r = &r_mas; - mas_wr_spanning_rebalance(mas, &mast, height + 1, &l_wr_mas); + mas_wr_spanning_rebalance(mas, &mast, height + 1, wr_mas); } /* From 368015952767753abdf13e62b31e58b8107fc916 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:13 -0500 Subject: [PATCH 015/369] maple_tree: don't pass through height in mas_wr_spanning_store Height is not used locally in the function, so call the height argument closer to where it is passed in the next level. Link: https://lkml.kernel.org/r/20260130205935.2559335-9-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index afa39bbd687c..91d3fb7ac39c 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2750,10 +2750,10 @@ static void mas_spanning_rebalance(struct ma_state *mas, static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, - struct maple_subtree_state *mast, unsigned char height, - struct ma_wr_state *wr_mas) + struct maple_subtree_state *mast, struct ma_wr_state *wr_mas) { struct maple_big_node b_node; + unsigned char height; MA_STATE(l_mas, mas->tree, mas->index, mas->index); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(m_mas, mas->tree, mas->index, mas->index); @@ -2788,6 +2788,7 @@ static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) mast_spanning_rebalance(mast); + height = mas_mt_height(mas) + 1; mas_spanning_rebalance_loop(mas, mast, height); } /* @@ -3448,7 +3449,6 @@ static void mas_wr_spanning_store(struct ma_wr_state *wr_mas) { struct maple_subtree_state mast; struct ma_state *mas; - unsigned char height; /* Left and Right side of spanning store */ MA_STATE(l_mas, NULL, 0, 0); @@ -3476,7 +3476,6 @@ static void mas_wr_spanning_store(struct ma_wr_state *wr_mas) * Node rebalancing may occur due to this store, so there may be three new * entries per level plus a new root. */ - height = mas_mt_height(mas); /* * Set up right side. 
Need to get to the next offset after the spanning @@ -3509,7 +3508,7 @@ static void mas_wr_spanning_store(struct ma_wr_state *wr_mas) l_mas = *mas; mast.orig_l = &l_mas; mast.orig_r = &r_mas; - mas_wr_spanning_rebalance(mas, &mast, height + 1, wr_mas); + mas_wr_spanning_rebalance(mas, &mast, wr_mas); } /* From 2fce1c3c47caa73ceb0c97d2a3c4c9f470ad736c Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:14 -0500 Subject: [PATCH 016/369] maple_tree: move maple_subtree_state from mas_wr_spanning_store to mas_wr_spanning_rebalance Moving the maple_subtree_state is necessary for future cleanups and is only set up in mas_wr_spanning_rebalance() but never used. Link: https://lkml.kernel.org/r/20260130205935.2559335-10-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 91d3fb7ac39c..c5bb341da5e9 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2750,46 +2750,52 @@ static void mas_spanning_rebalance(struct ma_state *mas, static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, - struct maple_subtree_state *mast, struct ma_wr_state *wr_mas) + struct ma_wr_state *wr_mas, struct ma_wr_state *r_wr_mas) { + struct maple_subtree_state mast; struct maple_big_node b_node; unsigned char height; MA_STATE(l_mas, mas->tree, mas->index, mas->index); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(m_mas, mas->tree, mas->index, mas->index); + MA_STATE(mast_l_mas, NULL, 0, 0); + + mast_l_mas = *mas; + mast.orig_l = &mast_l_mas; + mast.orig_r = r_wr_mas->mas; memset(&b_node, 0, sizeof(struct maple_big_node)); 
/* Copy l_mas and store the value in b_node. */ - mas_store_b_node(wr_mas, &b_node, mast->orig_l->end); + mas_store_b_node(wr_mas, &b_node, mast.orig_l->end); /* Copy r_mas into b_node if there is anything to copy. */ - if (mast->orig_r->max > mast->orig_r->last) - mas_mab_cp(mast->orig_r, mast->orig_r->offset, - mast->orig_r->end, &b_node, b_node.b_end + 1); + if (mast.orig_r->max > mast.orig_r->last) + mas_mab_cp(mast.orig_r, mast.orig_r->offset, + mast.orig_r->end, &b_node, b_node.b_end + 1); else b_node.b_end++; /* Stop spanning searches by searching for just index. */ - mast->orig_l->last = mas->index; + mast.orig_l->last = mas->index; - mast->bn = &b_node; + mast.bn = &b_node; /* Combine l_mas and r_mas and split them up evenly again. */ /* * The tree needs to be rebalanced and leaves need to be kept at the same level. * Rebalancing is done by use of the ``struct maple_topiary``. */ - mast->l = &l_mas; - mast->m = &m_mas; - mast->r = &r_mas; + mast.l = &l_mas; + mast.m = &m_mas; + mast.r = &r_mas; l_mas.status = r_mas.status = m_mas.status = ma_none; /* Check if this is not root and has sufficient data. */ - if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) && - unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) - mast_spanning_rebalance(mast); + if (((mast.orig_l->min != 0) || (mast.orig_r->max != ULONG_MAX)) && + unlikely(mast.bn->b_end <= mt_min_slots[mast.bn->type])) + mast_spanning_rebalance(&mast); height = mas_mt_height(mas) + 1; - mas_spanning_rebalance_loop(mas, mast, height); + mas_spanning_rebalance_loop(mas, &mast, height); } /* * mas_rebalance() - Rebalance a given node. 
@@ -3447,11 +3453,9 @@ done: */ static void mas_wr_spanning_store(struct ma_wr_state *wr_mas) { - struct maple_subtree_state mast; struct ma_state *mas; /* Left and Right side of spanning store */ - MA_STATE(l_mas, NULL, 0, 0); MA_STATE(r_mas, NULL, 0, 0); MA_WR_STATE(r_wr_mas, &r_mas, wr_mas->entry); @@ -3505,10 +3509,7 @@ static void mas_wr_spanning_store(struct ma_wr_state *wr_mas) return mas_new_root(mas, wr_mas->entry); } - l_mas = *mas; - mast.orig_l = &l_mas; - mast.orig_r = &r_mas; - mas_wr_spanning_rebalance(mas, &mast, wr_mas); + mas_wr_spanning_rebalance(mas, wr_mas, &r_wr_mas); } /* From 6b74d44b6297de89284cec8a83fce223106c159a Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:15 -0500 Subject: [PATCH 017/369] maple_tree: correct right ma_wr_state end pivot in mas_wr_spanning_store() The end_piv will be needed in the next patch set and has not been set correctly in this code path. Correct the oversight before using it. Link: https://lkml.kernel.org/r/20260130205935.2559335-11-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index c5bb341da5e9..caac936bd8d4 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3494,6 +3494,7 @@ static void mas_wr_spanning_store(struct ma_wr_state *wr_mas) r_mas.index = r_mas.last; mas_wr_walk_index(&r_wr_mas); r_mas.last = r_mas.index = mas->last; + r_wr_mas.end_piv = r_wr_mas.r_max; /* Set up left side. */ mas_wr_walk_index(wr_mas); From 9ec1e972c3de3106140c18d2a1c7c74795d85a69 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Fri, 30 Jan 2026 15:59:16 -0500 Subject: [PATCH 018/369] maple_tree: introduce maple_copy node and use it in mas_spanning_rebalance() Introduce an internal-memory only node type called maple_copy to facilitate internal copy operations. Use it in mas_spanning_rebalance() for just the leaf nodes. Initially, the maple_copy node is used to configure the source nodes and copy the data into the big_node. The maple_copy contains a list of source entries with start and end offsets. One of the maple_copy entries can be itself with an offset of 0 to 2, representing the data where the store partially overwrites entries, or fully overwrites the entry. The side effect is that the source nodes no longer have to worry about partially copying the existing offset if it is not fully overwritten. This is in preparation of removal of the maple big_node, but for the time being the data is copied to the big node to limit the change size. Link: https://lkml.kernel.org/r/20260130205935.2559335-12-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 26 +++++++ lib/maple_tree.c | 140 ++++++++++++++++++++++++++++++++++--- 2 files changed, 157 insertions(+), 9 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 7b8aad47121e..9bc7fa89bc2e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -139,6 +139,7 @@ enum maple_type { maple_leaf_64, maple_range_64, maple_arange_64, + maple_copy, }; enum store_type { @@ -154,6 +155,30 @@ enum store_type { wr_slot_store, }; +struct maple_copy { + struct { + struct maple_node *node; + unsigned long max; + unsigned char start; + unsigned char end; + enum maple_type mt; + } src[4]; + /* Simulated node */ + void __rcu *slot[3]; + unsigned long min; + union { + unsigned long pivot[3]; + struct { + void *_pad[2]; + unsigned long max; + }; + }; + unsigned char end; + + /*Avoid passing these around */ + unsigned char s_count; +}; + /** * DOC: Maple tree flags * @@ -299,6 +324,7 @@ struct maple_node { }; struct maple_range_64 mr64; struct maple_arange_64 ma64; + struct maple_copy cp; }; }; diff --git a/lib/maple_tree.c b/lib/maple_tree.c index caac936bd8d4..554fdffd6c5b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -605,6 +605,8 @@ static inline unsigned long *ma_pivots(struct maple_node *node, case maple_range_64: case maple_leaf_64: return node->mr64.pivot; + case maple_copy: + return node->cp.pivot; case maple_dense: return NULL; } @@ -624,6 +626,7 @@ static inline unsigned long *ma_gaps(struct maple_node *node, switch (type) { case maple_arange_64: return node->ma64.gap; + case maple_copy: case maple_range_64: case maple_leaf_64: case maple_dense: @@ -690,6 +693,7 @@ static inline void mte_set_pivot(struct maple_enode 
*mn, unsigned char piv, case maple_arange_64: node->ma64.pivot[piv] = val; break; + case maple_copy: case maple_dense: break; } @@ -711,6 +715,8 @@ static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt) case maple_range_64: case maple_leaf_64: return mn->mr64.slot; + case maple_copy: + return mn->cp.slot; case maple_dense: return mn->slot; } @@ -2595,6 +2601,110 @@ dead_node: return NULL; } +/* + * cp_leaf_init() - Initialize a maple_copy node for the leaf level of a + * spanning store + * @cp: The maple copy node + * @mas: The maple state + * @l_wr_mas: The left write state of the spanning store + * @r_wr_mas: The right write state of the spanning store + */ +static inline void cp_leaf_init(struct maple_copy *cp, + struct ma_state *mas, struct ma_wr_state *l_wr_mas, + struct ma_wr_state *r_wr_mas) +{ + unsigned char end = 0; + + /* + * WARNING: The use of RCU_INIT_POINTER() makes it extremely important + * to not expose the maple_copy node to any readers. Exposure may + * result in buggy code when a compiler reorders the instructions. 
+ */ + + /* Create entries to insert including split entries to left and right */ + if (l_wr_mas->r_min < mas->index) { + end++; + RCU_INIT_POINTER(cp->slot[0], l_wr_mas->content); + cp->pivot[0] = mas->index - 1; + } + RCU_INIT_POINTER(cp->slot[end], l_wr_mas->entry); + cp->pivot[end] = mas->last; + + if (r_wr_mas->end_piv > mas->last) { + end++; + RCU_INIT_POINTER(cp->slot[end], + r_wr_mas->slots[r_wr_mas->offset_end]); + cp->pivot[end] = r_wr_mas->end_piv; + } + + cp->min = l_wr_mas->r_min; + cp->max = cp->pivot[end]; + cp->end = end; +} + +static inline void append_wr_mas_cp(struct maple_copy *cp, + struct ma_wr_state *wr_mas, unsigned char start, unsigned char end) +{ + unsigned char count; + + count = cp->s_count; + cp->src[count].node = wr_mas->node; + cp->src[count].mt = wr_mas->type; + if (wr_mas->mas->end <= end) + cp->src[count].max = wr_mas->mas->max; + else + cp->src[count].max = wr_mas->pivots[end]; + + cp->src[count].start = start; + cp->src[count].end = end; + cp->s_count++; +} + +static inline void init_cp_src(struct maple_copy *cp) +{ + cp->src[cp->s_count].node = ma_mnode_ptr(cp); + cp->src[cp->s_count].mt = maple_copy; + cp->src[cp->s_count].max = cp->max; + cp->src[cp->s_count].start = 0; + cp->src[cp->s_count].end = cp->end; + cp->s_count++; +} + +static inline +void cp_data_write(struct maple_copy *cp, struct maple_big_node *b_node) +{ + struct maple_node *src; + unsigned char s; + unsigned char src_end, s_offset; + unsigned long *b_pivots, *cp_pivots; + void __rcu **b_slots, **cp_slots; + enum maple_type s_mt; + + b_node->b_end = 0; + + s = 0; + b_pivots = b_node->pivot; + b_slots = (void __rcu **)b_node->slot; + do { + unsigned char size; + + src = cp->src[s].node; + s_mt = cp->src[s].mt; + s_offset = cp->src[s].start; + src_end = cp->src[s].end; + size = src_end - s_offset + 1; + cp_pivots = ma_pivots(src, s_mt) + s_offset; + cp_slots = ma_slots(src, s_mt) + s_offset; + memcpy(b_slots, cp_slots, size * sizeof(void __rcu *)); + if (size > 
1) + memcpy(b_pivots, cp_pivots, (size - 1) * sizeof(unsigned long)); + b_pivots[size - 1] = cp->src[s].max; + b_pivots += size; + b_slots += size; + b_node->b_end += size; + } while (++s < cp->s_count); +} + static void mas_spanning_rebalance_loop(struct ma_state *mas, struct maple_subtree_state *mast, unsigned char count) { @@ -2750,10 +2860,11 @@ static void mas_spanning_rebalance(struct ma_state *mas, static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, - struct ma_wr_state *wr_mas, struct ma_wr_state *r_wr_mas) + struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas) { struct maple_subtree_state mast; struct maple_big_node b_node; + struct maple_copy cp; unsigned char height; MA_STATE(l_mas, mas->tree, mas->index, mas->index); MA_STATE(r_mas, mas->tree, mas->index, mas->last); @@ -2765,15 +2876,26 @@ static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, mast.orig_l = &mast_l_mas; mast.orig_r = r_wr_mas->mas; memset(&b_node, 0, sizeof(struct maple_big_node)); - /* Copy l_mas and store the value in b_node. */ - mas_store_b_node(wr_mas, &b_node, mast.orig_l->end); - /* Copy r_mas into b_node if there is anything to copy. */ - if (mast.orig_r->max > mast.orig_r->last) - mas_mab_cp(mast.orig_r, mast.orig_r->offset, - mast.orig_r->end, &b_node, b_node.b_end + 1); - else - b_node.b_end++; + cp.s_count = 0; + cp_leaf_init(&cp, mas, l_wr_mas, r_wr_mas); + /* Copy left 0 - offset */ + if (l_wr_mas->mas->offset) { + unsigned char off = l_wr_mas->mas->offset - 1; + append_wr_mas_cp(&cp, l_wr_mas, 0, off); + cp.src[cp.s_count - 1].max = cp.min - 1; + } + + init_cp_src(&cp); + + /* Copy right from offset_end + 1 to end */ + if (r_wr_mas->mas->end != r_wr_mas->offset_end) + append_wr_mas_cp(&cp, r_wr_mas, r_wr_mas->offset_end + 1, + r_wr_mas->mas->end); + + + b_node.type = l_wr_mas->type; + cp_data_write(&cp, &b_node); /* Stop spanning searches by searching for just index. 
*/ mast.orig_l->last = mas->index; From b14ffd2c6c7d95d140ae1e955cf2738b0fa71f17 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:17 -0500 Subject: [PATCH 019/369] maple_tree: testing update for spanning store Spanning store had some corner cases which showed up during rcu stress testing. Add explicit tests for those cases. At the same time add some locking for easier visibility of the rcu stress testing. Only a single dump of the tree will happen on the first detected issue instead of flooding the console with output. Link: https://lkml.kernel.org/r/20260130205935.2559335-13-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/radix-tree/maple.c | 172 +++++++++++++++++++++++++++++-- 1 file changed, 163 insertions(+), 9 deletions(-) diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 5c1b18e3ed21..85fb5616c133 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -38,6 +38,7 @@ struct rcu_test_struct2 { unsigned long index[RCU_RANGE_COUNT]; unsigned long last[RCU_RANGE_COUNT]; + pthread_mutex_t dump; }; struct rcu_test_struct3 { @@ -33997,8 +33998,25 @@ static void *rcu_reader_fwd(void *ptr) } } - RCU_MT_BUG_ON(test, mas.index != r_start); - RCU_MT_BUG_ON(test, mas.last != r_end); + if (mas.index != r_start) { + if (pthread_mutex_trylock(&test->dump) != 0) { + rcu_read_unlock(); + goto quit; + } + printk("start is wrong: %lx (%lu) vs expected %lx (%lu)\n", + mas.index, mas.index, r_start, r_start); + RCU_MT_BUG_ON(test, mas.index != r_start); + } + + if (mas.last != r_end) { + if (pthread_mutex_trylock(&test->dump) != 0) { + rcu_read_unlock(); + goto quit; + } + printk("last is wrong: %lx (%lu) vs 
expected %lx (%lu)\n", + mas.last, mas.last, r_end, r_end); + RCU_MT_BUG_ON(test, mas.last != r_end); + } if (i == reader->flip) { alt = xa_mk_value(index + i + RCU_RANGE_COUNT); @@ -34014,7 +34032,8 @@ static void *rcu_reader_fwd(void *ptr) else if (entry == alt) toggled = true; else { - printk("!!%lu-%lu -> %p not %p or %p\n", mas.index, mas.last, entry, expected, alt); + printk("!!%lu-%lu -> %p not %p or %p\n", + mas.index, mas.last, entry, expected, alt); RCU_MT_BUG_ON(test, 1); } @@ -34047,9 +34066,11 @@ static void *rcu_reader_fwd(void *ptr) usleep(test->pause); } +quit: rcu_unregister_thread(); return NULL; } + /* RCU reader in decreasing index */ static void *rcu_reader_rev(void *ptr) { @@ -34119,13 +34140,17 @@ static void *rcu_reader_rev(void *ptr) line = __LINE__; if (mas.index != r_start) { + if (pthread_mutex_trylock(&test->dump) != 0) { + rcu_read_unlock(); + goto quit; + } + alt = xa_mk_value(index + i * 2 + 1 + RCU_RANGE_COUNT); mt_dump(test->mt, mt_dump_dec); - printk("Error: %lu-%lu %p != %lu-%lu %p %p line %d i %d\n", - mas.index, mas.last, entry, - r_start, r_end, expected, alt, - line, i); + printk("Error: %p %lu-%lu %p != %lu-%lu %p %p line %d i %d\n", + mas.node, mas.index, mas.last, entry, + r_start, r_end, expected, alt, line, i); } RCU_MT_BUG_ON(test, mas.index != r_start); RCU_MT_BUG_ON(test, mas.last != r_end); @@ -34180,6 +34205,7 @@ static void *rcu_reader_rev(void *ptr) usleep(test->pause); } +quit: rcu_unregister_thread(); return NULL; } @@ -34329,6 +34355,7 @@ static void rcu_stress(struct maple_tree *mt, bool forward) test.seen_modified = 0; test.thread_count = 0; test.start = test.stop = false; + pthread_mutex_init(&test.dump, NULL); seed = time(NULL); srand(seed); for (i = 0; i < RCU_RANGE_COUNT; i++) { @@ -34414,6 +34441,7 @@ struct rcu_test_struct { unsigned long removed; /* The index of the removed entry */ unsigned long added; /* The index of the removed entry */ unsigned long toggle; /* The index of the removed entry */ + 
pthread_mutex_t dump; }; static inline @@ -34506,7 +34534,9 @@ static void *rcu_loop(void *ptr) /* Out of the interesting range */ if (mas.index < test->index || mas.index > test->last) { if (entry != expected) { - printk("%lx - %lx = %p not %p\n", + if (pthread_mutex_trylock(&test->dump) != 0) + break; + printk("\nERROR: %lx - %lx = %p not %p\n", mas.index, mas.last, entry, expected); } MT_BUG_ON(test->mt, entry != expected); @@ -34854,6 +34884,7 @@ static noinline void __init check_rcu_threaded(struct maple_tree *mt) vals.range_end = ULONG_MAX; vals.seen_entry2 = 0; vals.seen_entry3 = 0; + pthread_mutex_init(&vals.dump, NULL); run_check_rcu(mt, &vals); mtree_destroy(mt); @@ -35250,6 +35281,8 @@ static noinline void __init check_spanning_write(struct maple_tree *mt) { unsigned long i, max = 5000; MA_STATE(mas, mt, 1200, 2380); + struct maple_enode *enode; + struct maple_node *pnode; for (i = 0; i <= max; i++) mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); @@ -35410,6 +35443,128 @@ static noinline void __init check_spanning_write(struct maple_tree *mt) mas_set_range(&mas, 76, 875); mas_store_gfp(&mas, NULL, GFP_KERNEL); mtree_unlock(mt); + mtree_destroy(mt); + + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i <= max; i++) + mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); + + if (MAPLE_32BIT) + i = 49750; /* 0xC25B */ + else + i = 49835; /* 0xC2AB */ + + mtree_lock(mt); + /* Store a null across a boundary that ends in a null */ + mas_set(&mas, i); /* 0xC2AB */ + MT_BUG_ON(mt, mas_walk(&mas) == NULL); + MT_BUG_ON(mt, mas.end != mas.offset); + MT_BUG_ON(mt, mas_next_range(&mas, ULONG_MAX) != NULL); + mas_set_range(&mas, i, mas.last - 1); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + mt_validate(mt); + + /* Store a null across a boundary that starts and ends in a null */ + mas_set(&mas, 49849); + MT_BUG_ON(mt, mas_walk(&mas) != NULL); + MT_BUG_ON(mt, mas.index != 49846); + mas_set(&mas, 49876); + MT_BUG_ON(mt, mas_walk(&mas) != NULL); + MT_BUG_ON(mt, 
mas.last != 49879); + mas_set_range(&mas, 49849, 49876); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + /* Results in 49846-49879: (nil) */ + MT_BUG_ON(mt, mas.index != 49846); + MT_BUG_ON(mt, mas.last != 49879); + mt_validate(mt); + + /* Store a null across a boundary that starts and ends next to nulls */ + mas_set(&mas, 49800); + MT_BUG_ON(mt, mas_walk(&mas) == NULL); + MT_BUG_ON(mt, mas.index != 49800); + mas_set(&mas, 49815); + MT_BUG_ON(mt, mas_walk(&mas) == NULL); + MT_BUG_ON(mt, mas.last != 49815); + mas_set_range(&mas, 49800, 49815); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + /* Results in 49846-49879: (nil) */ + MT_BUG_ON(mt, mas.index != 49796); + MT_BUG_ON(mt, mas.last != 49819); + mt_validate(mt); + + /* Store a value across a boundary that starts and ends in a null */ + mas_set(&mas, 49907); + MT_BUG_ON(mt, mas_walk(&mas) != NULL); + MT_BUG_ON(mt, mas.index != 49906); + mas_set(&mas, 49928); + MT_BUG_ON(mt, mas_walk(&mas) != NULL); + MT_BUG_ON(mt, mas.last != 49929); + mas_set_range(&mas, 49907, 49928); + mas_store_gfp(&mas, check_spanning_write, GFP_KERNEL); + MT_BUG_ON(mt, mas.index != 49907); + MT_BUG_ON(mt, mas.last != 49928); + mt_validate(mt); + + /* Store a value across a node boundary that causes a 3 way split */ + + if (MAPLE_32BIT) + i = 49590; /* 0xc1b6 */ + else + i = 49670; /* 0xC206 */ + + mas_set(&mas, i); + MT_BUG_ON(mt, mas_walk(&mas) == NULL); + MT_BUG_ON(mt, mas.index != i); + MT_BUG_ON(mt, mas.end != mt_slot_count(mas.node) - 1); + enode = mas.node; + MT_BUG_ON(mt, mas_next_range(&mas, ULONG_MAX) != NULL); + MT_BUG_ON(mt, mas.index != i + 6); + MT_BUG_ON(mt, mas.end != mt_slot_count(mas.node) - 1); + MT_BUG_ON(mt, enode == mas.node); + mas_set_range(&mas, i + 2, i + 7); + mas_store_gfp(&mas, check_spanning_write, GFP_KERNEL); + MT_BUG_ON(mt, mas.index != i + 2); + MT_BUG_ON(mt, mas.last != i + 7); + mt_validate(mt); + + /* 2 levels of basically the same testing */ + + if (MAPLE_32BIT) { + /* 32bit needs a bit more work to fill the 
nodes. + * The two parent nodes need to be filled (they have one space + * vacant) without causing a split at the store locations (or + * the siblings). + */ + i = 44426; + mas_set(&mas, i); + mas_store_gfp(&mas, check_spanning_write, GFP_KERNEL); + i = 45126; + mas_set(&mas, i); + mas_store_gfp(&mas, check_spanning_write, GFP_KERNEL); + i = 44790; + } else { + /* 48950 - 48955 => ptr, 48956 - 48959 => NULL */ + i = 48950; + + } + mas_set(&mas, i); + MT_BUG_ON(mt, mas_walk(&mas) == NULL); + MT_BUG_ON(mt, mas.index != i); + MT_BUG_ON(mt, mas.end != mt_slot_count(mas.node) - 1); + enode = mas.node; + pnode = mte_parent(enode); + MT_BUG_ON(mt, mas_next_range(&mas, ULONG_MAX) != NULL); + MT_BUG_ON(mt, mas.index != i + 6); + MT_BUG_ON(mt, mas.end != mt_slot_count(mas.node) - 1); + MT_BUG_ON(mt, enode == mas.node); + MT_BUG_ON(mt, pnode == mte_parent(mas.node)); + mas_set_range(&mas, i + 2, i + 8); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + mt_validate(mt); + + mtree_unlock(mt); + mtree_destroy(mt); + rcu_barrier(); } /* End of spanning write testing */ @@ -36029,7 +36184,6 @@ static inline int check_vma_modification(struct maple_tree *mt) return 0; } - void farmer_tests(void) { struct maple_node *node; From f141d5664388db2ec155f61c9ce36b9b4dc307bc Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:18 -0500 Subject: [PATCH 020/369] maple_tree: inline mas_spanning_rebalance_loop() into mas_wr_spanning_rebalance() Just copy the code and replace count with height. This is done to avoid affecting other code paths into mas_spanning_rebalance_loop() for the next change. No functional change intended. Link: https://lkml.kernel.org/r/20260130205935.2559335-14-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 554fdffd6c5b..a9b7e398c7db 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2862,6 +2862,13 @@ static void mas_spanning_rebalance(struct ma_state *mas, static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas) { + + unsigned char split, mid_split; + unsigned char slot = 0; + unsigned char new_height = 0; /* used if node is a new root */ + struct maple_enode *left = NULL, *middle = NULL, *right = NULL; + struct maple_enode *old_enode; + struct maple_subtree_state mast; struct maple_big_node b_node; struct maple_copy cp; @@ -2917,7 +2924,106 @@ static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, mast_spanning_rebalance(&mast); height = mas_mt_height(mas) + 1; - mas_spanning_rebalance_loop(mas, &mast, height); + + /* + * Each level of the tree is examined and balanced, pushing data to the left or + * right, or rebalancing against left or right nodes is employed to avoid + * rippling up the tree to limit the amount of churn. Once a new sub-section of + * the tree is created, there may be a mix of new and old nodes. The old nodes + * will have the incorrect parent pointers and currently be in two trees: the + * original tree and the partially new tree. To remedy the parent pointers in + * the old tree, the new data is swapped into the active tree and a walk down + * the tree is performed and the parent pointers are updated. + * See mas_topiary_replace() for more information. 
+ */ + while (height--) { + mast.bn->b_end--; + mast.bn->type = mte_node_type(mast.orig_l->node); + split = mas_mab_to_node(mas, mast.bn, &left, &right, &middle, + &mid_split); + mast_set_split_parents(&mast, left, middle, right, split, + mid_split); + mast_cp_to_nodes(&mast, left, middle, right, split, mid_split); + new_height++; + + /* + * Copy data from next level in the tree to mast.bn from next + * iteration + */ + memset(mast.bn, 0, sizeof(struct maple_big_node)); + mast.bn->type = mte_node_type(left); + + /* Root already stored in l->node. */ + if (mas_is_root_limits(mast.l)) + goto new_root; + + mast_ascend(&mast); + mast_combine_cp_left(&mast); + mast.l->offset = mast.bn->b_end; + mab_set_b_end(mast.bn, mast.l, left); + mab_set_b_end(mast.bn, mast.m, middle); + mab_set_b_end(mast.bn, mast.r, right); + + /* Copy anything necessary out of the right node. */ + mast_combine_cp_right(&mast); + mast.orig_l->last = mast.orig_l->max; + + if (mast_sufficient(&mast)) { + if (mast_overflow(&mast)) + continue; + + if (mast.orig_l->node == mast.orig_r->node) { + /* + * The data in b_node should be stored in one + * node and in the tree + */ + slot = mast.l->offset; + break; + } + + continue; + } + + /* May be a new root stored in mast.bn */ + if (mas_is_root_limits(mast.orig_l)) + break; + + mast_spanning_rebalance(&mast); + + /* rebalancing from other nodes may require another loop. 
*/ + if (!height) + height++; + } + + mast.l->node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), + mte_node_type(mast.orig_l->node)); + + mab_mas_cp(mast.bn, 0, mt_slots[mast.bn->type] - 1, mast.l, true); + new_height++; + mas_set_parent(mas, left, mast.l->node, slot); + if (middle) + mas_set_parent(mas, middle, mast.l->node, ++slot); + + if (right) + mas_set_parent(mas, right, mast.l->node, ++slot); + + if (mas_is_root_limits(mast.l)) { +new_root: + mas_mn(mast.l)->parent = ma_parent_ptr(mas_tree_parent(mas)); + while (!mte_is_root(mast.orig_l->node)) + mast_ascend(&mast); + } else { + mas_mn(mast.l)->parent = mas_mn(mast.orig_l)->parent; + } + + old_enode = mast.orig_l->node; + mas->depth = mast.l->depth; + mas->node = mast.l->node; + mas->min = mast.l->min; + mas->max = mast.l->max; + mas->offset = mast.l->offset; + mas_wmb_replace(mas, old_enode, new_height); + mtree_range_walk(mas); } /* * mas_rebalance() - Rebalance a given node. From 6953038cab845f3720ec8d83915f4f083861e195 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:19 -0500 Subject: [PATCH 021/369] maple_tree: change initial big node setup in mas_wr_spanning_rebalance() Instead of copying the data into the big node and finding out that the data may need to be moved or appended to, calculate the data space up front (in the maple copy node) and set up another source for the copy. The additional copy source is tracked in the maple state sib (short for sibling), and is put into the maple write states for future operations after the data is in the big node. To facilitate the newly moved node, some initial setup of the maple subtree state are relocated after the potential shift caused by the new way of rebalancing against a sibling. Link: https://lkml.kernel.org/r/20260130205935.2559335-15-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 1 + lib/maple_tree.c | 175 ++++++++++++++++++++++++++++++++----- 2 files changed, 153 insertions(+), 23 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 9bc7fa89bc2e..e99e16ac1c6d 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -177,6 +177,7 @@ struct maple_copy { /*Avoid passing these around */ unsigned char s_count; + unsigned char data; }; /** diff --git a/lib/maple_tree.c b/lib/maple_tree.c index a9b7e398c7db..0d6f810a4a1f 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1304,6 +1304,18 @@ static inline unsigned char mas_data_end(struct ma_state *mas) return mt_pivots[type]; } +static inline +void wr_mas_setup(struct ma_wr_state *wr_mas, struct ma_state *mas) +{ + wr_mas->node = mas_mn(mas); + wr_mas->type = mte_node_type(mas->node); + wr_mas->pivots = ma_pivots(wr_mas->node, wr_mas->type); + wr_mas->slots = ma_slots(wr_mas->node, wr_mas->type); + wr_mas->r_min = mas_safe_min(mas, wr_mas->pivots, mas->offset); + wr_mas->r_max = mas_safe_pivot(mas, wr_mas->pivots, mas->offset, + wr_mas->type); +} + /* * mas_leaf_max_gap() - Returns the largest gap in a leaf node * @mas: the maple state @@ -2258,6 +2270,44 @@ static inline void mte_mid_split_check(struct maple_enode **l, *split = mid_split; } +static inline +void spanning_sib(struct ma_wr_state *l_wr_mas, + struct ma_wr_state *r_wr_mas, struct ma_state *nneighbour) +{ + struct ma_state l_tmp = *l_wr_mas->mas; + struct ma_state r_tmp = *r_wr_mas->mas; + unsigned char depth = 0; + + do { + mas_ascend(&r_tmp); + mas_ascend(&l_tmp); + depth++; + if (r_tmp.offset < mas_data_end(&r_tmp)) { + r_tmp.offset++; + mas_descend(&r_tmp); + r_tmp.offset = 0; + 
while (--depth) + mas_descend(&r_tmp); + + r_tmp.end = mas_data_end(&r_tmp); + *nneighbour = r_tmp; + return; + } else if (l_tmp.offset) { + l_tmp.offset--; + do { + mas_descend(&l_tmp); + l_tmp.offset = mas_data_end(&l_tmp); + } while (--depth); + + l_tmp.end = l_tmp.offset; + *nneighbour = l_tmp; + return; + } + } while (!mte_is_root(r_tmp.node)); + + WARN_ON_ONCE(1); +} + /* * mast_set_split_parents() - Helper function to set three nodes parents. Slot * is taken from @mast->l. @@ -2642,6 +2692,49 @@ static inline void cp_leaf_init(struct maple_copy *cp, cp->end = end; } +/* + * cp_data_calc() - Calculate the size of the data (1 indexed). + * @cp: The maple copy struct with the new data populated. + * @l_wr_mas: The maple write state containing the data to the left of the write + * @r_wr_mas: The maple write state containing the data to the right of the + * write + * + * cp->data is a size (not indexed by 0). + */ +static inline void cp_data_calc(struct maple_copy *cp, + struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas) +{ + + /* Add 1 every time for the 0th element */ + cp->data = l_wr_mas->mas->offset; + /* Add the new data and any partial overwrites */ + cp->data += cp->end + 1; + /* Data from right (offset + 1 to end), +1 for zero */ + cp->data += r_wr_mas->mas->end - r_wr_mas->offset_end; +} + +static inline void append_mas_cp(struct maple_copy *cp, + struct ma_state *mas, unsigned char start, unsigned char end) +{ + struct maple_node *node; + enum maple_type mt; + unsigned char count; + + count = cp->s_count; + node = mas_mn(mas); + mt = mte_node_type(mas->node); + cp->src[count].node = node; + cp->src[count].mt = mt; + if (mas->end <= end) + cp->src[count].max = mas->max; + else + cp->src[count].max = ma_pivots(node, mt)[end]; + + cp->src[count].start = start; + cp->src[count].end = end; + cp->s_count++; +} + static inline void append_wr_mas_cp(struct maple_copy *cp, struct ma_wr_state *wr_mas, unsigned char start, unsigned char end) { @@ -2670,6 
+2763,42 @@ static inline void init_cp_src(struct maple_copy *cp) cp->s_count++; } +/* + * multi_src_setup() - Set the @cp node up with multiple sources to copy from. + * @cp: The maple copy node + * @l_wr_mas: The left write maple state + * @r_wr_mas: The right write maple state + * @sib: The sibling maple state + * + * Note: @sib->end == 0 indicates no sibling will be used. + */ +static inline +void multi_src_setup(struct maple_copy *cp, struct ma_wr_state *l_wr_mas, + struct ma_wr_state *r_wr_mas, struct ma_state *sib) +{ + cp->s_count = 0; + if (sib->end && sib->max < l_wr_mas->mas->min) + append_mas_cp(cp, sib, 0, sib->end); + + /* Copy left 0 - offset */ + if (l_wr_mas->mas->offset) { + unsigned char off = l_wr_mas->mas->offset - 1; + + append_wr_mas_cp(cp, l_wr_mas, 0, off); + cp->src[cp->s_count - 1].max = cp->min - 1; + } + + init_cp_src(cp); + + /* Copy right either from offset or offset + 1 pending on r_max */ + if (r_wr_mas->mas->end != r_wr_mas->offset_end) + append_wr_mas_cp(cp, r_wr_mas, r_wr_mas->offset_end + 1, + r_wr_mas->mas->end); + + if (sib->end && sib->min > r_wr_mas->mas->max) + append_mas_cp(cp, sib, 0, sib->end); +} + static inline void cp_data_write(struct maple_copy *cp, struct maple_big_node *b_node) { @@ -2873,36 +3002,42 @@ static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, struct maple_big_node b_node; struct maple_copy cp; unsigned char height; + struct ma_state sib; MA_STATE(l_mas, mas->tree, mas->index, mas->index); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(m_mas, mas->tree, mas->index, mas->index); MA_STATE(mast_l_mas, NULL, 0, 0); - mast_l_mas = *mas; - mast.orig_l = &mast_l_mas; - mast.orig_r = r_wr_mas->mas; memset(&b_node, 0, sizeof(struct maple_big_node)); + mast_l_mas = *mas; cp.s_count = 0; cp_leaf_init(&cp, mas, l_wr_mas, r_wr_mas); - /* Copy left 0 - offset */ - if (l_wr_mas->mas->offset) { - unsigned char off = l_wr_mas->mas->offset - 1; - - append_wr_mas_cp(&cp, l_wr_mas, 0, off); - 
cp.src[cp.s_count - 1].max = cp.min - 1; + cp_data_calc(&cp, l_wr_mas, r_wr_mas); + if (((l_wr_mas->mas->min != 0) || (r_wr_mas->mas->max != ULONG_MAX)) && + (cp.data <= mt_min_slots[l_wr_mas->type])) { + spanning_sib(l_wr_mas, r_wr_mas, &sib); + cp.data += sib.end + 1; + } else { + sib.end = 0; } - init_cp_src(&cp); - - /* Copy right from offset_end + 1 to end */ - if (r_wr_mas->mas->end != r_wr_mas->offset_end) - append_wr_mas_cp(&cp, r_wr_mas, r_wr_mas->offset_end + 1, - r_wr_mas->mas->end); - - + multi_src_setup(&cp, l_wr_mas, r_wr_mas, &sib); b_node.type = l_wr_mas->type; cp_data_write(&cp, &b_node); + if (sib.end) { + if (sib.max < l_wr_mas->mas->min) { + *l_wr_mas->mas = sib; + wr_mas_setup(l_wr_mas, &sib); + mast_l_mas = sib; + } else { + *r_wr_mas->mas = sib; + wr_mas_setup(r_wr_mas, &sib); + } + } + + mast.orig_l = &mast_l_mas; + mast.orig_r = r_wr_mas->mas; /* Stop spanning searches by searching for just index. */ mast.orig_l->last = mas->index; @@ -2917,12 +3052,6 @@ static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, mast.m = &m_mas; mast.r = &r_mas; l_mas.status = r_mas.status = m_mas.status = ma_none; - - /* Check if this is not root and has sufficient data. */ - if (((mast.orig_l->min != 0) || (mast.orig_r->max != ULONG_MAX)) && - unlikely(mast.bn->b_end <= mt_min_slots[mast.bn->type])) - mast_spanning_rebalance(&mast); - height = mas_mt_height(mas) + 1; /* From de7f3ed37c1a433b7e1d5514e9a86a6c354e83d7 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:20 -0500 Subject: [PATCH 022/369] maple_tree: introduce ma_leaf_max_gap() This is the same as mas_leaf_max_gap(), but the information necessary is known without a maple state in future code. Adding this function now simplifies the review for a subsequent patch. Link: https://lkml.kernel.org/r/20260130205935.2559335-16-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 48 ++++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 0d6f810a4a1f..499cae720251 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1315,26 +1315,14 @@ void wr_mas_setup(struct ma_wr_state *wr_mas, struct ma_state *mas) wr_mas->r_max = mas_safe_pivot(mas, wr_mas->pivots, mas->offset, wr_mas->type); } - -/* - * mas_leaf_max_gap() - Returns the largest gap in a leaf node - * @mas: the maple state - * - * Return: The maximum gap in the leaf. - */ -static unsigned long mas_leaf_max_gap(struct ma_state *mas) +static inline unsigned long ma_leaf_max_gap(struct maple_node *mn, + enum maple_type mt, unsigned long min, unsigned long max, + unsigned long *pivots, void __rcu **slots) { - enum maple_type mt; unsigned long pstart, gap, max_gap; - struct maple_node *mn; - unsigned long *pivots; - void __rcu **slots; unsigned char i; unsigned char max_piv; - mt = mte_node_type(mas->node); - mn = mas_mn(mas); - slots = ma_slots(mn, mt); max_gap = 0; if (unlikely(ma_is_dense(mt))) { gap = 0; @@ -1356,26 +1344,25 @@ static unsigned long mas_leaf_max_gap(struct ma_state *mas) * Check the first implied pivot optimizes the loop below and slot 1 may * be skipped if there is a gap in slot 0. */ - pivots = ma_pivots(mn, mt); if (likely(!slots[0])) { - max_gap = pivots[0] - mas->min + 1; + max_gap = pivots[0] - min + 1; i = 2; } else { i = 1; } /* reduce max_piv as the special case is checked before the loop */ - max_piv = ma_data_end(mn, mt, pivots, mas->max) - 1; + max_piv = ma_data_end(mn, mt, pivots, max) - 1; /* * Check end implied pivot which can only be a gap on the right most * node. 
*/ - if (unlikely(mas->max == ULONG_MAX) && !slots[max_piv + 1]) { + if (unlikely(max == ULONG_MAX) && !slots[max_piv + 1]) { gap = ULONG_MAX - pivots[max_piv]; if (gap > max_gap) max_gap = gap; - if (max_gap > pivots[max_piv] - mas->min) + if (max_gap > pivots[max_piv] - min) return max_gap; } @@ -1395,6 +1382,27 @@ static unsigned long mas_leaf_max_gap(struct ma_state *mas) return max_gap; } +/* + * mas_leaf_max_gap() - Returns the largest gap in a leaf node + * @mas: the maple state + * + * Return: The maximum gap in the leaf. + */ +static inline unsigned long mas_leaf_max_gap(struct ma_state *mas) +{ + enum maple_type mt; + struct maple_node *mn; + unsigned long *pivots; + void __rcu **slots; + + mn = mas_mn(mas); + mt = mte_node_type(mas->node); + slots = ma_slots(mn, mt); + pivots = ma_pivots(mn, mt); + + return ma_leaf_max_gap(mn, mt, mas->min, mas->max, pivots, slots); +} + /* * ma_max_gap() - Get the maximum gap in a maple node (non-leaf) * @node: The maple node From 20b20162e1f3b7e60cf0e79116fb2f3bdef3dc5e Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:21 -0500 Subject: [PATCH 023/369] maple_tree: add gap support, slot and pivot sizes for maple copy Add plumbing work for using maple copy as a normal node for a source of copy operations. This is needed later. Link: https://lkml.kernel.org/r/20260130205935.2559335-17-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 1 + lib/maple_tree.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index e99e16ac1c6d..db6a02788902 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -165,6 +165,7 @@ struct maple_copy { } src[4]; /* Simulated node */ void __rcu *slot[3]; + unsigned long gap[3]; unsigned long min; union { unsigned long pivot[3]; diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 499cae720251..9c701ee7412c 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -101,6 +101,7 @@ static const unsigned long mt_max[] = { [maple_leaf_64] = ULONG_MAX, [maple_range_64] = ULONG_MAX, [maple_arange_64] = ULONG_MAX, + [maple_copy] = ULONG_MAX, }; #define mt_node_max(x) mt_max[mte_node_type(x)] #endif @@ -110,6 +111,7 @@ static const unsigned char mt_slots[] = { [maple_leaf_64] = MAPLE_RANGE64_SLOTS, [maple_range_64] = MAPLE_RANGE64_SLOTS, [maple_arange_64] = MAPLE_ARANGE64_SLOTS, + [maple_copy] = 3, }; #define mt_slot_count(x) mt_slots[mte_node_type(x)] @@ -118,6 +120,7 @@ static const unsigned char mt_pivots[] = { [maple_leaf_64] = MAPLE_RANGE64_SLOTS - 1, [maple_range_64] = MAPLE_RANGE64_SLOTS - 1, [maple_arange_64] = MAPLE_ARANGE64_SLOTS - 1, + [maple_copy] = 3, }; #define mt_pivot_count(x) mt_pivots[mte_node_type(x)] @@ -126,6 +129,7 @@ static const unsigned char mt_min_slots[] = { [maple_leaf_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_range_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_arange_64] = (MAPLE_ARANGE64_SLOTS / 2) - 1, + [maple_copy] = 1, /* Should never be used */ }; #define mt_min_slot_count(x) mt_min_slots[mte_node_type(x)] @@ -627,6 +631,7 @@ static inline unsigned long *ma_gaps(struct 
maple_node *node, case maple_arange_64: return node->ma64.gap; case maple_copy: + return node->cp.gap; case maple_range_64: case maple_leaf_64: case maple_dense: From a9c6716e088a1d4badd4fa6797469506bb99ec8b Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:22 -0500 Subject: [PATCH 024/369] maple_tree: start using maple copy node for destination Stop using the maple subtree state and big node in favour of using three destinations in the maple copy node. That is, expand the way leaves were handled to all levels of the tree and use the maple copy node to track the new nodes. Extract out the sibling init into the data calculation since this is where the insufficient data can be detected. The remainder of the sibling code to shift the next iteration is moved to the spanning_ascend() function, since it is not always needed. Next introduce the dst_setup() function which will decide how many nodes are needed to contain the data at this level. Using the destination count, populate the copy node's dst array with the new nodes and set d_count to the correct value. Note that this can be tricky in the case of a leaf node with exactly enough room because of the rule against NULLs at the end of leaves. Once the destinations are ready, copy the data by altering the cp_data_write() function to copy from the sources to the destinations directly. This eliminates the use of the big node in this code path. On node completion, node_finalise() will zero out the remaining area and set the metadata, if necessary. spanning_ascend() is used to decide if the operation is complete. It may create a new root, converge into one destination, or continue upwards by ascending the left and right write maple states. One test case setup needed to be tweaked so that the targeted node was surrounded by full nodes. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20260130205935.2559335-18-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 14 + lib/maple_tree.c | 624 ++++++++++++++++++++++--------- tools/testing/radix-tree/maple.c | 2 +- 3 files changed, 458 insertions(+), 182 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index db6a02788902..0c464eade1d6 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -156,6 +156,17 @@ enum store_type { }; struct maple_copy { + /* + * min, max, and pivots are values + * start, end, split are indexes into arrays + * data is a size + */ + + struct { + struct maple_node *node; + unsigned long max; + enum maple_type mt; + } dst[3]; struct { struct maple_node *node; unsigned long max; @@ -178,7 +189,10 @@ struct maple_copy { /*Avoid passing these around */ unsigned char s_count; + unsigned char d_count; + unsigned char split; unsigned char data; + unsigned char height; }; /** diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 9c701ee7412c..4d9e7f00f5c8 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -353,6 +353,13 @@ static inline struct maple_enode *mt_mk_node(const struct maple_node *node, (type << MAPLE_ENODE_TYPE_SHIFT) | MAPLE_ENODE_NULL); } +static inline void ma_init_slot(void __rcu **slot, const struct maple_node *mn, + const enum maple_type mt) +{ + /* WARNING: this is unsafe if the slot is exposed to readers. 
*/ + RCU_INIT_POINTER(*slot, (void *)mt_mk_node(mn, mt)); +} + static inline void *mte_mk_root(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ROOT_NODE); @@ -1320,6 +1327,21 @@ void wr_mas_setup(struct ma_wr_state *wr_mas, struct ma_state *mas) wr_mas->r_max = mas_safe_pivot(mas, wr_mas->pivots, mas->offset, wr_mas->type); } + +static inline +void wr_mas_ascend(struct ma_wr_state *wr_mas) +{ + struct ma_state *mas = wr_mas->mas; + + mas_ascend(mas); + wr_mas_setup(wr_mas, mas); + mas->end = ma_data_end(wr_mas->node, wr_mas->type, wr_mas->pivots, + mas->max); + /* Careful, this may be wrong.. */ + wr_mas->end_piv = wr_mas->r_max; + wr_mas->offset_end = mas->offset; +} + static inline unsigned long ma_leaf_max_gap(struct maple_node *mn, enum maple_type mt, unsigned long min, unsigned long max, unsigned long *pivots, void __rcu **slots) @@ -2507,6 +2529,112 @@ static inline void mas_wmb_replace(struct ma_state *mas, mas_update_gap(mas); } +/* + * node_copy() - Copy from one node to another. 
+ * + * @mas: The maple state + * @src: The source node + * @start: The offset into the src to start copying + * @size: The size to copy (non-zero) + * @s_max: The source node max + * @s_mt: The source maple node type + * @dst: The destination + * @d_start: The start location in the destination node + * @d_mt: The destination maple node type + */ +static inline +unsigned long node_copy(struct ma_state *mas, struct maple_node *src, + unsigned char start, unsigned char size, unsigned long s_max, + enum maple_type s_mt, struct maple_node *dst, unsigned char d_start, + enum maple_type d_mt) +{ + unsigned long *s_pivots, *d_pivots; + void __rcu **s_slots, **d_slots; + unsigned long *s_gaps, *d_gaps; + unsigned long d_max; + + d_slots = ma_slots(dst, d_mt) + d_start; + d_pivots = ma_pivots(dst, d_mt) + d_start; + s_slots = ma_slots(src, s_mt) + start; + s_pivots = ma_pivots(src, s_mt) + start; + memcpy(d_slots, s_slots, size * sizeof(void __rcu *)); + if (!ma_is_leaf(d_mt) && s_mt == maple_copy) { + struct maple_enode *edst = mt_mk_node(dst, d_mt); + + + for (int i = 0; i < size; i++) + mas_set_parent(mas, + mt_slot_locked(mas->tree, d_slots, i), + edst, d_start + i); + } + + d_gaps = ma_gaps(dst, d_mt); + if (d_gaps) { + s_gaps = ma_gaps(src, s_mt) + start; + d_gaps += d_start; + memcpy(d_gaps, s_gaps, size * sizeof(unsigned long)); + } + + if (start + size - 1 < mt_pivots[s_mt]) + d_max = s_pivots[size - 1]; + else + d_max = s_max; + + if (d_start + size <= mt_pivots[d_mt]) + d_pivots[size - 1] = d_max; + + size--; + if (size) + memcpy(d_pivots, s_pivots, size * sizeof(unsigned long)); + + return d_max; +} + +/* + * node_finalise() - Zero out unused area and populate metadata + * @node: The maple node + * @mt: The maple node type + * @end: The end of the used area + */ +static inline +void node_finalise(struct maple_node *node, enum maple_type mt, + unsigned char end) +{ + unsigned char max_end = mt_slots[mt]; + unsigned char size; + unsigned long *gaps; + unsigned 
char gap_slot; + + gaps = ma_gaps(node, mt); + if (end < max_end - 1) { + size = max_end - end; + memset(ma_slots(node, mt) + end, 0, size * sizeof(void *)); + + if (gaps) + memset(gaps + end, 0, size * sizeof(unsigned long)); + + if (--size) + memset(ma_pivots(node, mt) + end, 0, size * sizeof(unsigned long)); + } + + gap_slot = 0; + if (gaps && !ma_is_leaf(mt)) { + unsigned long max_gap; + + max_gap = 0; + for (int i = 0; i <= end; i++) + if (gaps[i] > max_gap) { + gap_slot = i; + max_gap = gaps[i]; + } + } + + if (mt == maple_arange_64) + ma_set_meta(node, mt, gap_slot, end - 1); + else if (end <= max_end - 1) + ma_set_meta(node, mt, gap_slot, end - 1); +} + /* * mast_cp_to_nodes() - Copy data out to nodes. * @mast: The maple subtree state @@ -2684,6 +2812,7 @@ static inline void cp_leaf_init(struct maple_copy *cp, * result in buggy code when a compiler reorders the instructions. */ + cp->height = 1; /* Create entries to insert including split entries to left and right */ if (l_wr_mas->r_min < mas->index) { end++; @@ -2726,6 +2855,100 @@ static inline void cp_data_calc(struct maple_copy *cp, cp->data += r_wr_mas->mas->end - r_wr_mas->offset_end; } +/* + * spanning_data() - Calculate the @cp data and populate @sib if insufficient + * @cp: The maple copy node + * @l_wr_mas: The left write maple state + * @r_wr_mas: The right write maple state + * @sib: The maple state of the sibling. + * + * Note: @cp->data is a size and not indexed by 0. @sib->end may be set to 0 to + * indicate it will not be used. 
+ */ +static inline void spanning_data(struct maple_copy *cp, + struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas, + struct ma_state *sib) +{ + cp_data_calc(cp, l_wr_mas, r_wr_mas); + if (((l_wr_mas->mas->min != 0) || (r_wr_mas->mas->max != ULONG_MAX)) && + (cp->data <= mt_min_slots[l_wr_mas->type])) { + spanning_sib(l_wr_mas, r_wr_mas, sib); + cp->data += sib->end + 1; + } else { + sib->end = 0; + } +} + +/* + * dst_setup() - Set up one or more destinations for the new data. + * @cp: The maple copy node + * @mas: The maple state + * @mt: The source node type + */ +static inline +void dst_setup(struct maple_copy *cp, struct ma_state *mas, enum maple_type mt) +{ + /* Data is 1 indexed, every src has +1 added. */ + + if (cp->data <= mt_slots[mt]) { + cp->split = cp->data - 1; + cp->d_count = 1; + goto node_setup; + } + + cp->split = (cp->data - 1) / 2; + cp->d_count = 2; + if (cp->data < mt_slots[mt] * 2) + goto node_setup; + + if (cp->data == mt_slots[mt] * 2) { + unsigned char off; + unsigned char s; + + if (!ma_is_leaf(mt)) + goto node_setup; + + /* + * Leaf nodes are a bit tricky because we cannot assume the data + * can fit due to the NULL limitation on node ends. 
+ */ + off = cp->split; + for (s = 0; s < cp->s_count; s++) { + unsigned char s_off; + + s_off = cp->src[s].end - cp->src[s].start; + if (s_off >= off) + break; + + s_off++; + off -= s_off; + } + + off += cp->src[s].start; + if (ma_slots(cp->src[s].node, cp->src[s].mt)[off]) + goto node_setup; + + cp->split++; + if (cp->split < mt_slots[mt]) + goto node_setup; + + cp->split -= 2; + if (cp->data - 2 - cp->split < mt_slots[mt]) + goto node_setup; + + } + + /* No other choice but to 3-way split the data */ + cp->split = (cp->data + 2) / 3; + cp->d_count = 3; + +node_setup: + for (int i = 0; i < cp->d_count; i++) { + cp->dst[i].mt = mt; + cp->dst[i].node = ma_mnode_ptr(mas_pop_node(mas)); + } +} + static inline void append_mas_cp(struct maple_copy *cp, struct ma_state *mas, unsigned char start, unsigned char end) { @@ -2813,38 +3036,153 @@ void multi_src_setup(struct maple_copy *cp, struct ma_wr_state *l_wr_mas, } static inline -void cp_data_write(struct maple_copy *cp, struct maple_big_node *b_node) +void cp_data_write(struct maple_copy *cp, struct ma_state *mas) { - struct maple_node *src; - unsigned char s; + struct maple_node *dst, *src; + unsigned char s, d; + unsigned char dst_offset; + unsigned char data_offset; unsigned char src_end, s_offset; - unsigned long *b_pivots, *cp_pivots; - void __rcu **b_slots, **cp_slots; - enum maple_type s_mt; + unsigned char split; + unsigned long s_max, d_max; + unsigned char dst_size; + enum maple_type s_mt, d_mt; - b_node->b_end = 0; - - s = 0; - b_pivots = b_node->pivot; - b_slots = (void __rcu **)b_node->slot; + data_offset = 0; + s = d = 0; + /* Readability help */ + src = cp->src[s].node; + dst = cp->dst[d].node; + s_offset = cp->src[s].start; + src_end = cp->src[s].end; + split = cp->split; + s_max = cp->src[s].max; + s_mt = cp->src[s].mt; + d_mt = cp->dst[d].mt; do { - unsigned char size; + dst_offset = 0; + d_max = 0; + dst = cp->dst[d].node; + d_mt = cp->dst[d].mt; + dst_size = split + 1; - src = cp->src[s].node; - 
s_mt = cp->src[s].mt; - s_offset = cp->src[s].start; - src_end = cp->src[s].end; - size = src_end - s_offset + 1; - cp_pivots = ma_pivots(src, s_mt) + s_offset; - cp_slots = ma_slots(src, s_mt) + s_offset; - memcpy(b_slots, cp_slots, size * sizeof(void __rcu *)); - if (size > 1) - memcpy(b_pivots, cp_pivots, (size - 1) * sizeof(unsigned long)); - b_pivots[size - 1] = cp->src[s].max; - b_pivots += size; - b_slots += size; - b_node->b_end += size; - } while (++s < cp->s_count); + while (dst_size) { + unsigned char size; + + if (src_end - s_offset + 1 < dst_size) + size = src_end - s_offset + 1; + else + size = dst_size; + + d_max = node_copy(mas, src, s_offset, size, s_max, s_mt, + dst, dst_offset, d_mt); + + dst_offset += size; + s_offset += size; + if (s_offset > src_end) { + /* This source is exhausted */ + s++; + if (s >= cp->s_count) { + cp->dst[d].max = d_max; + node_finalise(dst, d_mt, dst_offset); + return; + } + /* Reset local src */ + src = cp->src[s].node; + s_offset = cp->src[s].start; + src_end = cp->src[s].end; + s_max = cp->src[s].max; + s_mt = cp->src[s].mt; + } + + dst_size -= size; + data_offset += size; + } + + split = cp->split; + cp->dst[d].max = d_max; + /* Handle null entries */ + if (cp->dst[d].max != ULONG_MAX && + !ma_slots(dst, d_mt)[dst_offset - 1]) { + if (s_offset == cp->src[s].start) { + s--; + src = cp->src[s].node; + src_end = cp->src[s].end; + s_max = cp->src[s].max; + s_mt = cp->src[s].mt; + s_offset = src_end; + } else { + s_offset--; + } + /* Set dst max and clear pivot */ + split++; + data_offset--; + dst_offset--; + cp->dst[d].max = ma_pivots(dst, d_mt)[dst_offset - 1]; + } + + node_finalise(dst, d_mt, dst_offset); + ++d; /* Next destination */ + if (d == cp->d_count - 1) + split = cp->data - data_offset; + + if (d >= cp->d_count) { + WARN_ON(data_offset < cp->data); + return; + } + + } while (data_offset <= cp->data); +} + +/* + * cp_dst_to_slots() - Migrate the maple copy destination to the maple copy + * slots + * @cp: The 
maple copy node + * @min: The minimal value represented + * @max: The maximum value represented + * @mas: The maple state + */ +static inline void cp_dst_to_slots(struct maple_copy *cp, unsigned long min, + unsigned long max, struct ma_state *mas) +{ + unsigned char d; + unsigned long slot_min = min; + + for (d = 0; d < cp->d_count; d++) { + struct maple_node *mn = cp->dst[d].node; + enum maple_type mt = cp->dst[d].mt; + unsigned long slot_max = cp->dst[d].max; + + /* + * Warning, see cp_leaf_init() comment and rcu_assign_pointer() + * documentation. Since these are new nodes, there are no + * read-side operations that can view them until they are + * inserted into the tree after an rcu_assign_pointer() call. + */ + ma_init_slot(&cp->slot[d], mn, mt); + cp->pivot[d] = slot_max; + if (mt_is_alloc(mas->tree)) { + if (ma_is_leaf(mt)) { + cp->gap[d] = ma_leaf_max_gap(mn, mt, slot_min, + slot_max, ma_pivots(mn, mt), + ma_slots(mn, mt)); + } else { + unsigned long *gaps = ma_gaps(mn, mt); + + if (gaps) { + unsigned char gap_slot; + + gap_slot = ma_meta_gap(mn); + cp->gap[d] = gaps[gap_slot]; + } + } + } + slot_min = slot_max + 1; + } + + cp->end = cp->d_count - 1; + cp->min = min; + cp->max = max; } static void mas_spanning_rebalance_loop(struct ma_state *mas, @@ -3000,173 +3338,97 @@ static void mas_spanning_rebalance(struct ma_state *mas, mas_spanning_rebalance_loop(mas, mast, count); } +/* + * spanning_ascend() - See if a spanning store operation has to keep walking up + * the tree + * @cp: The maple_copy node + * @l_wr_mas: The left maple write state + * @r_wr_mas: The right maple write state + * @sib: the maple state of the sibling + * + * Returns: True if another iteration is necessary. 
+ */ +static bool spanning_ascend(struct maple_copy *cp, struct ma_state *mas, + struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas, + struct ma_state *sib) +{ + if (sib->end) { + if (sib->max < l_wr_mas->mas->min) + *l_wr_mas->mas = *sib; + else + *r_wr_mas->mas = *sib; + } + + cp_dst_to_slots(cp, l_wr_mas->mas->min, r_wr_mas->mas->max, mas); + if (!cp->min && cp->max == ULONG_MAX) { + /* New root */ + if (cp->d_count != 1) { + enum maple_type mt = maple_arange_64; + + if (!mt_is_alloc(mas->tree)) + mt = maple_range_64; + + cp->data = cp->d_count; + cp->s_count = 0; + dst_setup(cp, mas, mt); + init_cp_src(cp); + node_copy(mas, cp->src[0].node, 0, cp->data, cp->max, maple_copy, + cp->dst[0].node, 0, mt); + node_finalise(cp->dst[0].node, mt, cp->end + 1); + /* + * Warning, see cp_leaf_init() comment and rcu_assign_pointer() + * documentation. Since this is a new root, there are no + * read-side operations that can view it until it is insert into + * the tree after an rcu_assign_pointer() call. 
+ */ + ma_init_slot(&cp->slot[0], cp->dst[0].node, mt); + cp->height++; + } + WARN_ON_ONCE(cp->dst[0].node != mte_to_node( + mt_slot_locked(mas->tree, cp->slot, 0))); + cp->dst[0].node->parent = ma_parent_ptr(mas_tree_parent(mas)); + mas->min = 0; + mas->max = ULONG_MAX; + mas->depth = 0; + mas->node = mas_root_locked(mas); + return false; + } + + /* Converged and has a single destination */ + if ((cp->d_count == 1) && + (l_wr_mas->mas->node == r_wr_mas->mas->node)) { + cp->dst[0].node->parent = ma_parent_ptr(mas_mn(mas)->parent); + return false; + } + + cp->height++; + wr_mas_ascend(l_wr_mas); + wr_mas_ascend(r_wr_mas); + return true; +} static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas) { - unsigned char split, mid_split; - unsigned char slot = 0; - unsigned char new_height = 0; /* used if node is a new root */ - struct maple_enode *left = NULL, *middle = NULL, *right = NULL; struct maple_enode *old_enode; - - struct maple_subtree_state mast; - struct maple_big_node b_node; struct maple_copy cp; - unsigned char height; struct ma_state sib; - MA_STATE(l_mas, mas->tree, mas->index, mas->index); - MA_STATE(r_mas, mas->tree, mas->index, mas->last); - MA_STATE(m_mas, mas->tree, mas->index, mas->index); - MA_STATE(mast_l_mas, NULL, 0, 0); - - memset(&b_node, 0, sizeof(struct maple_big_node)); - mast_l_mas = *mas; - cp.s_count = 0; cp_leaf_init(&cp, mas, l_wr_mas, r_wr_mas); - cp_data_calc(&cp, l_wr_mas, r_wr_mas); - if (((l_wr_mas->mas->min != 0) || (r_wr_mas->mas->max != ULONG_MAX)) && - (cp.data <= mt_min_slots[l_wr_mas->type])) { - spanning_sib(l_wr_mas, r_wr_mas, &sib); - cp.data += sib.end + 1; - } else { - sib.end = 0; - } + do { + spanning_data(&cp, l_wr_mas, r_wr_mas, &sib); + multi_src_setup(&cp, l_wr_mas, r_wr_mas, &sib); + dst_setup(&cp, mas, l_wr_mas->type); + cp_data_write(&cp, mas); + } while (spanning_ascend(&cp, mas, l_wr_mas, r_wr_mas, &sib)); - multi_src_setup(&cp, l_wr_mas, 
r_wr_mas, &sib); - b_node.type = l_wr_mas->type; - cp_data_write(&cp, &b_node); - if (sib.end) { - if (sib.max < l_wr_mas->mas->min) { - *l_wr_mas->mas = sib; - wr_mas_setup(l_wr_mas, &sib); - mast_l_mas = sib; - } else { - *r_wr_mas->mas = sib; - wr_mas_setup(r_wr_mas, &sib); - } - } - - mast.orig_l = &mast_l_mas; - mast.orig_r = r_wr_mas->mas; - /* Stop spanning searches by searching for just index. */ - mast.orig_l->last = mas->index; - - mast.bn = &b_node; - /* Combine l_mas and r_mas and split them up evenly again. */ - - /* - * The tree needs to be rebalanced and leaves need to be kept at the same level. - * Rebalancing is done by use of the ``struct maple_topiary``. - */ - mast.l = &l_mas; - mast.m = &m_mas; - mast.r = &r_mas; - l_mas.status = r_mas.status = m_mas.status = ma_none; - height = mas_mt_height(mas) + 1; - - /* - * Each level of the tree is examined and balanced, pushing data to the left or - * right, or rebalancing against left or right nodes is employed to avoid - * rippling up the tree to limit the amount of churn. Once a new sub-section of - * the tree is created, there may be a mix of new and old nodes. The old nodes - * will have the incorrect parent pointers and currently be in two trees: the - * original tree and the partially new tree. To remedy the parent pointers in - * the old tree, the new data is swapped into the active tree and a walk down - * the tree is performed and the parent pointers are updated. - * See mas_topiary_replace() for more information. 
- */ - while (height--) { - mast.bn->b_end--; - mast.bn->type = mte_node_type(mast.orig_l->node); - split = mas_mab_to_node(mas, mast.bn, &left, &right, &middle, - &mid_split); - mast_set_split_parents(&mast, left, middle, right, split, - mid_split); - mast_cp_to_nodes(&mast, left, middle, right, split, mid_split); - new_height++; - - /* - * Copy data from next level in the tree to mast.bn from next - * iteration - */ - memset(mast.bn, 0, sizeof(struct maple_big_node)); - mast.bn->type = mte_node_type(left); - - /* Root already stored in l->node. */ - if (mas_is_root_limits(mast.l)) - goto new_root; - - mast_ascend(&mast); - mast_combine_cp_left(&mast); - mast.l->offset = mast.bn->b_end; - mab_set_b_end(mast.bn, mast.l, left); - mab_set_b_end(mast.bn, mast.m, middle); - mab_set_b_end(mast.bn, mast.r, right); - - /* Copy anything necessary out of the right node. */ - mast_combine_cp_right(&mast); - mast.orig_l->last = mast.orig_l->max; - - if (mast_sufficient(&mast)) { - if (mast_overflow(&mast)) - continue; - - if (mast.orig_l->node == mast.orig_r->node) { - /* - * The data in b_node should be stored in one - * node and in the tree - */ - slot = mast.l->offset; - break; - } - - continue; - } - - /* May be a new root stored in mast.bn */ - if (mas_is_root_limits(mast.orig_l)) - break; - - mast_spanning_rebalance(&mast); - - /* rebalancing from other nodes may require another loop. 
*/ - if (!height) - height++; - } - - mast.l->node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), - mte_node_type(mast.orig_l->node)); - - mab_mas_cp(mast.bn, 0, mt_slots[mast.bn->type] - 1, mast.l, true); - new_height++; - mas_set_parent(mas, left, mast.l->node, slot); - if (middle) - mas_set_parent(mas, middle, mast.l->node, ++slot); - - if (right) - mas_set_parent(mas, right, mast.l->node, ++slot); - - if (mas_is_root_limits(mast.l)) { -new_root: - mas_mn(mast.l)->parent = ma_parent_ptr(mas_tree_parent(mas)); - while (!mte_is_root(mast.orig_l->node)) - mast_ascend(&mast); - } else { - mas_mn(mast.l)->parent = mas_mn(mast.orig_l)->parent; - } - - old_enode = mast.orig_l->node; - mas->depth = mast.l->depth; - mas->node = mast.l->node; - mas->min = mast.l->min; - mas->max = mast.l->max; - mas->offset = mast.l->offset; - mas_wmb_replace(mas, old_enode, new_height); + old_enode = mas->node; + mas->node = mt_slot_locked(mas->tree, cp.slot, 0); + mas_wmb_replace(mas, old_enode, cp.height); mtree_range_walk(mas); } + /* * mas_rebalance() - Rebalance a given node. * @mas: The maple state diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 85fb5616c133..dfd7099f0d8e 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35508,7 +35508,7 @@ static noinline void __init check_spanning_write(struct maple_tree *mt) /* Store a value across a node boundary that causes a 3 way split */ if (MAPLE_32BIT) - i = 49590; /* 0xc1b6 */ + i = 49430; /* 0xc116 */ else i = 49670; /* 0xC206 */ From 3578d61c1c4305ef4e24adda97266bc4d1ebc962 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:23 -0500 Subject: [PATCH 025/369] maple_tree: inline mas_wr_spanning_rebalance() Now that the spanning rebalance is small, fully inline it in mas_wr_spanning_store(). No functional change. Link: https://lkml.kernel.org/r/20260130205935.2559335-19-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 4d9e7f00f5c8..22e929826402 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3407,28 +3407,6 @@ static bool spanning_ascend(struct maple_copy *cp, struct ma_state *mas, return true; } -static noinline void mas_wr_spanning_rebalance(struct ma_state *mas, - struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas) -{ - - struct maple_enode *old_enode; - struct maple_copy cp; - struct ma_state sib; - - cp_leaf_init(&cp, mas, l_wr_mas, r_wr_mas); - do { - spanning_data(&cp, l_wr_mas, r_wr_mas, &sib); - multi_src_setup(&cp, l_wr_mas, r_wr_mas, &sib); - dst_setup(&cp, mas, l_wr_mas->type); - cp_data_write(&cp, mas); - } while (spanning_ascend(&cp, mas, l_wr_mas, r_wr_mas, &sib)); - - old_enode = mas->node; - mas->node = mt_slot_locked(mas->tree, cp.slot, 0); - mas_wmb_replace(mas, old_enode, cp.height); - mtree_range_walk(mas); -} - /* * mas_rebalance() - Rebalance a given node. 
* @mas: The maple state @@ -4085,7 +4063,10 @@ done: */ static void mas_wr_spanning_store(struct ma_wr_state *wr_mas) { + struct maple_enode *old_enode; + struct maple_copy cp; struct ma_state *mas; + struct ma_state sib; /* Left and Right side of spanning store */ MA_STATE(r_mas, NULL, 0, 0); @@ -4142,7 +4123,18 @@ static void mas_wr_spanning_store(struct ma_wr_state *wr_mas) return mas_new_root(mas, wr_mas->entry); } - mas_wr_spanning_rebalance(mas, wr_mas, &r_wr_mas); + cp_leaf_init(&cp, mas, wr_mas, &r_wr_mas); + do { + spanning_data(&cp, wr_mas, &r_wr_mas, &sib); + multi_src_setup(&cp, wr_mas, &r_wr_mas, &sib); + dst_setup(&cp, mas, wr_mas->type); + cp_data_write(&cp, mas); + } while (spanning_ascend(&cp, mas, wr_mas, &r_wr_mas, &sib)); + + old_enode = mas->node; + mas->node = mt_slot_locked(mas->tree, cp.slot, 0); + mas_wmb_replace(mas, old_enode, cp.height); + mtree_range_walk(mas); } /* From 448ec8c0a424b89b22acb641481088af1e43f5c2 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:24 -0500 Subject: [PATCH 026/369] maple_tree: remove unnecessary return statements Functions do not need to state return at the end, unless skipping unwind. These can safely be dropped. Link: https://lkml.kernel.org/r/20260130205935.2559335-20-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 22e929826402..87bbc5492305 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3294,7 +3294,6 @@ new_root: mas->offset = mast->l->offset; mas_wmb_replace(mas, old_enode, new_height); mtree_range_walk(mas); - return; } /* @@ -3718,7 +3717,6 @@ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) mas->node = l_mas.node; mas_wmb_replace(mas, old, height); mtree_range_walk(mas); - return; } /* @@ -3779,7 +3777,6 @@ static inline void mas_root_expand(struct ma_state *mas, void *entry) ma_set_meta(node, maple_leaf_64, 0, slot); /* swap the new root into the tree */ rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); - return; } /* @@ -4051,8 +4048,6 @@ static inline void mas_new_root(struct ma_state *mas, void *entry) done: if (xa_is_node(root)) mte_destroy_walk(root, mas->tree); - - return; } /* * mas_wr_spanning_store() - Create a subtree with the store operation completed @@ -4215,7 +4210,6 @@ done: trace_ma_write(TP_FCT, mas, 0, wr_mas->entry); mas_update_gap(mas); mas->end = new_end; - return; } /* @@ -4263,8 +4257,6 @@ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas) */ if (!wr_mas->entry || gap) mas_update_gap(mas); - - return; } static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) @@ -4378,7 +4370,6 @@ static inline void mas_wr_append(struct ma_wr_state *wr_mas, mas->end = new_end; trace_ma_write(TP_FCT, mas, new_end, wr_mas->entry); - return; } /* @@ -4437,8 +4428,6 @@ static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) case wr_invalid: MT_BUG_ON(mas->tree, 1); } - - return; } static inline void 
mas_wr_prealloc_setup(struct ma_wr_state *wr_mas) From 62e9d349afce279944a22295515c18af335722c0 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:25 -0500 Subject: [PATCH 027/369] maple_tree: separate wr_split_store and wr_rebalance store type code path The split and rebalance store types both go through the same function that uses the big node. Separate the code paths so that each can be updated independently. No functional change intended Link: https://lkml.kernel.org/r/20260130205935.2559335-21-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 87bbc5492305..2383c97f684b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3719,24 +3719,6 @@ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) mtree_range_walk(mas); } -/* - * mas_commit_b_node() - Commit the big node into the tree. - * @wr_mas: The maple write state - * @b_node: The maple big node - */ -static noinline_for_kasan void mas_commit_b_node(struct ma_wr_state *wr_mas, - struct maple_big_node *b_node) -{ - enum store_type type = wr_mas->mas->store_type; - - WARN_ON_ONCE(type != wr_rebalance && type != wr_split_store); - - if (type == wr_rebalance) - return mas_rebalance(wr_mas->mas, b_node); - - return mas_split(wr_mas->mas, b_node); -} - /* * mas_root_expand() - Expand a root to a node * @mas: The maple state @@ -4373,19 +4355,34 @@ static inline void mas_wr_append(struct ma_wr_state *wr_mas, } /* - * mas_wr_bnode() - Slow path for a modification. 
+ * mas_wr_split() - Expand one node into two * @wr_mas: The write maple state - * - * This is where split, rebalance end up. */ -static void mas_wr_bnode(struct ma_wr_state *wr_mas) +static noinline_for_kasan void mas_wr_split(struct ma_wr_state *wr_mas) { struct maple_big_node b_node; trace_ma_write(TP_FCT, wr_mas->mas, 0, wr_mas->entry); memset(&b_node, 0, sizeof(struct maple_big_node)); mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); - mas_commit_b_node(wr_mas, &b_node); + WARN_ON_ONCE(wr_mas->mas->store_type != wr_split_store); + return mas_split(wr_mas->mas, &b_node); +} + +/* + * mas_wr_rebalance() - Insufficient data in one node needs to either get data + * from a sibling or absorb a sibling all together. + * @wr_mas: The write maple state + */ +static noinline_for_kasan void mas_wr_rebalance(struct ma_wr_state *wr_mas) +{ + struct maple_big_node b_node; + + trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry); + memset(&b_node, 0, sizeof(struct maple_big_node)); + mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); + WARN_ON_ONCE(wr_mas->mas->store_type != wr_rebalance); + return mas_rebalance(wr_mas->mas, &b_node); } /* @@ -4416,8 +4413,10 @@ static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) mas_wr_spanning_store(wr_mas); break; case wr_split_store: + mas_wr_split(wr_mas); + break; case wr_rebalance: - mas_wr_bnode(wr_mas); + mas_wr_rebalance(wr_mas); break; case wr_new_root: mas_new_root(mas, wr_mas->entry); From b00a1804e69297489a88d4cbdffba6a6f21795e3 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:26 -0500 Subject: [PATCH 028/369] maple_tree: add cp_is_new_root() helper Add a helper to do what is needed when the maple copy node contains a new root node. This is useful for future commits and is self-documenting code. 
[Liam.Howlett@oracle.com: remove warnings on older compilers] Link: https://lkml.kernel.org/r/malwmirqnpuxqkqrobcmzfkmmxipoyzwfs2nwc5fbpxlt2r2ej@wchmjtaljvw3 [akpm@linux-foundation.org: s/cp->slot[0]/&cp->slot[0]/, per Liam] Link: https://lkml.kernel.org/r/20260130205935.2559335-22-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 70 ++++++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 2383c97f684b..6a8f75dac6c9 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3337,6 +3337,43 @@ static void mas_spanning_rebalance(struct ma_state *mas, mas_spanning_rebalance_loop(mas, mast, count); } +static inline bool cp_is_new_root(struct maple_copy *cp, struct ma_state *mas) +{ + if (cp->min || cp->max != ULONG_MAX) + return false; + + if (cp->d_count != 1) { + enum maple_type mt = maple_arange_64; + + if (!mt_is_alloc(mas->tree)) + mt = maple_range_64; + + cp->data = cp->d_count; + cp->s_count = 0; + dst_setup(cp, mas, mt); + init_cp_src(cp); + node_copy(mas, cp->src[0].node, 0, cp->data, cp->max, maple_copy, + cp->dst[0].node, 0, mt); + node_finalise(cp->dst[0].node, mt, cp->end + 1); + /* + * Warning, see cp_leaf_init() comment and rcu_assign_pointer() + * documentation. Since this is a new root, there are no + * read-side operations that can view it until it is insert into + * the tree after an rcu_assign_pointer() call. 
+ */ + ma_init_slot(&cp->slot[0], cp->dst[0].node, mt); + cp->height++; + } + WARN_ON_ONCE(cp->dst[0].node != mte_to_node( + mt_slot_locked(mas->tree, cp->slot, 0))); + cp->dst[0].node->parent = ma_parent_ptr(mas_tree_parent(mas)); + mas->min = 0; + mas->max = ULONG_MAX; + mas->depth = 0; + mas->node = mas_root_locked(mas); + return true; +} + /* * spanning_ascend() - See if a spanning store operation has to keep walking up * the tree @@ -3359,39 +3396,8 @@ static bool spanning_ascend(struct maple_copy *cp, struct ma_state *mas, } cp_dst_to_slots(cp, l_wr_mas->mas->min, r_wr_mas->mas->max, mas); - if (!cp->min && cp->max == ULONG_MAX) { - /* New root */ - if (cp->d_count != 1) { - enum maple_type mt = maple_arange_64; - - if (!mt_is_alloc(mas->tree)) - mt = maple_range_64; - - cp->data = cp->d_count; - cp->s_count = 0; - dst_setup(cp, mas, mt); - init_cp_src(cp); - node_copy(mas, cp->src[0].node, 0, cp->data, cp->max, maple_copy, - cp->dst[0].node, 0, mt); - node_finalise(cp->dst[0].node, mt, cp->end + 1); - /* - * Warning, see cp_leaf_init() comment and rcu_assign_pointer() - * documentation. Since this is a new root, there are no - * read-side operations that can view it until it is insert into - * the tree after an rcu_assign_pointer() call. - */ - ma_init_slot(&cp->slot[0], cp->dst[0].node, mt); - cp->height++; - } - WARN_ON_ONCE(cp->dst[0].node != mte_to_node( - mt_slot_locked(mas->tree, cp->slot, 0))); - cp->dst[0].node->parent = ma_parent_ptr(mas_tree_parent(mas)); - mas->min = 0; - mas->max = ULONG_MAX; - mas->depth = 0; - mas->node = mas_root_locked(mas); + if (cp_is_new_root(cp, mas)) return false; - } /* Converged and has a single destination */ if ((cp->d_count == 1) && From 971f0db15977582a811712bba042191721715c19 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Fri, 30 Jan 2026 15:59:27 -0500 Subject: [PATCH 029/369] maple_tree: use maple copy node for mas_wr_rebalance() operation Stop using the maple big node for rebalance operations by changing to more align with spanning store. The rebalance operation needs its own data calculation in rebalance_data(). In the event of too much data, the rebalance tries to push the data using push_data_sib(). If there is insufficient data, the rebalance operation will rebalance against a sibling (found with rebalance_sib()). The rebalance starts at the leaf and works its way upward in the tree using rebalance_ascend(). Most of the code is shared with spanning store such as the copy node having a new root, but is fundamentally different in that the data must come from a sibling. A parent maple state is used to track the parent location to avoid multiple mas_ascend() calls. The maple state tree location is copied from the parent to the mas (child) in the ascend step. Ascending itself is done in the main loop. Link: https://lkml.kernel.org/r/20260130205935.2559335-23-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 213 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 206 insertions(+), 7 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 6a8f75dac6c9..7ce7c0a33943 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2305,6 +2305,19 @@ static inline void mte_mid_split_check(struct maple_enode **l, *split = mid_split; } +static inline void rebalance_sib(struct ma_state *parent, struct ma_state *sib) +{ + *sib = *parent; + /* Prioritize move right to pull data left */ + if (sib->offset < sib->end) + sib->offset++; + else + sib->offset--; + + mas_descend(sib); + sib->end = mas_data_end(sib); +} + static inline void spanning_sib(struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas, struct ma_state *nneighbour) @@ -2855,6 +2868,112 @@ static inline void cp_data_calc(struct maple_copy *cp, cp->data += r_wr_mas->mas->end - r_wr_mas->offset_end; } +static bool data_fits(struct ma_state *sib, struct ma_state *mas, + struct maple_copy *cp) +{ + unsigned char new_data; + enum maple_type type; + unsigned char space; + unsigned char end; + + type = mte_node_type(mas->node); + space = 2 * mt_slots[type]; + end = sib->end; + + new_data = end + 1 + cp->data; + if (new_data > space) + return false; + + /* + * This is off by one by design. The extra space is left to reduce + * jitter in operations that add then remove two entries. + * + * end is an index while new space and data are both sizes. Adding one + * to end to convert the index to a size means that the below + * calculation should be <=, but we want to keep an extra space in nodes + * to reduce jitter. 
+ * + * Note that it is still possible to get a full node on the left by the + * NULL landing exactly on the split. The NULL ending of a node happens + * in the dst_setup() function, where we will either increase the split + * by one or decrease it by one, if possible. In the case of split + * (this case), it is always possible to shift the spilt by one - again + * because there is at least one slot free by the below checking. + */ + if (new_data < space) + return true; + + return false; +} + +static inline void push_data_sib(struct maple_copy *cp, struct ma_state *mas, + struct ma_state *sib, struct ma_state *parent) +{ + + if (mte_is_root(mas->node)) + goto no_push; + + + *sib = *parent; + if (sib->offset) { + sib->offset--; + mas_descend(sib); + sib->end = mas_data_end(sib); + if (data_fits(sib, mas, cp)) /* Push left */ + return; + + *sib = *parent; + } + + if (sib->offset >= sib->end) + goto no_push; + + sib->offset++; + mas_descend(sib); + sib->end = mas_data_end(sib); + if (data_fits(sib, mas, cp)) /* Push right*/ + return; + +no_push: + sib->end = 0; +} + +/* + * rebalance_data() - Calculate the @cp data, populate @sib if insufficient or + * if the data can be pushed into a sibling. + * @cp: The maple copy node + * @wr_mas: The left write maple state + * @sib: The maple state of the sibling. + * + * Note: @cp->data is a size and not indexed by 0. @sib->end may be set to 0 to + * indicate it will not be used. 
+ * + */ +static inline void rebalance_data(struct maple_copy *cp, + struct ma_wr_state *wr_mas, struct ma_state *sib, + struct ma_state *parent) +{ + cp_data_calc(cp, wr_mas, wr_mas); + sib->end = 0; + if (cp->data > mt_slots[wr_mas->type]) { + push_data_sib(cp, wr_mas->mas, sib, parent); + if (sib->end) + goto use_sib; + } else if (cp->data <= mt_min_slots[wr_mas->type]) { + if ((wr_mas->mas->min != 0) || + (wr_mas->mas->max != ULONG_MAX)) { + rebalance_sib(parent, sib); + goto use_sib; + } + } + + return; + +use_sib: + + cp->data += sib->end + 1; +} + /* * spanning_data() - Calculate the @cp data and populate @sib if insufficient * @cp: The maple copy node @@ -3412,6 +3531,55 @@ static bool spanning_ascend(struct maple_copy *cp, struct ma_state *mas, return true; } +/* + * rebalance_ascend() - Ascend the tree and set up for the next loop - if + * necessary + * + * Return: True if there another rebalancing operation on the next level is + * needed, false otherwise. + */ +static inline bool rebalance_ascend(struct maple_copy *cp, + struct ma_wr_state *wr_mas, struct ma_state *sib, + struct ma_state *parent) +{ + struct ma_state *mas; + unsigned long min, max; + + mas = wr_mas->mas; + if (!sib->end) { + min = mas->min; + max = mas->max; + } else if (sib->min > mas->max) { /* Move right succeeded */ + min = mas->min; + max = sib->max; + wr_mas->offset_end = parent->offset + 1; + } else { + min = sib->min; + max = mas->max; + wr_mas->offset_end = parent->offset; + parent->offset--; + } + + cp_dst_to_slots(cp, min, max, mas); + if (cp_is_new_root(cp, mas)) + return false; + + if (cp->d_count == 1 && !sib->end) { + cp->dst[0].node->parent = ma_parent_ptr(mas_mn(mas)->parent); + return false; + } + + cp->height++; + mas->node = parent->node; + mas->offset = parent->offset; + mas->min = parent->min; + mas->max = parent->max; + mas->end = parent->end; + mas->depth = parent->depth; + wr_mas_setup(wr_mas, mas); + return true; +} + /* * mas_rebalance() - Rebalance a given 
node. * @mas: The maple state @@ -4379,16 +4547,47 @@ static noinline_for_kasan void mas_wr_split(struct ma_wr_state *wr_mas) * mas_wr_rebalance() - Insufficient data in one node needs to either get data * from a sibling or absorb a sibling all together. * @wr_mas: The write maple state + * + * Rebalance is different than a spanning store in that the write state is + * already at the leaf node that's being altered. */ -static noinline_for_kasan void mas_wr_rebalance(struct ma_wr_state *wr_mas) +static void mas_wr_rebalance(struct ma_wr_state *wr_mas) { - struct maple_big_node b_node; + struct maple_enode *old_enode; + struct ma_state parent; + struct ma_state *mas; + struct maple_copy cp; + struct ma_state sib; - trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry); - memset(&b_node, 0, sizeof(struct maple_big_node)); - mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); - WARN_ON_ONCE(wr_mas->mas->store_type != wr_rebalance); - return mas_rebalance(wr_mas->mas, &b_node); + /* + * Rebalancing occurs if a node is insufficient. Data is rebalanced + * against the node to the right if it exists, otherwise the node to the + * left of this node is rebalanced against this node. If rebalancing + * causes just one node to be produced instead of two, then the parent + * is also examined and rebalanced if it is insufficient. Every level + * tries to combine the data in the same way. If one node contains the + * entire range of the tree, then that node is used as a new root node. 
+ */ + + mas = wr_mas->mas; + trace_ma_op(TP_FCT, mas); + parent = *mas; + cp_leaf_init(&cp, mas, wr_mas, wr_mas); + do { + if (!mte_is_root(parent.node)) { + mas_ascend(&parent); + parent.end = mas_data_end(&parent); + } + rebalance_data(&cp, wr_mas, &sib, &parent); + multi_src_setup(&cp, wr_mas, wr_mas, &sib); + dst_setup(&cp, mas, wr_mas->type); + cp_data_write(&cp, mas); + } while (rebalance_ascend(&cp, wr_mas, &sib, &parent)); + + old_enode = mas->node; + mas->node = mt_slot_locked(mas->tree, cp.slot, 0); + mas_wmb_replace(mas, old_enode, cp.height); + mtree_range_walk(mas); } /* From ebfee00c0bc8295b4dd8281f792f44d38b2af0c7 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:28 -0500 Subject: [PATCH 030/369] maple_tree: add test for rebalance calculation off-by-one During the big node removal, an incorrect rebalance step went too far up the tree causing insufficient nodes. Test the faulty condition by recreating the scenario in the userspace testing. Link: https://lkml.kernel.org/r/20260130205935.2559335-24-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/radix-tree/maple.c | 125 +++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index dfd7099f0d8e..5ea45d67556a 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35888,6 +35888,127 @@ unlock: return ret; } +static noinline void __init check_erase_rebalance(struct maple_tree *mt) +{ + unsigned long val; + void *enode; + int ret; + + MA_STATE(mas, mt, 0, 0); + + /* + * During removal of big node, the rebalance started going too high, + * which resulted in too many nodes trying to be used. + * + * Create a rebalance which results in an exactly full parent (0-9) that + * does not need to be rebalanced. This required two full levels, + * followed by an insufficient level which will be rebalanced into two + * nodes, finally leaves that need to be rebalanced into one node. + * + * The bugs tree: + * root 4 Label R + * /\ /\ + * 9 X F + * /\ /\ / + * 9 X E + * /\ /\ /\ + * 4 8 C D + * /\ /\ + * 6 9 A B + * ^ becomes 5 with the write. + * + * Below, the reconstruction leaves the root with 2 entries, the setup + * uses the letter labels above. 
+ */ + + ret = build_full_tree(mt, MT_FLAGS_ALLOC_RANGE, 4); + MT_BUG_ON(mt, ret); + + /* Cheap expansion to 5 levels */ + mtree_store(mt, ULONG_MAX, xa_mk_value(0), GFP_KERNEL); + /* rcu is used to ensure node use */ + mt_set_in_rcu(mt); + mas_lock(&mas); + + /* Node A had 6 entries */ + mas_walk(&mas); + MAS_BUG_ON(&mas, mas_data_end(&mas) < 6); + while (mas_data_end(&mas) > 6) { + mas_erase(&mas); + mas_next(&mas, ULONG_MAX); + } + + /* Move to Node B */ + enode = (void*) mas.node; + while (mas.node == enode) + mas_next(&mas, ULONG_MAX); + + /* Node B had 9 entries */ + MAS_BUG_ON(&mas, mas_data_end(&mas) < 9); + while (mas_data_end(&mas) > 9) { + mas_erase(&mas); + mas_next(&mas, ULONG_MAX); + } + + /* Move to Node C */ + mas_ascend(&mas); + val = mas.max; + /* Adjust entries to be 4 */ + while (mas_data_end(&mas) > 4) { + mas_set(&mas, val); + mas_erase(&mas); + mas_prev(&mas, 0); + val = mas.index; + mas_ascend(&mas); + } + + /* Move to Node D */ + mas_ascend(&mas); + mas.offset = 1; + mas_descend(&mas); + val = mas.max; + /* Adjust entries to be 8 */ + while (mas_data_end(&mas) < 8) { + mas_set(&mas, val--); + mas_store_gfp(&mas, &mas, GFP_KERNEL); + mas_ascend(&mas); + } + + /* Move to Node E */ + mas_ascend(&mas); + val = mas.max; + MAS_BUG_ON(&mas, mas_data_end(&mas) > 9); + /* Adjust Node E to 9 entries */ + while (mas_data_end(&mas) < 9) { + mas_set(&mas, val--); + mas_store_gfp(&mas, &mas, GFP_KERNEL); + mas_ascend(&mas); + mas_ascend(&mas); + } + + /* Move to Node F */ + mas_ascend(&mas); + val = mas.max; + MAS_BUG_ON(&mas, mas_data_end(&mas) > 9); + /* Adjust Node F to 9 entries */ + while (mas_data_end(&mas) < 9) { + mas_set(&mas, val--); + mas_store_gfp(&mas, &mas, GFP_KERNEL); + mas_ascend(&mas); + mas_ascend(&mas); + mas_ascend(&mas); + } + + /* Test is set up, walk to first entry */ + mas_set(&mas, 0); + mas_next(&mas, ULONG_MAX); + /* overwrite the entry to cause a rebalance, which was 1 too few */ + mas_set_range(&mas, 0, mas.last); + 
mas_preallocate(&mas, NULL, GFP_KERNEL); + mas_store_prealloc(&mas, NULL); + mas_unlock(&mas); +} + static noinline void __init check_mtree_dup(struct maple_tree *mt) { DEFINE_MTREE(new); @@ -36249,6 +36370,10 @@ void farmer_tests(void) check_mtree_dup(&tree); mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_erase_rebalance(&tree); + mtree_destroy(&tree); + /* RCU testing */ mt_init_flags(&tree, 0); check_erase_testset(&tree); From 0abff2081983e5dfa7642ab1a794afc0a8417070 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:29 -0500 Subject: [PATCH 031/369] maple_tree: add copy_tree_location() helper Extract the copying of the tree location from one maple state to another into its own function. This is used more later. Link: https://lkml.kernel.org/r/20260130205935.2559335-25-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 7ce7c0a33943..86072bfc2419 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3531,6 +3531,17 @@ static bool spanning_ascend(struct maple_copy *cp, struct ma_state *mas, return true; } +static inline +void copy_tree_location(const struct ma_state *src, struct ma_state *dst) +{ + dst->node = src->node; + dst->offset = src->offset; + dst->min = src->min; + dst->max = src->max; + dst->end = src->end; + dst->depth = src->depth; +} + /* * rebalance_ascend() - Ascend the tree and set up for the next loop - if * necessary @@ -3570,12 +3581,7 @@ static inline bool rebalance_ascend(struct maple_copy *cp, } cp->height++; - mas->node = parent->node; - mas->offset = parent->offset; - mas->min = 
parent->min; - mas->max = parent->max; - mas->end = parent->end; - mas->depth = parent->depth; + copy_tree_location(parent, mas); wr_mas_setup(wr_mas, mas); return true; } From 11e7f22f5e85058b09ca90e74002a3b82f50e940 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:30 -0500 Subject: [PATCH 032/369] maple_tree: add cp_converged() helper When the maple copy node converges into a single entry, then certain operations can stop ascending the tree. This is used more later. Link: https://lkml.kernel.org/r/20260130205935.2559335-26-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 86072bfc2419..3e0469d365e6 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3493,6 +3493,16 @@ static inline bool cp_is_new_root(struct maple_copy *cp, struct ma_state *mas) return true; } +static inline bool cp_converged(struct maple_copy *cp, struct ma_state *mas, + struct ma_state *sib) +{ + if (cp->d_count != 1 || sib->end) + return false; + + cp->dst[0].node->parent = ma_parent_ptr(mas_mn(mas)->parent); + return true; +} + /* * spanning_ascend() - See if a spanning store operation has to keep walking up * the tree @@ -3575,10 +3585,8 @@ static inline bool rebalance_ascend(struct maple_copy *cp, if (cp_is_new_root(cp, mas)) return false; - if (cp->d_count == 1 && !sib->end) { - cp->dst[0].node->parent = ma_parent_ptr(mas_mn(mas)->parent); + if (cp_converged(cp, mas, sib)) return false; - } cp->height++; copy_tree_location(parent, mas); From 280b792cac62ddadca2935766ca870b438c86323 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Fri, 30 Jan 2026 15:59:31 -0500 Subject: [PATCH 033/369] maple_tree: use maple copy node for mas_wr_split() Instead of using the maple big node, use the maple copy node for reduced stack usage and aligning with mas_wr_rebalance() and mas_wr_spanning_store(). Splitting a node is similar to rebalancing, but a new evaluation of when to ascend is needed. The only other difference is that the data is pushed and never rebalanced at each level. The testing must also align with the changes to this commit to ensure the test suite continues to pass. Link: https://lkml.kernel.org/r/20260130205935.2559335-27-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 99 ++++++++++++++++++++++++++++++-- lib/test_maple_tree.c | 55 ++++++++++++++---- tools/testing/radix-tree/maple.c | 11 ++++ 3 files changed, 149 insertions(+), 16 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 3e0469d365e6..68ea6c3c9260 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4542,19 +4542,106 @@ static inline void mas_wr_append(struct ma_wr_state *wr_mas, trace_ma_write(TP_FCT, mas, new_end, wr_mas->entry); } +/* + * split_ascend() - See if a split operation has to keep walking up the tree + * @cp: The maple_copy node + * @wr_mas: The maple write state + * @sib: the maple state of the sibling + * + * Return: true if another split operation on the next level is needed, false + * otherwise + */ +static inline bool split_ascend(struct maple_copy *cp, + struct ma_wr_state *wr_mas, struct ma_state *sib, + struct ma_state *parent) +{ + struct ma_state *mas; + unsigned long min, max; + + mas = wr_mas->mas; + min = mas->min; /* push right, or normal split */ + max = mas->max; + wr_mas->offset_end = 
parent->offset; + if (sib->end) { + if (sib->max < mas->min) { + min = sib->min; /* push left */ + parent->offset--; + } else { + max = sib->max; /* push right */ + wr_mas->offset_end++; + } + } + + cp_dst_to_slots(cp, min, max, mas); + if (cp_is_new_root(cp, mas)) + return false; + + if (cp_converged(cp, mas, sib)) + return false; + + cp->height++; + copy_tree_location(parent, mas); + wr_mas_setup(wr_mas, mas); + return true; +} + +/* + * split_data() - Calculate the @cp data, populate @sib if the data can be + * pushed into a sibling. + * @cp: The maple copy node + * @wr_mas: The left write maple state + * @sib: The maple state of the sibling. + * + * Note: @cp->data is a size and not indexed by 0. @sib->end may be set to 0 to + * indicate it will not be used. + * + */ +static inline void split_data(struct maple_copy *cp, + struct ma_wr_state *wr_mas, struct ma_state *sib, + struct ma_state *parent) +{ + cp_data_calc(cp, wr_mas, wr_mas); + if (cp->data <= mt_slots[wr_mas->type]) { + sib->end = 0; + return; + } + + push_data_sib(cp, wr_mas->mas, sib, parent); + if (sib->end) + cp->data += sib->end + 1; +} + /* * mas_wr_split() - Expand one node into two * @wr_mas: The write maple state */ -static noinline_for_kasan void mas_wr_split(struct ma_wr_state *wr_mas) +static void mas_wr_split(struct ma_wr_state *wr_mas) { - struct maple_big_node b_node; + struct maple_enode *old_enode; + struct ma_state parent; + struct ma_state *mas; + struct maple_copy cp; + struct ma_state sib; + mas = wr_mas->mas; trace_ma_write(TP_FCT, wr_mas->mas, 0, wr_mas->entry); - memset(&b_node, 0, sizeof(struct maple_big_node)); - mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); - WARN_ON_ONCE(wr_mas->mas->store_type != wr_split_store); - return mas_split(wr_mas->mas, &b_node); + parent = *mas; + cp_leaf_init(&cp, mas, wr_mas, wr_mas); + do { + if (!mte_is_root(parent.node)) { + mas_ascend(&parent); + parent.end = mas_data_end(&parent); + } + split_data(&cp, wr_mas, &sib, &parent); + 
multi_src_setup(&cp, wr_mas, wr_mas, &sib); + dst_setup(&cp, mas, wr_mas->type); + cp_data_write(&cp, mas); + } while (split_ascend(&cp, wr_mas, &sib, &parent)); + + old_enode = mas->node; + mas->node = mt_slot_locked(mas->tree, cp.slot, 0); + mas_wmb_replace(mas, old_enode, cp.height); + mtree_range_walk(mas); } /* diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index a182e48b5f5e..434d8a2fdd99 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -1024,6 +1024,7 @@ static noinline void __init check_ranges(struct maple_tree *mt) mt_set_non_kernel(10); check_store_range(mt, r[10], r[11], xa_mk_value(r[10]), 0); MT_BUG_ON(mt, !mt_height(mt)); + mt_validate(mt); mtree_destroy(mt); /* Create tree of 1-200 */ @@ -1031,11 +1032,13 @@ static noinline void __init check_ranges(struct maple_tree *mt) /* Store 45-168 */ check_store_range(mt, r[10], r[11], xa_mk_value(r[10]), 0); MT_BUG_ON(mt, !mt_height(mt)); + mt_validate(mt); mtree_destroy(mt); check_seq(mt, 30, false); check_store_range(mt, 6, 18, xa_mk_value(6), 0); MT_BUG_ON(mt, !mt_height(mt)); + mt_validate(mt); mtree_destroy(mt); /* Overwrite across multiple levels. */ @@ -1061,6 +1064,7 @@ static noinline void __init check_ranges(struct maple_tree *mt) check_load(mt, r[13] + 1, xa_mk_value(r[13] + 1)); check_load(mt, 135, NULL); check_load(mt, 140, NULL); + mt_validate(mt); mt_set_non_kernel(0); MT_BUG_ON(mt, !mt_height(mt)); mtree_destroy(mt); @@ -1285,14 +1289,20 @@ static noinline void __init check_ranges(struct maple_tree *mt) MT_BUG_ON(mt, mt_height(mt) >= 4); } /* Cause a 3 child split all the way up the tree. 
*/ - for (i = 5; i < 215; i += 10) + for (i = 5; i < 215; i += 10) { check_store_range(mt, 11450 + i, 11450 + i + 1, NULL, 0); - for (i = 5; i < 65; i += 10) + mt_validate(mt); + } + for (i = 5; i < 65; i += 10) { check_store_range(mt, 11770 + i, 11770 + i + 1, NULL, 0); + mt_validate(mt); + } MT_BUG_ON(mt, mt_height(mt) >= 4); - for (i = 5; i < 45; i += 10) + for (i = 5; i < 45; i += 10) { check_store_range(mt, 11700 + i, 11700 + i + 1, NULL, 0); + mt_validate(mt); + } if (!MAPLE_32BIT) MT_BUG_ON(mt, mt_height(mt) < 4); mtree_destroy(mt); @@ -1304,17 +1314,42 @@ static noinline void __init check_ranges(struct maple_tree *mt) val2 = (i+1)*10; check_store_range(mt, val, val2, xa_mk_value(val), 0); MT_BUG_ON(mt, mt_height(mt) >= 4); + mt_validate(mt); } /* Fill parents and leaves before split. */ - for (i = 5; i < 455; i += 10) - check_store_range(mt, 7800 + i, 7800 + i + 1, NULL, 0); + val = 7660; + for (i = 5; i < 490; i += 5) { + val += 5; + check_store_range(mt, val, val + 1, NULL, 0); + mt_validate(mt); + MT_BUG_ON(mt, mt_height(mt) >= 4); + } - for (i = 1; i < 16; i++) - check_store_range(mt, 8185 + i, 8185 + i + 1, - xa_mk_value(8185+i), 0); - MT_BUG_ON(mt, mt_height(mt) >= 4); + val = 9460; + /* Fill parents and leaves before split. */ + for (i = 1; i < 10; i++) { + val++; + check_store_range(mt, val, val + 1, xa_mk_value(val), 0); + mt_validate(mt); + } + + val = 8000; + for (i = 1; i < 14; i++) { + val++; + check_store_range(mt, val, val + 1, xa_mk_value(val), 0); + mt_validate(mt); + } + + + check_store_range(mt, 8051, 8051, xa_mk_value(8081), 0); + check_store_range(mt, 8052, 8052, xa_mk_value(8082), 0); + check_store_range(mt, 8083, 8083, xa_mk_value(8083), 0); + check_store_range(mt, 8084, 8084, xa_mk_value(8084), 0); + check_store_range(mt, 8085, 8085, xa_mk_value(8085), 0); /* triple split across multiple levels. 
*/ - check_store_range(mt, 8184, 8184, xa_mk_value(8184), 0); + check_store_range(mt, 8099, 8100, xa_mk_value(1), 0); + + mt_validate(mt); if (!MAPLE_32BIT) MT_BUG_ON(mt, mt_height(mt) != 4); } diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 5ea45d67556a..feedd5ab7058 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35406,7 +35406,18 @@ static noinline void __init check_spanning_write(struct maple_tree *mt) mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); for (i = 0; i <= max; i++) mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); + mtree_lock(mt); + if (MAPLE_32BIT) { + i = 47811; + do { + mas_set(&mas, i); + mas_store_gfp(&mas, check_spanning_write, GFP_KERNEL); + i++; + mas_ascend(&mas); + } while (mas_data_end(&mas) < mt_slot_count(mas.node) - 1); + } + mas_set(&mas, 47606); mas_store_gfp(&mas, check_spanning_write, GFP_KERNEL); mas_set(&mas, 47607); From b8852ef30c67318d40e69d295e28f6cab5174862 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:32 -0500 Subject: [PATCH 034/369] maple_tree: remove maple big node and subtree structs Now that no one uses the structures and functions, drop the dead code. Link: https://lkml.kernel.org/r/20260130205935.2559335-28-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 1184 ---------------------------------------------- 1 file changed, 1184 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 68ea6c3c9260..51ff311ff5b6 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -133,45 +133,6 @@ static const unsigned char mt_min_slots[] = { }; #define mt_min_slot_count(x) mt_min_slots[mte_node_type(x)] -#define MAPLE_BIG_NODE_SLOTS (MAPLE_RANGE64_SLOTS * 2 + 2) -#define MAPLE_BIG_NODE_GAPS (MAPLE_ARANGE64_SLOTS * 2 + 1) - -struct maple_big_node { - unsigned long pivot[MAPLE_BIG_NODE_SLOTS - 1]; - union { - struct maple_enode *slot[MAPLE_BIG_NODE_SLOTS]; - struct { - unsigned long padding[MAPLE_BIG_NODE_GAPS]; - unsigned long gap[MAPLE_BIG_NODE_GAPS]; - }; - }; - unsigned char b_end; - enum maple_type type; -}; - -/* - * The maple_subtree_state is used to build a tree to replace a segment of an - * existing tree in a more atomic way. Any walkers of the older tree will hit a - * dead node and restart on updates. 
- */ -struct maple_subtree_state { - struct ma_state *orig_l; /* Original left side of subtree */ - struct ma_state *orig_r; /* Original right side of subtree */ - struct ma_state *l; /* New left side of subtree */ - struct ma_state *m; /* New middle of subtree (rare) */ - struct ma_state *r; /* New right side of subtree */ - struct ma_topiary *free; /* nodes to be freed */ - struct ma_topiary *destroy; /* Nodes to be destroyed (walked and freed) */ - struct maple_big_node *bn; -}; - -#ifdef CONFIG_KASAN_STACK -/* Prevent mas_wr_bnode() from exceeding the stack frame limit */ -#define noinline_for_kasan noinline_for_stack -#else -#define noinline_for_kasan inline -#endif - /* Functions */ static inline struct maple_node *mt_alloc_one(gfp_t gfp) { @@ -1669,169 +1630,6 @@ static inline bool mas_find_child(struct ma_state *mas, struct ma_state *child) return false; } -/* - * mab_shift_right() - Shift the data in mab right. Note, does not clean out the - * old data or set b_node->b_end. - * @b_node: the maple_big_node - * @shift: the shift count - */ -static inline void mab_shift_right(struct maple_big_node *b_node, - unsigned char shift) -{ - unsigned long size = b_node->b_end * sizeof(unsigned long); - - memmove(b_node->pivot + shift, b_node->pivot, size); - memmove(b_node->slot + shift, b_node->slot, size); - if (b_node->type == maple_arange_64) - memmove(b_node->gap + shift, b_node->gap, size); -} - -/* - * mab_middle_node() - Check if a middle node is needed (unlikely) - * @b_node: the maple_big_node that contains the data. - * @split: the potential split location - * @slot_count: the size that can be stored in a single node being considered. - * - * Return: true if a middle node is required. 
- */ -static inline bool mab_middle_node(struct maple_big_node *b_node, int split, - unsigned char slot_count) -{ - unsigned char size = b_node->b_end; - - if (size >= 2 * slot_count) - return true; - - if (!b_node->slot[split] && (size >= 2 * slot_count - 1)) - return true; - - return false; -} - -/* - * mab_no_null_split() - ensure the split doesn't fall on a NULL - * @b_node: the maple_big_node with the data - * @split: the suggested split location - * @slot_count: the number of slots in the node being considered. - * - * Return: the split location. - */ -static inline int mab_no_null_split(struct maple_big_node *b_node, - unsigned char split, unsigned char slot_count) -{ - if (!b_node->slot[split]) { - /* - * If the split is less than the max slot && the right side will - * still be sufficient, then increment the split on NULL. - */ - if ((split < slot_count - 1) && - (b_node->b_end - split) > (mt_min_slots[b_node->type])) - split++; - else - split--; - } - return split; -} - -/* - * mab_calc_split() - Calculate the split location and if there needs to be two - * splits. - * @mas: The maple state - * @bn: The maple_big_node with the data - * @mid_split: The second split, if required. 0 otherwise. - * - * Return: The first split location. The middle split is set in @mid_split. - */ -static inline int mab_calc_split(struct ma_state *mas, - struct maple_big_node *bn, unsigned char *mid_split) -{ - unsigned char b_end = bn->b_end; - int split = b_end / 2; /* Assume equal split. */ - unsigned char slot_count = mt_slots[bn->type]; - - /* - * To support gap tracking, all NULL entries are kept together and a node cannot - * end on a NULL entry, with the exception of the left-most leaf. The - * limitation means that the split of a node must be checked for this condition - * and be able to put more data in one direction or the other. - * - * Although extremely rare, it is possible to enter what is known as the 3-way - * split scenario. 
The 3-way split comes about by means of a store of a range - * that overwrites the end and beginning of two full nodes. The result is a set - * of entries that cannot be stored in 2 nodes. Sometimes, these two nodes can - * also be located in different parent nodes which are also full. This can - * carry upwards all the way to the root in the worst case. - */ - if (unlikely(mab_middle_node(bn, split, slot_count))) { - split = b_end / 3; - *mid_split = split * 2; - } else { - *mid_split = 0; - } - - /* Avoid ending a node on a NULL entry */ - split = mab_no_null_split(bn, split, slot_count); - - if (unlikely(*mid_split)) - *mid_split = mab_no_null_split(bn, *mid_split, slot_count); - - return split; -} - -/* - * mas_mab_cp() - Copy data from a maple state inclusively to a maple_big_node - * and set @b_node->b_end to the next free slot. - * @mas: The maple state - * @mas_start: The starting slot to copy - * @mas_end: The end slot to copy (inclusively) - * @b_node: The maple_big_node to place the data - * @mab_start: The starting location in maple_big_node to store the data. 
- */ -static inline void mas_mab_cp(struct ma_state *mas, unsigned char mas_start, - unsigned char mas_end, struct maple_big_node *b_node, - unsigned char mab_start) -{ - enum maple_type mt; - struct maple_node *node; - void __rcu **slots; - unsigned long *pivots, *gaps; - int i = mas_start, j = mab_start; - unsigned char piv_end; - - node = mas_mn(mas); - mt = mte_node_type(mas->node); - pivots = ma_pivots(node, mt); - if (!i) { - b_node->pivot[j] = pivots[i++]; - if (unlikely(i > mas_end)) - goto complete; - j++; - } - - piv_end = min(mas_end, mt_pivots[mt]); - for (; i < piv_end; i++, j++) { - b_node->pivot[j] = pivots[i]; - if (unlikely(!b_node->pivot[j])) - goto complete; - - if (unlikely(mas->max == b_node->pivot[j])) - goto complete; - } - - b_node->pivot[j] = mas_safe_pivot(mas, pivots, i, mt); - -complete: - b_node->b_end = ++j; - j -= mab_start; - slots = ma_slots(node, mt); - memcpy(b_node->slot + mab_start, slots + mas_start, sizeof(void *) * j); - if (!ma_is_leaf(mt) && mt_is_alloc(mas->tree)) { - gaps = ma_gaps(node, mt); - memcpy(b_node->gap + mab_start, gaps + mas_start, - sizeof(unsigned long) * j); - } -} - /* * mas_leaf_set_meta() - Set the metadata of a leaf if possible. * @node: The maple node @@ -1845,134 +1643,6 @@ static inline void mas_leaf_set_meta(struct maple_node *node, ma_set_meta(node, mt, 0, end); } -/* - * mab_mas_cp() - Copy data from maple_big_node to a maple encoded node. - * @b_node: the maple_big_node that has the data - * @mab_start: the start location in @b_node. - * @mab_end: The end location in @b_node (inclusively) - * @mas: The maple state with the maple encoded node. 
- */ -static inline void mab_mas_cp(struct maple_big_node *b_node, - unsigned char mab_start, unsigned char mab_end, - struct ma_state *mas, bool new_max) -{ - int i, j = 0; - enum maple_type mt = mte_node_type(mas->node); - struct maple_node *node = mte_to_node(mas->node); - void __rcu **slots = ma_slots(node, mt); - unsigned long *pivots = ma_pivots(node, mt); - unsigned long *gaps = NULL; - unsigned char end; - - if (mab_end - mab_start > mt_pivots[mt]) - mab_end--; - - if (!pivots[mt_pivots[mt] - 1]) - slots[mt_pivots[mt]] = NULL; - - i = mab_start; - do { - pivots[j++] = b_node->pivot[i++]; - } while (i <= mab_end && likely(b_node->pivot[i])); - - memcpy(slots, b_node->slot + mab_start, - sizeof(void *) * (i - mab_start)); - - if (new_max) - mas->max = b_node->pivot[i - 1]; - - end = j - 1; - if (likely(!ma_is_leaf(mt) && mt_is_alloc(mas->tree))) { - unsigned long max_gap = 0; - unsigned char offset = 0; - - gaps = ma_gaps(node, mt); - do { - gaps[--j] = b_node->gap[--i]; - if (gaps[j] > max_gap) { - offset = j; - max_gap = gaps[j]; - } - } while (j); - - ma_set_meta(node, mt, offset, end); - } else { - mas_leaf_set_meta(node, mt, end); - } -} - -/* - * mas_store_b_node() - Store an @entry into the b_node while also copying the - * data from a maple encoded node. - * @wr_mas: the maple write state - * @b_node: the maple_big_node to fill with data - * @offset_end: the offset to end copying - * - * Return: The actual end of the data stored in @b_node - */ -static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, - struct maple_big_node *b_node, unsigned char offset_end) -{ - unsigned char slot; - unsigned char b_end; - /* Possible underflow of piv will wrap back to 0 before use. */ - unsigned long piv; - struct ma_state *mas = wr_mas->mas; - - b_node->type = wr_mas->type; - b_end = 0; - slot = mas->offset; - if (slot) { - /* Copy start data up to insert. 
*/ - mas_mab_cp(mas, 0, slot - 1, b_node, 0); - b_end = b_node->b_end; - piv = b_node->pivot[b_end - 1]; - } else - piv = mas->min - 1; - - if (piv + 1 < mas->index) { - /* Handle range starting after old range */ - b_node->slot[b_end] = wr_mas->content; - if (!wr_mas->content) - b_node->gap[b_end] = mas->index - 1 - piv; - b_node->pivot[b_end++] = mas->index - 1; - } - - /* Store the new entry. */ - mas->offset = b_end; - b_node->slot[b_end] = wr_mas->entry; - b_node->pivot[b_end] = mas->last; - - /* Appended. */ - if (mas->last >= mas->max) - goto b_end; - - /* Handle new range ending before old range ends */ - piv = mas_safe_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type); - if (piv > mas->last) { - if (offset_end != slot) - wr_mas->content = mas_slot_locked(mas, wr_mas->slots, - offset_end); - - b_node->slot[++b_end] = wr_mas->content; - if (!wr_mas->content) - b_node->gap[b_end] = piv - mas->last + 1; - b_node->pivot[b_end] = piv; - } - - slot = offset_end + 1; - if (slot > mas->end) - goto b_end; - - /* Copy end data to the end of the node. */ - mas_mab_cp(mas, slot, mas->end + 1, b_node, ++b_end); - b_node->b_end--; - return; - -b_end: - b_node->b_end = b_end; -} - /* * mas_prev_sibling() - Find the previous node with the same parent. * @mas: the maple state @@ -2017,25 +1687,6 @@ static inline bool mas_next_sibling(struct ma_state *mas) return true; } -/* - * mas_node_or_none() - Set the enode and state. - * @mas: the maple state - * @enode: The encoded maple node. - * - * Set the node to the enode and the status. - */ -static inline void mas_node_or_none(struct ma_state *mas, - struct maple_enode *enode) -{ - if (enode) { - mas->node = enode; - mas->status = ma_active; - } else { - mas->node = NULL; - mas->status = ma_none; - } -} - /* * mas_wr_node_walk() - Find the correct offset for the index in the @mas. 
* If @mas->index cannot be found within the containing @@ -2069,242 +1720,6 @@ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) wr_mas->offset_end = mas->offset = offset; } -/* - * mast_rebalance_next() - Rebalance against the next node - * @mast: The maple subtree state - */ -static inline void mast_rebalance_next(struct maple_subtree_state *mast) -{ - unsigned char b_end = mast->bn->b_end; - - mas_mab_cp(mast->orig_r, 0, mt_slot_count(mast->orig_r->node), - mast->bn, b_end); - mast->orig_r->last = mast->orig_r->max; -} - -/* - * mast_rebalance_prev() - Rebalance against the previous node - * @mast: The maple subtree state - */ -static inline void mast_rebalance_prev(struct maple_subtree_state *mast) -{ - unsigned char end = mas_data_end(mast->orig_l) + 1; - unsigned char b_end = mast->bn->b_end; - - mab_shift_right(mast->bn, end); - mas_mab_cp(mast->orig_l, 0, end - 1, mast->bn, 0); - mast->l->min = mast->orig_l->min; - mast->orig_l->index = mast->orig_l->min; - mast->bn->b_end = end + b_end; - mast->l->offset += end; -} - -/* - * mast_spanning_rebalance() - Rebalance nodes with nearest neighbour favouring - * the node to the right. Checking the nodes to the right then the left at each - * level upwards until root is reached. - * Data is copied into the @mast->bn. - * @mast: The maple_subtree_state. 
- */ -static inline -bool mast_spanning_rebalance(struct maple_subtree_state *mast) -{ - struct ma_state r_tmp = *mast->orig_r; - struct ma_state l_tmp = *mast->orig_l; - unsigned char depth = 0; - - do { - mas_ascend(mast->orig_r); - mas_ascend(mast->orig_l); - depth++; - if (mast->orig_r->offset < mas_data_end(mast->orig_r)) { - mast->orig_r->offset++; - do { - mas_descend(mast->orig_r); - mast->orig_r->offset = 0; - } while (--depth); - - mast_rebalance_next(mast); - *mast->orig_l = l_tmp; - return true; - } else if (mast->orig_l->offset != 0) { - mast->orig_l->offset--; - do { - mas_descend(mast->orig_l); - mast->orig_l->offset = - mas_data_end(mast->orig_l); - } while (--depth); - - mast_rebalance_prev(mast); - *mast->orig_r = r_tmp; - return true; - } - } while (!mte_is_root(mast->orig_r->node)); - - *mast->orig_r = r_tmp; - *mast->orig_l = l_tmp; - return false; -} - -/* - * mast_ascend() - Ascend the original left and right maple states. - * @mast: the maple subtree state. - * - * Ascend the original left and right sides. Set the offsets to point to the - * data already in the new tree (@mast->l and @mast->r). - */ -static inline void mast_ascend(struct maple_subtree_state *mast) -{ - MA_WR_STATE(wr_mas, mast->orig_r, NULL); - mas_ascend(mast->orig_l); - mas_ascend(mast->orig_r); - - mast->orig_r->offset = 0; - mast->orig_r->index = mast->r->max; - /* last should be larger than or equal to index */ - if (mast->orig_r->last < mast->orig_r->index) - mast->orig_r->last = mast->orig_r->index; - - wr_mas.type = mte_node_type(mast->orig_r->node); - mas_wr_node_walk(&wr_mas); - /* Set up the left side of things */ - mast->orig_l->offset = 0; - mast->orig_l->index = mast->l->min; - wr_mas.mas = mast->orig_l; - wr_mas.type = mte_node_type(mast->orig_l->node); - mas_wr_node_walk(&wr_mas); - - mast->bn->type = wr_mas.type; -} - -/* - * mas_new_ma_node() - Create and return a new maple node. Helper function. - * @mas: the maple state with the allocations. 
- * @b_node: the maple_big_node with the type encoding. - * - * Use the node type from the maple_big_node to allocate a new node from the - * ma_state. This function exists mainly for code readability. - * - * Return: A new maple encoded node - */ -static inline struct maple_enode -*mas_new_ma_node(struct ma_state *mas, struct maple_big_node *b_node) -{ - return mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), b_node->type); -} - -/* - * mas_mab_to_node() - Set up right and middle nodes - * - * @mas: the maple state that contains the allocations. - * @b_node: the node which contains the data. - * @left: The pointer which will have the left node - * @right: The pointer which may have the right node - * @middle: the pointer which may have the middle node (rare) - * @mid_split: the split location for the middle node - * - * Return: the split of left. - */ -static inline unsigned char mas_mab_to_node(struct ma_state *mas, - struct maple_big_node *b_node, struct maple_enode **left, - struct maple_enode **right, struct maple_enode **middle, - unsigned char *mid_split) -{ - unsigned char split = 0; - unsigned char slot_count = mt_slots[b_node->type]; - - *left = mas_new_ma_node(mas, b_node); - *right = NULL; - *middle = NULL; - *mid_split = 0; - - if (b_node->b_end < slot_count) { - split = b_node->b_end; - } else { - split = mab_calc_split(mas, b_node, mid_split); - *right = mas_new_ma_node(mas, b_node); - } - - if (*mid_split) - *middle = mas_new_ma_node(mas, b_node); - - return split; - -} - -/* - * mab_set_b_end() - Add entry to b_node at b_node->b_end and increment the end - * pointer. - * @b_node: the big node to add the entry - * @mas: the maple state to get the pivot (mas->max) - * @entry: the entry to add, if NULL nothing happens. 
- */ -static inline void mab_set_b_end(struct maple_big_node *b_node, - struct ma_state *mas, - void *entry) -{ - if (!entry) - return; - - b_node->slot[b_node->b_end] = entry; - if (mt_is_alloc(mas->tree)) - b_node->gap[b_node->b_end] = mas_max_gap(mas); - b_node->pivot[b_node->b_end++] = mas->max; -} - -/* - * mas_set_split_parent() - combine_then_separate helper function. Sets the parent - * of @mas->node to either @left or @right, depending on @slot and @split - * - * @mas: the maple state with the node that needs a parent - * @left: possible parent 1 - * @right: possible parent 2 - * @slot: the slot the mas->node was placed - * @split: the split location between @left and @right - */ -static inline void mas_set_split_parent(struct ma_state *mas, - struct maple_enode *left, - struct maple_enode *right, - unsigned char *slot, unsigned char split) -{ - if (mas_is_none(mas)) - return; - - if ((*slot) <= split) - mas_set_parent(mas, mas->node, left, *slot); - else if (right) - mas_set_parent(mas, mas->node, right, (*slot) - split - 1); - - (*slot)++; -} - -/* - * mte_mid_split_check() - Check if the next node passes the mid-split - * @l: Pointer to left encoded maple node. - * @m: Pointer to middle encoded maple node. - * @r: Pointer to right encoded maple node. - * @slot: The offset - * @split: The split location. - * @mid_split: The middle split. - */ -static inline void mte_mid_split_check(struct maple_enode **l, - struct maple_enode **r, - struct maple_enode *right, - unsigned char slot, - unsigned char *split, - unsigned char mid_split) -{ - if (*r == right) - return; - - if (slot < mid_split) - return; - - *l = *r; - *r = right; - *split = mid_split; -} - static inline void rebalance_sib(struct ma_state *parent, struct ma_state *sib) { *sib = *parent; @@ -2356,43 +1771,6 @@ void spanning_sib(struct ma_wr_state *l_wr_mas, WARN_ON_ONCE(1); } -/* - * mast_set_split_parents() - Helper function to set three nodes parents. Slot - * is taken from @mast->l. 
- * @mast: the maple subtree state - * @left: the left node - * @right: the right node - * @split: the split location. - */ -static inline void mast_set_split_parents(struct maple_subtree_state *mast, - struct maple_enode *left, - struct maple_enode *middle, - struct maple_enode *right, - unsigned char split, - unsigned char mid_split) -{ - unsigned char slot; - struct maple_enode *l = left; - struct maple_enode *r = right; - - if (mas_is_none(mast->l)) - return; - - if (middle) - r = middle; - - slot = mast->l->offset; - - mte_mid_split_check(&l, &r, right, slot, &split, mid_split); - mas_set_split_parent(mast->l, l, r, &slot, split); - - mte_mid_split_check(&l, &r, right, slot, &split, mid_split); - mas_set_split_parent(mast->m, l, r, &slot, split); - - mte_mid_split_check(&l, &r, right, slot, &split, mid_split); - mas_set_split_parent(mast->r, l, r, &slot, split); -} - /* * mas_topiary_node() - Dispose of a single node * @mas: The maple state for pushing nodes @@ -2648,103 +2026,6 @@ void node_finalise(struct maple_node *node, enum maple_type mt, ma_set_meta(node, mt, gap_slot, end - 1); } -/* - * mast_cp_to_nodes() - Copy data out to nodes. - * @mast: The maple subtree state - * @left: The left encoded maple node - * @middle: The middle encoded maple node - * @right: The right encoded maple node - * @split: The location to split between left and (middle ? middle : right) - * @mid_split: The location to split between middle and right. 
- */ -static inline void mast_cp_to_nodes(struct maple_subtree_state *mast, - struct maple_enode *left, struct maple_enode *middle, - struct maple_enode *right, unsigned char split, unsigned char mid_split) -{ - bool new_lmax = true; - - mas_node_or_none(mast->l, left); - mas_node_or_none(mast->m, middle); - mas_node_or_none(mast->r, right); - - mast->l->min = mast->orig_l->min; - if (split == mast->bn->b_end) { - mast->l->max = mast->orig_r->max; - new_lmax = false; - } - - mab_mas_cp(mast->bn, 0, split, mast->l, new_lmax); - - if (middle) { - mab_mas_cp(mast->bn, 1 + split, mid_split, mast->m, true); - mast->m->min = mast->bn->pivot[split] + 1; - split = mid_split; - } - - mast->r->max = mast->orig_r->max; - if (right) { - mab_mas_cp(mast->bn, 1 + split, mast->bn->b_end, mast->r, false); - mast->r->min = mast->bn->pivot[split] + 1; - } -} - -/* - * mast_combine_cp_left - Copy in the original left side of the tree into the - * combined data set in the maple subtree state big node. - * @mast: The maple subtree state - */ -static inline void mast_combine_cp_left(struct maple_subtree_state *mast) -{ - unsigned char l_slot = mast->orig_l->offset; - - if (!l_slot) - return; - - mas_mab_cp(mast->orig_l, 0, l_slot - 1, mast->bn, 0); -} - -/* - * mast_combine_cp_right: Copy in the original right side of the tree into the - * combined data set in the maple subtree state big node. 
- * @mast: The maple subtree state - */ -static inline void mast_combine_cp_right(struct maple_subtree_state *mast) -{ - if (mast->bn->pivot[mast->bn->b_end - 1] >= mast->orig_r->max) - return; - - mas_mab_cp(mast->orig_r, mast->orig_r->offset + 1, - mt_slot_count(mast->orig_r->node), mast->bn, - mast->bn->b_end); - mast->orig_r->last = mast->orig_r->max; -} - -/* - * mast_sufficient: Check if the maple subtree state has enough data in the big - * node to create at least one sufficient node - * @mast: the maple subtree state - */ -static inline bool mast_sufficient(struct maple_subtree_state *mast) -{ - if (mast->bn->b_end > mt_min_slot_count(mast->orig_l->node)) - return true; - - return false; -} - -/* - * mast_overflow: Check if there is too much data in the subtree state for a - * single node. - * @mast: The maple subtree state - */ -static inline bool mast_overflow(struct maple_subtree_state *mast) -{ - if (mast->bn->b_end > mt_slot_count(mast->orig_l->node)) - return true; - - return false; -} - static inline void *mtree_range_walk(struct ma_state *mas) { unsigned long *pivots; @@ -3304,158 +2585,6 @@ static inline void cp_dst_to_slots(struct maple_copy *cp, unsigned long min, cp->max = max; } -static void mas_spanning_rebalance_loop(struct ma_state *mas, - struct maple_subtree_state *mast, unsigned char count) -{ - - unsigned char split, mid_split; - unsigned char slot = 0; - unsigned char new_height = 0; /* used if node is a new root */ - struct maple_enode *left = NULL, *middle = NULL, *right = NULL; - struct maple_enode *old_enode; - - /* - * Each level of the tree is examined and balanced, pushing data to the left or - * right, or rebalancing against left or right nodes is employed to avoid - * rippling up the tree to limit the amount of churn. Once a new sub-section of - * the tree is created, there may be a mix of new and old nodes. 
The old nodes - * will have the incorrect parent pointers and currently be in two trees: the - * original tree and the partially new tree. To remedy the parent pointers in - * the old tree, the new data is swapped into the active tree and a walk down - * the tree is performed and the parent pointers are updated. - * See mas_topiary_replace() for more information. - */ - while (count--) { - mast->bn->b_end--; - mast->bn->type = mte_node_type(mast->orig_l->node); - split = mas_mab_to_node(mas, mast->bn, &left, &right, &middle, - &mid_split); - mast_set_split_parents(mast, left, middle, right, split, - mid_split); - mast_cp_to_nodes(mast, left, middle, right, split, mid_split); - new_height++; - - /* - * Copy data from next level in the tree to mast->bn from next - * iteration - */ - memset(mast->bn, 0, sizeof(struct maple_big_node)); - mast->bn->type = mte_node_type(left); - - /* Root already stored in l->node. */ - if (mas_is_root_limits(mast->l)) - goto new_root; - - mast_ascend(mast); - mast_combine_cp_left(mast); - mast->l->offset = mast->bn->b_end; - mab_set_b_end(mast->bn, mast->l, left); - mab_set_b_end(mast->bn, mast->m, middle); - mab_set_b_end(mast->bn, mast->r, right); - - /* Copy anything necessary out of the right node. */ - mast_combine_cp_right(mast); - mast->orig_l->last = mast->orig_l->max; - - if (mast_sufficient(mast)) { - if (mast_overflow(mast)) - continue; - - if (mast->orig_l->node == mast->orig_r->node) { - /* - * The data in b_node should be stored in one - * node and in the tree - */ - slot = mast->l->offset; - break; - } - - continue; - } - - /* May be a new root stored in mast->bn */ - if (mas_is_root_limits(mast->orig_l)) - break; - - mast_spanning_rebalance(mast); - - /* rebalancing from other nodes may require another loop. 
*/ - if (!count) - count++; - } - - mast->l->node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), - mte_node_type(mast->orig_l->node)); - - mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, mast->l, true); - new_height++; - mas_set_parent(mas, left, mast->l->node, slot); - if (middle) - mas_set_parent(mas, middle, mast->l->node, ++slot); - - if (right) - mas_set_parent(mas, right, mast->l->node, ++slot); - - if (mas_is_root_limits(mast->l)) { -new_root: - mas_mn(mast->l)->parent = ma_parent_ptr(mas_tree_parent(mas)); - while (!mte_is_root(mast->orig_l->node)) - mast_ascend(mast); - } else { - mas_mn(mast->l)->parent = mas_mn(mast->orig_l)->parent; - } - - old_enode = mast->orig_l->node; - mas->depth = mast->l->depth; - mas->node = mast->l->node; - mas->min = mast->l->min; - mas->max = mast->l->max; - mas->offset = mast->l->offset; - mas_wmb_replace(mas, old_enode, new_height); - mtree_range_walk(mas); -} - -/* - * mas_spanning_rebalance() - Rebalance across two nodes which may not be peers. - * @mas: The starting maple state - * @mast: The maple_subtree_state, keeps track of 4 maple states. - * @count: The estimated count of iterations needed. - * - * Follow the tree upwards from @l_mas and @r_mas for @count, or until the root - * is hit. First @b_node is split into two entries which are inserted into the - * next iteration of the loop. @b_node is returned populated with the final - * iteration. @mas is used to obtain allocations. orig_l_mas keeps track of the - * nodes that will remain active by using orig_l_mas->index and orig_l_mas->last - * to account of what has been copied into the new sub-tree. The update of - * orig_l_mas->last is used in mas_consume to find the slots that will need to - * be either freed or destroyed. orig_l_mas->depth keeps track of the height of - * the new sub-tree in case the sub-tree becomes the full tree. 
- */ -static void mas_spanning_rebalance(struct ma_state *mas, - struct maple_subtree_state *mast, unsigned char count) -{ - - MA_STATE(l_mas, mas->tree, mas->index, mas->index); - MA_STATE(r_mas, mas->tree, mas->index, mas->last); - MA_STATE(m_mas, mas->tree, mas->index, mas->index); - - /* - * The tree needs to be rebalanced and leaves need to be kept at the same level. - * Rebalancing is done by use of the ``struct maple_topiary``. - */ - mast->l = &l_mas; - mast->m = &m_mas; - mast->r = &r_mas; - l_mas.status = r_mas.status = m_mas.status = ma_none; - - /* Check if this is not root and has sufficient data. */ - if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) && - unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) - mast_spanning_rebalance(mast); - - mas_spanning_rebalance_loop(mas, mast, count); -} - static inline bool cp_is_new_root(struct maple_copy *cp, struct ma_state *mas) { if (cp->min || cp->max != ULONG_MAX) @@ -3594,319 +2723,6 @@ static inline bool rebalance_ascend(struct maple_copy *cp, return true; } -/* - * mas_rebalance() - Rebalance a given node. - * @mas: The maple state - * @b_node: The big maple node. - * - * Rebalance two nodes into a single node or two new nodes that are sufficient. - * Continue upwards until tree is sufficient. - */ -static inline void mas_rebalance(struct ma_state *mas, - struct maple_big_node *b_node) -{ - char empty_count = mas_mt_height(mas); - struct maple_subtree_state mast; - unsigned char shift, b_end = ++b_node->b_end; - - MA_STATE(l_mas, mas->tree, mas->index, mas->last); - MA_STATE(r_mas, mas->tree, mas->index, mas->last); - - trace_ma_op(TP_FCT, mas); - - /* - * Rebalancing occurs if a node is insufficient. Data is rebalanced - * against the node to the right if it exists, otherwise the node to the - * left of this node is rebalanced against this node. 
If rebalancing - * causes just one node to be produced instead of two, then the parent - * is also examined and rebalanced if it is insufficient. Every level - * tries to combine the data in the same way. If one node contains the - * entire range of the tree, then that node is used as a new root node. - */ - - mast.orig_l = &l_mas; - mast.orig_r = &r_mas; - mast.bn = b_node; - mast.bn->type = mte_node_type(mas->node); - - l_mas = r_mas = *mas; - - if (mas_next_sibling(&r_mas)) { - mas_mab_cp(&r_mas, 0, mt_slot_count(r_mas.node), b_node, b_end); - r_mas.last = r_mas.index = r_mas.max; - } else { - mas_prev_sibling(&l_mas); - shift = mas_data_end(&l_mas) + 1; - mab_shift_right(b_node, shift); - mas->offset += shift; - mas_mab_cp(&l_mas, 0, shift - 1, b_node, 0); - b_node->b_end = shift + b_end; - l_mas.index = l_mas.last = l_mas.min; - } - - return mas_spanning_rebalance(mas, &mast, empty_count); -} - -/* - * mas_split_final_node() - Split the final node in a subtree operation. - * @mast: the maple subtree state - * @mas: The maple state - */ -static inline void mas_split_final_node(struct maple_subtree_state *mast, - struct ma_state *mas) -{ - struct maple_enode *ancestor; - - if (mte_is_root(mas->node)) { - if (mt_is_alloc(mas->tree)) - mast->bn->type = maple_arange_64; - else - mast->bn->type = maple_range_64; - } - /* - * Only a single node is used here, could be root. - * The Big_node data should just fit in a single node. 
- */ - ancestor = mas_new_ma_node(mas, mast->bn); - mas_set_parent(mas, mast->l->node, ancestor, mast->l->offset); - mas_set_parent(mas, mast->r->node, ancestor, mast->r->offset); - mte_to_node(ancestor)->parent = mas_mn(mas)->parent; - - mast->l->node = ancestor; - mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, mast->l, true); - mas->offset = mast->bn->b_end - 1; -} - -/* - * mast_fill_bnode() - Copy data into the big node in the subtree state - * @mast: The maple subtree state - * @mas: the maple state - * @skip: The number of entries to skip for new nodes insertion. - */ -static inline void mast_fill_bnode(struct maple_subtree_state *mast, - struct ma_state *mas, - unsigned char skip) -{ - bool cp = true; - unsigned char split; - - memset(mast->bn, 0, sizeof(struct maple_big_node)); - - if (mte_is_root(mas->node)) { - cp = false; - } else { - mas_ascend(mas); - mas->offset = mte_parent_slot(mas->node); - } - - if (cp && mast->l->offset) - mas_mab_cp(mas, 0, mast->l->offset - 1, mast->bn, 0); - - split = mast->bn->b_end; - mab_set_b_end(mast->bn, mast->l, mast->l->node); - mast->r->offset = mast->bn->b_end; - mab_set_b_end(mast->bn, mast->r, mast->r->node); - if (mast->bn->pivot[mast->bn->b_end - 1] == mas->max) - cp = false; - - if (cp) - mas_mab_cp(mas, split + skip, mt_slot_count(mas->node) - 1, - mast->bn, mast->bn->b_end); - - mast->bn->b_end--; - mast->bn->type = mte_node_type(mas->node); -} - -/* - * mast_split_data() - Split the data in the subtree state big node into regular - * nodes. 
- * @mast: The maple subtree state - * @mas: The maple state - * @split: The location to split the big node - */ -static inline void mast_split_data(struct maple_subtree_state *mast, - struct ma_state *mas, unsigned char split) -{ - unsigned char p_slot; - - mab_mas_cp(mast->bn, 0, split, mast->l, true); - mte_set_pivot(mast->r->node, 0, mast->r->max); - mab_mas_cp(mast->bn, split + 1, mast->bn->b_end, mast->r, false); - mast->l->offset = mte_parent_slot(mas->node); - mast->l->max = mast->bn->pivot[split]; - mast->r->min = mast->l->max + 1; - if (mte_is_leaf(mas->node)) - return; - - p_slot = mast->orig_l->offset; - mas_set_split_parent(mast->orig_l, mast->l->node, mast->r->node, - &p_slot, split); - mas_set_split_parent(mast->orig_r, mast->l->node, mast->r->node, - &p_slot, split); -} - -/* - * mas_push_data() - Instead of splitting a node, it is beneficial to push the - * data to the right or left node if there is room. - * @mas: The maple state - * @mast: The maple subtree state - * @left: Push left or not. - * - * Keeping the height of the tree low means faster lookups. - * - * Return: True if pushed, false otherwise. 
- */ -static inline bool mas_push_data(struct ma_state *mas, - struct maple_subtree_state *mast, bool left) -{ - unsigned char slot_total = mast->bn->b_end; - unsigned char end, space, split; - - MA_STATE(tmp_mas, mas->tree, mas->index, mas->last); - tmp_mas = *mas; - tmp_mas.depth = mast->l->depth; - - if (left && !mas_prev_sibling(&tmp_mas)) - return false; - else if (!left && !mas_next_sibling(&tmp_mas)) - return false; - - end = mas_data_end(&tmp_mas); - slot_total += end; - space = 2 * mt_slot_count(mas->node) - 2; - /* -2 instead of -1 to ensure there isn't a triple split */ - if (ma_is_leaf(mast->bn->type)) - space--; - - if (mas->max == ULONG_MAX) - space--; - - if (slot_total >= space) - return false; - - /* Get the data; Fill mast->bn */ - mast->bn->b_end++; - if (left) { - mab_shift_right(mast->bn, end + 1); - mas_mab_cp(&tmp_mas, 0, end, mast->bn, 0); - mast->bn->b_end = slot_total + 1; - } else { - mas_mab_cp(&tmp_mas, 0, end, mast->bn, mast->bn->b_end); - } - - /* Configure mast for splitting of mast->bn */ - split = mt_slots[mast->bn->type] - 2; - if (left) { - /* Switch mas to prev node */ - *mas = tmp_mas; - /* Start using mast->l for the left side. */ - tmp_mas.node = mast->l->node; - *mast->l = tmp_mas; - } else { - tmp_mas.node = mast->r->node; - *mast->r = tmp_mas; - split = slot_total - split; - } - split = mab_no_null_split(mast->bn, split, mt_slots[mast->bn->type]); - /* Update parent slot for split calculation. */ - if (left) - mast->orig_l->offset += end + 1; - - mast_split_data(mast, mas, split); - mast_fill_bnode(mast, mas, 2); - mas_split_final_node(mast, mas); - return true; -} - -/* - * mas_split() - Split data that is too big for one node into two. 
- * @mas: The maple state - * @b_node: The maple big node - */ -static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) -{ - struct maple_subtree_state mast; - int height = 0; - unsigned int orig_height = mas_mt_height(mas); - unsigned char mid_split, split = 0; - struct maple_enode *old; - - /* - * Splitting is handled differently from any other B-tree; the Maple - * Tree splits upwards. Splitting up means that the split operation - * occurs when the walk of the tree hits the leaves and not on the way - * down. The reason for splitting up is that it is impossible to know - * how much space will be needed until the leaf is (or leaves are) - * reached. Since overwriting data is allowed and a range could - * overwrite more than one range or result in changing one entry into 3 - * entries, it is impossible to know if a split is required until the - * data is examined. - * - * Splitting is a balancing act between keeping allocations to a minimum - * and avoiding a 'jitter' event where a tree is expanded to make room - * for an entry followed by a contraction when the entry is removed. To - * accomplish the balance, there are empty slots remaining in both left - * and right nodes after a split. - */ - MA_STATE(l_mas, mas->tree, mas->index, mas->last); - MA_STATE(r_mas, mas->tree, mas->index, mas->last); - MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last); - MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last); - - trace_ma_op(TP_FCT, mas); - - mast.l = &l_mas; - mast.r = &r_mas; - mast.orig_l = &prev_l_mas; - mast.orig_r = &prev_r_mas; - mast.bn = b_node; - - while (height++ <= orig_height) { - if (mt_slots[b_node->type] > b_node->b_end) { - mas_split_final_node(&mast, mas); - break; - } - - l_mas = r_mas = *mas; - l_mas.node = mas_new_ma_node(mas, b_node); - r_mas.node = mas_new_ma_node(mas, b_node); - /* - * Another way that 'jitter' is avoided is to terminate a split up early if the - * left or right node has space to spare. 
This is referred to as "pushing left" - * or "pushing right" and is similar to the B* tree, except the nodes left or - * right can rarely be reused due to RCU, but the ripple upwards is halted which - * is a significant savings. - */ - /* Try to push left. */ - if (mas_push_data(mas, &mast, true)) { - height++; - break; - } - /* Try to push right. */ - if (mas_push_data(mas, &mast, false)) { - height++; - break; - } - - split = mab_calc_split(mas, b_node, &mid_split); - mast_split_data(&mast, mas, split); - /* - * Usually correct, mab_mas_cp in the above call overwrites - * r->max. - */ - mast.r->max = mas->max; - mast_fill_bnode(&mast, mas, 1); - prev_l_mas = *mast.l; - prev_r_mas = *mast.r; - } - - /* Set the original node as dead */ - old = mas->node; - mas->node = l_mas.node; - mas_wmb_replace(mas, old, height); - mtree_range_walk(mas); -} - /* * mas_root_expand() - Expand a root to a node * @mas: The maple state From 2969241fa22e39f0c751e96d2f53b1d9dfac19ba Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:33 -0500 Subject: [PATCH 035/369] maple_tree: pass maple copy node to mas_wmb_replace() mas_wmb_replace() is called in three places with the same setup, move the setup into the function itself. The function needs to be relocated as it calls mtree_range_walk(). Link: https://lkml.kernel.org/r/20260130205935.2559335-29-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 60 ++++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 51ff311ff5b6..fe8424f4657d 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1900,26 +1900,6 @@ static inline void mas_topiary_replace(struct ma_state *mas, mas_mat_destroy(mas, &subtrees); } -/* - * mas_wmb_replace() - Write memory barrier and replace - * @mas: The maple state - * @old_enode: The old maple encoded node that is being replaced. - * @new_height: The new height of the tree as a result of the operation - * - * Updates gap as necessary. - */ -static inline void mas_wmb_replace(struct ma_state *mas, - struct maple_enode *old_enode, unsigned char new_height) -{ - /* Insert the new data in the tree */ - mas_topiary_replace(mas, old_enode, new_height); - - if (mte_is_leaf(mas->node)) - return; - - mas_update_gap(mas); -} - /* * node_copy() - Copy from one node to another. * @@ -2086,6 +2066,28 @@ dead_node: return NULL; } +/* + * mas_wmb_replace() - Write memory barrier and replace + * @mas: The maple state + * @cp: The maple copy node + * + * Updates gap as necessary. 
+ */ +static inline void mas_wmb_replace(struct ma_state *mas, struct maple_copy *cp) +{ + struct maple_enode *old_enode; + + old_enode = mas->node; + mas->node = mt_slot_locked(mas->tree, cp->slot, 0); + /* Insert the new data in the tree */ + mas_topiary_replace(mas, old_enode, cp->height); + if (!mte_is_leaf(mas->node)) + mas_update_gap(mas); + + mtree_range_walk(mas); +} + + /* * cp_leaf_init() - Initialize a maple_copy node for the leaf level of a * spanning store @@ -3044,7 +3046,6 @@ done: */ static void mas_wr_spanning_store(struct ma_wr_state *wr_mas) { - struct maple_enode *old_enode; struct maple_copy cp; struct ma_state *mas; struct ma_state sib; @@ -3112,10 +3113,7 @@ static void mas_wr_spanning_store(struct ma_wr_state *wr_mas) cp_data_write(&cp, mas); } while (spanning_ascend(&cp, mas, wr_mas, &r_wr_mas, &sib)); - old_enode = mas->node; - mas->node = mt_slot_locked(mas->tree, cp.slot, 0); - mas_wmb_replace(mas, old_enode, cp.height); - mtree_range_walk(mas); + mas_wmb_replace(mas, &cp); } /* @@ -3433,7 +3431,6 @@ static inline void split_data(struct maple_copy *cp, */ static void mas_wr_split(struct ma_wr_state *wr_mas) { - struct maple_enode *old_enode; struct ma_state parent; struct ma_state *mas; struct maple_copy cp; @@ -3454,10 +3451,7 @@ static void mas_wr_split(struct ma_wr_state *wr_mas) cp_data_write(&cp, mas); } while (split_ascend(&cp, wr_mas, &sib, &parent)); - old_enode = mas->node; - mas->node = mt_slot_locked(mas->tree, cp.slot, 0); - mas_wmb_replace(mas, old_enode, cp.height); - mtree_range_walk(mas); + mas_wmb_replace(mas, &cp); } /* @@ -3470,7 +3464,6 @@ static void mas_wr_split(struct ma_wr_state *wr_mas) */ static void mas_wr_rebalance(struct ma_wr_state *wr_mas) { - struct maple_enode *old_enode; struct ma_state parent; struct ma_state *mas; struct maple_copy cp; @@ -3501,10 +3494,7 @@ static void mas_wr_rebalance(struct ma_wr_state *wr_mas) cp_data_write(&cp, mas); } while (rebalance_ascend(&cp, wr_mas, &sib, &parent)); - 
old_enode = mas->node; - mas->node = mt_slot_locked(mas->tree, cp.slot, 0); - mas_wmb_replace(mas, old_enode, cp.height); - mtree_range_walk(mas); + mas_wmb_replace(mas, &cp); } /* From b82f4c811e4d22b308dccf9f7b0a382b0105c190 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:34 -0500 Subject: [PATCH 036/369] maple_tree: don't pass end to mas_wr_append() Figure out the end internally. This is necessary for future cleanups. Link: https://lkml.kernel.org/r/20260130205935.2559335-30-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index fe8424f4657d..46111912b26b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3309,18 +3309,17 @@ static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) /* * mas_wr_append: Attempt to append * @wr_mas: the maple write state - * @new_end: The end of the node after the modification * * This is currently unsafe in rcu mode since the end of the node may be cached * by readers while the node contents may be updated which could result in * inaccurate information. 
*/ -static inline void mas_wr_append(struct ma_wr_state *wr_mas, - unsigned char new_end) +static inline void mas_wr_append(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; void __rcu **slots; unsigned char end = mas->end; + unsigned char new_end = mas_wr_new_end(wr_mas); if (new_end < mt_pivots[wr_mas->type]) { wr_mas->pivots[new_end] = wr_mas->pivots[end]; @@ -3513,7 +3512,7 @@ static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) mas_update_gap(mas); break; case wr_append: - mas_wr_append(wr_mas, new_end); + mas_wr_append(wr_mas); break; case wr_slot_store: mas_wr_slot_store(wr_mas); From 0e8cf9a31a8c3f996c48b633a673076d6997b093 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Jan 2026 15:59:35 -0500 Subject: [PATCH 037/369] maple_tree: clean up mas_wr_node_store() The new_end does not need to be passed in as the data is already being checked. This allows for other areas to skip getting the node new_end in the calling function. The type was incorrectly void * instead of void __rcu *, which isn't an issue but is technically incorrect. Move the variable assignment to after the declarations to clean up the initial setup. Ensure there is something to copy before calling memcpy(). Link: https://lkml.kernel.org/r/20260130205935.2559335-31-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Alice Ryhl Cc: Andrew Ballance Cc: Arnd Bergmann Cc: Christian Kujau Cc: Geert Uytterhoeven Cc: Kuninori Morimoto Cc: Matthew Wilcox (Oracle) Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/maple_tree.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 46111912b26b..d18d7ed9ab67 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3122,20 +3122,28 @@ static void mas_wr_spanning_store(struct ma_wr_state *wr_mas) * * Attempts to reuse the node, but may allocate. 
*/ -static inline void mas_wr_node_store(struct ma_wr_state *wr_mas, - unsigned char new_end) +static inline void mas_wr_node_store(struct ma_wr_state *wr_mas) { - struct ma_state *mas = wr_mas->mas; - void __rcu **dst_slots; - unsigned long *dst_pivots; - unsigned char dst_offset, offset_end = wr_mas->offset_end; + unsigned char dst_offset, offset_end; + unsigned char copy_size, node_pivots; struct maple_node reuse, *newnode; - unsigned char copy_size, node_pivots = mt_pivots[wr_mas->type]; - bool in_rcu = mt_in_rcu(mas->tree); - unsigned char height = mas_mt_height(mas); + unsigned long *dst_pivots; + void __rcu **dst_slots; + unsigned char new_end; + struct ma_state *mas; + bool in_rcu; - if (mas->last == wr_mas->end_piv) + mas = wr_mas->mas; + trace_ma_op(TP_FCT, mas); + in_rcu = mt_in_rcu(mas->tree); + offset_end = wr_mas->offset_end; + node_pivots = mt_pivots[wr_mas->type]; + /* Assume last adds an entry */ + new_end = mas->end + 1 - offset_end + mas->offset; + if (mas->last == wr_mas->end_piv) { offset_end++; /* don't copy this offset */ + new_end--; + } /* set up node. */ if (in_rcu) { @@ -3149,13 +3157,16 @@ static inline void mas_wr_node_store(struct ma_wr_state *wr_mas, dst_pivots = ma_pivots(newnode, wr_mas->type); dst_slots = ma_slots(newnode, wr_mas->type); /* Copy from start to insert point */ - memcpy(dst_pivots, wr_mas->pivots, sizeof(unsigned long) * mas->offset); - memcpy(dst_slots, wr_mas->slots, sizeof(void *) * mas->offset); + if (mas->offset) { + memcpy(dst_pivots, wr_mas->pivots, sizeof(unsigned long) * mas->offset); + memcpy(dst_slots, wr_mas->slots, sizeof(void __rcu *) * mas->offset); + } /* Handle insert of new range starting after old range */ if (wr_mas->r_min < mas->index) { rcu_assign_pointer(dst_slots[mas->offset], wr_mas->content); dst_pivots[mas->offset++] = mas->index - 1; + new_end++; } /* Store the new entry and range end. 
*/ @@ -3174,7 +3185,7 @@ static inline void mas_wr_node_store(struct ma_wr_state *wr_mas, /* Copy to the end of node if necessary. */ copy_size = mas->end - offset_end + 1; memcpy(dst_slots + dst_offset, wr_mas->slots + offset_end, - sizeof(void *) * copy_size); + sizeof(void __rcu *) * copy_size); memcpy(dst_pivots + dst_offset, wr_mas->pivots + offset_end, sizeof(unsigned long) * (copy_size - 1)); @@ -3187,7 +3198,7 @@ done: struct maple_enode *old_enode = mas->node; mas->node = mt_mk_node(newnode, wr_mas->type); - mas_replace_node(mas, old_enode, height); + mas_replace_node(mas, old_enode, mas_mt_height(mas)); } else { memcpy(wr_mas->node, newnode, sizeof(struct maple_node)); } @@ -3503,7 +3514,6 @@ static void mas_wr_rebalance(struct ma_wr_state *wr_mas) static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; - unsigned char new_end = mas_wr_new_end(wr_mas); switch (mas->store_type) { case wr_exact_fit: @@ -3518,7 +3528,7 @@ static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) mas_wr_slot_store(wr_mas); break; case wr_node_store: - mas_wr_node_store(wr_mas, new_end); + mas_wr_node_store(wr_mas); break; case wr_spanning_store: mas_wr_spanning_store(wr_mas); From 3d443691ed00eff21ca049ac49c584cdcc7134ac Mon Sep 17 00:00:00 2001 From: Seongsu Park Date: Thu, 19 Feb 2026 15:35:06 +0900 Subject: [PATCH 038/369] mm/pkeys: remove unused tsk parameter from arch_set_user_pkey_access() The tsk parameter in arch_set_user_pkey_access() is never used in the function implementations across all architectures (arm64, powerpc, x86). 
Link: https://lkml.kernel.org/r/20260219063506.545148-1-sgsu.park@samsung.com Signed-off-by: Seongsu Park Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Ingo Molnar Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pkeys.h | 3 +-- arch/arm64/mm/mmu.c | 2 +- arch/powerpc/include/asm/pkeys.h | 8 +++----- arch/powerpc/mm/book3s64/pkeys.c | 3 +-- arch/x86/include/asm/pkeys.h | 3 +-- arch/x86/kernel/fpu/xstate.c | 3 +-- arch/x86/mm/pkeys.c | 3 +-- mm/mprotect.c | 2 +- 8 files changed, 10 insertions(+), 17 deletions(-) diff --git a/arch/arm64/include/asm/pkeys.h b/arch/arm64/include/asm/pkeys.h index 0ca5f83ce148..cd286d3a9c94 100644 --- a/arch/arm64/include/asm/pkeys.h +++ b/arch/arm64/include/asm/pkeys.h @@ -12,8 +12,7 @@ #define arch_max_pkey() 8 -int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val); +int arch_set_user_pkey_access(int pkey, unsigned long init_val); static inline bool arch_pkeys_enabled(void) { diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a6a00accf4f9..a12ea8776c32 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -2206,7 +2206,7 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) } #ifdef CONFIG_ARCH_HAS_PKEYS -int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) +int arch_set_user_pkey_access(int pkey, unsigned long init_val) { u64 new_por; u64 old_por; diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h index 28e752138996..bd20d4106471 100644 --- a/arch/powerpc/include/asm/pkeys.h +++ b/arch/powerpc/include/asm/pkeys.h @@ -143,10 +143,8 @@ static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, return __arch_override_mprotect_pkey(vma, prot, pkey); } -extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val); -static inline int 
arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val) +extern int __arch_set_user_pkey_access(int pkey, unsigned long init_val); +static inline int arch_set_user_pkey_access(int pkey, unsigned long init_val) { if (!mmu_has_feature(MMU_FTR_PKEY)) return -EINVAL; @@ -160,7 +158,7 @@ static inline int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, if (pkey == 0) return init_val ? -EINVAL : 0; - return __arch_set_user_pkey_access(tsk, pkey, init_val); + return __arch_set_user_pkey_access(pkey, init_val); } static inline bool arch_pkeys_enabled(void) diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c index a974baf8f327..2002331b05ba 100644 --- a/arch/powerpc/mm/book3s64/pkeys.c +++ b/arch/powerpc/mm/book3s64/pkeys.c @@ -335,8 +335,7 @@ static inline void init_iamr(int pkey, u8 init_bits) * Set the access rights in AMR IAMR and UAMOR registers for @pkey to that * specified in @init_val. */ -int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val) +int __arch_set_user_pkey_access(int pkey, unsigned long init_val) { u64 new_amr_bits = 0x0ul; u64 new_iamr_bits = 0x0ul; diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 2e6c04d8a45b..06ed2cd2592e 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -9,8 +9,7 @@ */ #define arch_max_pkey() (cpu_feature_enabled(X86_FEATURE_OSPKE) ? 
16 : 1) -extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val); +extern int arch_set_user_pkey_access(int pkey, unsigned long init_val); static inline bool arch_pkeys_enabled(void) { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 76153dfb58c9..a7b6524a9dea 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1080,8 +1080,7 @@ void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_ * This will go out and modify PKRU register to set the access * rights for @pkey to @init_val. */ -int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val) +int arch_set_user_pkey_access(int pkey, unsigned long init_val) { u32 old_pkru, new_pkru_bits = 0; int pkey_shift; diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 7418c367e328..1dfcfaf77e23 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -42,8 +42,7 @@ int __execute_only_pkey(struct mm_struct *mm) * Set up PKRU so that it denies access for everything * other than execution. */ - ret = arch_set_user_pkey_access(current, execute_only_pkey, - PKEY_DISABLE_ACCESS); + ret = arch_set_user_pkey_access(execute_only_pkey, PKEY_DISABLE_ACCESS); /* * If the PKRU-set operation failed somehow, just return * 0 and effectively disable execute-only support. 
diff --git a/mm/mprotect.c b/mm/mprotect.c index c0571445bef7..9681f055b9fc 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -978,7 +978,7 @@ SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val) if (pkey == -1) goto out; - ret = arch_set_user_pkey_access(current, pkey, init_val); + ret = arch_set_user_pkey_access(pkey, init_val); if (ret) { mm_pkey_free(current->mm, pkey); goto out; From 54218f10dfbe88c8e41c744fd45a756cde60b8c4 Mon Sep 17 00:00:00 2001 From: AnishMulay Date: Wed, 18 Feb 2026 11:39:41 -0500 Subject: [PATCH 039/369] selftests/mm: skip migration tests if NUMA is unavailable Currently, the migration test asserts that numa_available() returns 0. On systems where NUMA is not available (returning -1), such as certain ARM64 configurations or single-node systems, this assertion fails and crashes the test. Update the test to check the return value of numa_available(). If it is less than 0, skip the test gracefully instead of failing. This aligns the behavior with other MM selftests (like rmap) that skip when NUMA support is missing. 
Link: https://lkml.kernel.org/r/20260218163941.13499-1-anishm7030@gmail.com Fixes: 0c2d08728470 ("mm: add selftests for migration entries") Signed-off-by: AnishMulay Reviewed-by: SeongJae Park Reviewed-by: Dev Jain Reviewed-by: Anshuman Khandual Tested-by: Sayali Patil Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/migration.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c index ee24b88c2b24..60e78bbfc0e3 100644 --- a/tools/testing/selftests/mm/migration.c +++ b/tools/testing/selftests/mm/migration.c @@ -36,7 +36,8 @@ FIXTURE_SETUP(migration) { int n; - ASSERT_EQ(numa_available(), 0); + if (numa_available() < 0) + SKIP(return, "NUMA not available"); self->nthreads = numa_num_task_cpus() - 1; self->n1 = -1; self->n2 = -1; From e4f4fc7aa8b720d934a0bfcea7f8aae4271d308f Mon Sep 17 00:00:00 2001 From: "JP Kobryn (Meta)" Date: Thu, 19 Feb 2026 15:58:46 -0800 Subject: [PATCH 040/369] mm: move pgscan, pgsteal, pgrefill to node stats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are situations where reclaim kicks in on a system with free memory. One possible cause is a NUMA imbalance scenario where one or more nodes are under pressure. It would help if we could easily identify such nodes. Move the pgscan, pgsteal, and pgrefill counters from vm_event_item to node_stat_item to provide per-node reclaim visibility. With these counters as node stats, the values are now displayed in the per-node section of /proc/zoneinfo, which allows for quick identification of the affected nodes. /proc/vmstat continues to report the same counters, aggregated across all nodes. 
But the ordering of these items within the readout changes as they move from the vm events section to the node stats section. Memcg accounting of these counters is preserved. The relocated counters remain visible in memory.stat alongside the existing aggregate pgscan and pgsteal counters. However, this change affects how the global counters are accumulated. Previously, the global event count update was gated on !cgroup_reclaim(), excluding memcg-based reclaim from /proc/vmstat. Now that mod_lruvec_state() is being used to update the counters, the global counters will include all reclaim. This is consistent with how pgdemote counters are already tracked. Finally, the virtio_balloon driver is updated to use global_node_page_state() to fetch the counters, as they are no longer accessible through the vm_events array. Link: https://lkml.kernel.org/r/20260219235846.161910-1-jp.kobryn@linux.dev Signed-off-by: JP Kobryn Suggested-by: Johannes Weiner Acked-by: Michael S. Tsirkin Reviewed-by: Vlastimil Babka (SUSE) Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: Alistair Popple Cc: Axel Rasmussen Cc: Byungchul Park Cc: David Hildenbrand Cc: Eugenio Pérez Cc: Gregory Price Cc: "Huang, Ying" Cc: Jason Wang Cc: Joshua Hahn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mathew Brost Cc: Mike Rapoport Cc: Muchun Song Cc: Qi Zheng Cc: Rakie Kim Cc: Roman Gushchin Cc: Suren Baghdasaryan Cc: Wei Xu Cc: Xuan Zhuo Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- drivers/virtio/virtio_balloon.c | 8 ++--- include/linux/mmzone.h | 13 ++++++++ include/linux/vm_event_item.h | 13 -------- mm/memcontrol.c | 56 +++++++++++++++++++++++---------- mm/vmscan.c | 39 ++++++++--------------- mm/vmstat.c | 26 +++++++-------- 6 files changed, 82 insertions(+), 73 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index d1fbc8fe8470..7f15bf162e88 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c 
@@ -369,13 +369,13 @@ static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb) update_stat(vb, idx++, VIRTIO_BALLOON_S_ALLOC_STALL, stall); update_stat(vb, idx++, VIRTIO_BALLOON_S_ASYNC_SCAN, - pages_to_bytes(events[PGSCAN_KSWAPD])); + pages_to_bytes(global_node_page_state(PGSCAN_KSWAPD))); update_stat(vb, idx++, VIRTIO_BALLOON_S_DIRECT_SCAN, - pages_to_bytes(events[PGSCAN_DIRECT])); + pages_to_bytes(global_node_page_state(PGSCAN_DIRECT))); update_stat(vb, idx++, VIRTIO_BALLOON_S_ASYNC_RECLAIM, - pages_to_bytes(events[PGSTEAL_KSWAPD])); + pages_to_bytes(global_node_page_state(PGSTEAL_KSWAPD))); update_stat(vb, idx++, VIRTIO_BALLOON_S_DIRECT_RECLAIM, - pages_to_bytes(events[PGSTEAL_DIRECT])); + pages_to_bytes(global_node_page_state(PGSTEAL_DIRECT))); #ifdef CONFIG_HUGETLB_PAGE update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3e51190a55e4..546bca95ca40 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -255,6 +255,19 @@ enum node_stat_item { PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, PGDEMOTE_PROACTIVE, + PGSTEAL_KSWAPD, + PGSTEAL_DIRECT, + PGSTEAL_KHUGEPAGED, + PGSTEAL_PROACTIVE, + PGSTEAL_ANON, + PGSTEAL_FILE, + PGSCAN_KSWAPD, + PGSCAN_DIRECT, + PGSCAN_KHUGEPAGED, + PGSCAN_PROACTIVE, + PGSCAN_ANON, + PGSCAN_FILE, + PGREFILL, #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 22a139f82d75..03fe95f5a020 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -38,21 +38,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE, PGFAULT, PGMAJFAULT, PGLAZYFREED, - PGREFILL, PGREUSE, - PGSTEAL_KSWAPD, - PGSTEAL_DIRECT, - PGSTEAL_KHUGEPAGED, - PGSTEAL_PROACTIVE, - PGSCAN_KSWAPD, - PGSCAN_DIRECT, - PGSCAN_KHUGEPAGED, - PGSCAN_PROACTIVE, PGSCAN_DIRECT_THROTTLE, - PGSCAN_ANON, - PGSCAN_FILE, - PGSTEAL_ANON, - PGSTEAL_FILE, 
#ifdef CONFIG_NUMA PGSCAN_ZONE_RECLAIM_SUCCESS, PGSCAN_ZONE_RECLAIM_FAILED, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 772bac21d155..af75f10150a8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -330,6 +330,19 @@ static const unsigned int memcg_node_stat_items[] = { PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, PGDEMOTE_PROACTIVE, + PGSTEAL_KSWAPD, + PGSTEAL_DIRECT, + PGSTEAL_KHUGEPAGED, + PGSTEAL_PROACTIVE, + PGSTEAL_ANON, + PGSTEAL_FILE, + PGSCAN_KSWAPD, + PGSCAN_DIRECT, + PGSCAN_KHUGEPAGED, + PGSCAN_PROACTIVE, + PGSCAN_ANON, + PGSCAN_FILE, + PGREFILL, #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif @@ -443,17 +456,8 @@ static const unsigned int memcg_vm_event_stat[] = { #endif PSWPIN, PSWPOUT, - PGSCAN_KSWAPD, - PGSCAN_DIRECT, - PGSCAN_KHUGEPAGED, - PGSCAN_PROACTIVE, - PGSTEAL_KSWAPD, - PGSTEAL_DIRECT, - PGSTEAL_KHUGEPAGED, - PGSTEAL_PROACTIVE, PGFAULT, PGMAJFAULT, - PGREFILL, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE, @@ -1400,6 +1404,15 @@ static const struct memory_stat memory_stats[] = { { "pgdemote_direct", PGDEMOTE_DIRECT }, { "pgdemote_khugepaged", PGDEMOTE_KHUGEPAGED }, { "pgdemote_proactive", PGDEMOTE_PROACTIVE }, + { "pgsteal_kswapd", PGSTEAL_KSWAPD }, + { "pgsteal_direct", PGSTEAL_DIRECT }, + { "pgsteal_khugepaged", PGSTEAL_KHUGEPAGED }, + { "pgsteal_proactive", PGSTEAL_PROACTIVE }, + { "pgscan_kswapd", PGSCAN_KSWAPD }, + { "pgscan_direct", PGSCAN_DIRECT }, + { "pgscan_khugepaged", PGSCAN_KHUGEPAGED }, + { "pgscan_proactive", PGSCAN_PROACTIVE }, + { "pgrefill", PGREFILL }, #ifdef CONFIG_NUMA_BALANCING { "pgpromote_success", PGPROMOTE_SUCCESS }, #endif @@ -1443,6 +1456,15 @@ static int memcg_page_state_output_unit(int item) case PGDEMOTE_DIRECT: case PGDEMOTE_KHUGEPAGED: case PGDEMOTE_PROACTIVE: + case PGSTEAL_KSWAPD: + case PGSTEAL_DIRECT: + case PGSTEAL_KHUGEPAGED: + case PGSTEAL_PROACTIVE: + case PGSCAN_KSWAPD: + case PGSCAN_DIRECT: + case PGSCAN_KHUGEPAGED: + case PGSCAN_PROACTIVE: + case PGREFILL: #ifdef CONFIG_NUMA_BALANCING case 
PGPROMOTE_SUCCESS: #endif @@ -1514,15 +1536,15 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) /* Accumulated memory events */ seq_buf_printf(s, "pgscan %lu\n", - memcg_events(memcg, PGSCAN_KSWAPD) + - memcg_events(memcg, PGSCAN_DIRECT) + - memcg_events(memcg, PGSCAN_PROACTIVE) + - memcg_events(memcg, PGSCAN_KHUGEPAGED)); + memcg_page_state(memcg, PGSCAN_KSWAPD) + + memcg_page_state(memcg, PGSCAN_DIRECT) + + memcg_page_state(memcg, PGSCAN_PROACTIVE) + + memcg_page_state(memcg, PGSCAN_KHUGEPAGED)); seq_buf_printf(s, "pgsteal %lu\n", - memcg_events(memcg, PGSTEAL_KSWAPD) + - memcg_events(memcg, PGSTEAL_DIRECT) + - memcg_events(memcg, PGSTEAL_PROACTIVE) + - memcg_events(memcg, PGSTEAL_KHUGEPAGED)); + memcg_page_state(memcg, PGSTEAL_KSWAPD) + + memcg_page_state(memcg, PGSTEAL_DIRECT) + + memcg_page_state(memcg, PGSTEAL_PROACTIVE) + + memcg_page_state(memcg, PGSTEAL_KHUGEPAGED)); for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) { #ifdef CONFIG_MEMCG_V1 diff --git a/mm/vmscan.c b/mm/vmscan.c index 0fc9373e8251..031c5c035a82 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1984,7 +1984,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, unsigned long nr_taken; struct reclaim_stat stat; bool file = is_file_lru(lru); - enum vm_event_item item; + enum node_stat_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); bool stalled = false; @@ -2010,10 +2010,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); item = PGSCAN_KSWAPD + reclaimer_offset(sc); - if (!cgroup_reclaim(sc)) - __count_vm_events(item, nr_scanned); - count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); - __count_vm_events(PGSCAN_ANON + file, nr_scanned); + mod_lruvec_state(lruvec, item, nr_scanned); + mod_lruvec_state(lruvec, PGSCAN_ANON + file, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); @@ -2030,10 +2028,8 @@ static unsigned long 
shrink_inactive_list(unsigned long nr_to_scan, stat.nr_demoted); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); item = PGSTEAL_KSWAPD + reclaimer_offset(sc); - if (!cgroup_reclaim(sc)) - __count_vm_events(item, nr_reclaimed); - count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); - __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); + mod_lruvec_state(lruvec, item, nr_reclaimed); + mod_lruvec_state(lruvec, PGSTEAL_ANON + file, nr_reclaimed); lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); @@ -2120,9 +2116,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - if (!cgroup_reclaim(sc)) - __count_vm_events(PGREFILL, nr_scanned); - count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); + mod_lruvec_state(lruvec, PGREFILL, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); @@ -4543,7 +4537,7 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, { int i; int gen; - enum vm_event_item item; + enum node_stat_item item; int sorted = 0; int scanned = 0; int isolated = 0; @@ -4551,7 +4545,6 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, int scan_batch = min(nr_to_scan, MAX_LRU_BATCH); int remaining = scan_batch; struct lru_gen_folio *lrugen = &lruvec->lrugen; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); VM_WARN_ON_ONCE(!list_empty(list)); @@ -4602,13 +4595,9 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, } item = PGSCAN_KSWAPD + reclaimer_offset(sc); - if (!cgroup_reclaim(sc)) { - __count_vm_events(item, isolated); - __count_vm_events(PGREFILL, sorted); - } - count_memcg_events(memcg, item, isolated); - count_memcg_events(memcg, PGREFILL, sorted); - __count_vm_events(PGSCAN_ANON + type, isolated); + mod_lruvec_state(lruvec, item, isolated); + mod_lruvec_state(lruvec, PGREFILL, sorted); + mod_lruvec_state(lruvec, PGSCAN_ANON + type, isolated); 
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, scan_batch, scanned, skipped, isolated, type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); @@ -4693,7 +4682,7 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, LIST_HEAD(clean); struct folio *folio; struct folio *next; - enum vm_event_item item; + enum node_stat_item item; struct reclaim_stat stat; struct lru_gen_mm_walk *walk; bool skip_retry = false; @@ -4757,10 +4746,8 @@ retry: stat.nr_demoted); item = PGSTEAL_KSWAPD + reclaimer_offset(sc); - if (!cgroup_reclaim(sc)) - __count_vm_events(item, reclaimed); - count_memcg_events(memcg, item, reclaimed); - __count_vm_events(PGSTEAL_ANON + type, reclaimed); + mod_lruvec_state(lruvec, item, reclaimed); + mod_lruvec_state(lruvec, PGSTEAL_ANON + type, reclaimed); spin_unlock_irq(&lruvec->lru_lock); diff --git a/mm/vmstat.c b/mm/vmstat.c index 86b14b0f77b5..44bbb7752f11 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1276,6 +1276,19 @@ const char * const vmstat_text[] = { [I(PGDEMOTE_DIRECT)] = "pgdemote_direct", [I(PGDEMOTE_KHUGEPAGED)] = "pgdemote_khugepaged", [I(PGDEMOTE_PROACTIVE)] = "pgdemote_proactive", + [I(PGSTEAL_KSWAPD)] = "pgsteal_kswapd", + [I(PGSTEAL_DIRECT)] = "pgsteal_direct", + [I(PGSTEAL_KHUGEPAGED)] = "pgsteal_khugepaged", + [I(PGSTEAL_PROACTIVE)] = "pgsteal_proactive", + [I(PGSTEAL_ANON)] = "pgsteal_anon", + [I(PGSTEAL_FILE)] = "pgsteal_file", + [I(PGSCAN_KSWAPD)] = "pgscan_kswapd", + [I(PGSCAN_DIRECT)] = "pgscan_direct", + [I(PGSCAN_KHUGEPAGED)] = "pgscan_khugepaged", + [I(PGSCAN_PROACTIVE)] = "pgscan_proactive", + [I(PGSCAN_ANON)] = "pgscan_anon", + [I(PGSCAN_FILE)] = "pgscan_file", + [I(PGREFILL)] = "pgrefill", #ifdef CONFIG_HUGETLB_PAGE [I(NR_HUGETLB)] = "nr_hugetlb", #endif @@ -1318,21 +1331,8 @@ const char * const vmstat_text[] = { [I(PGMAJFAULT)] = "pgmajfault", [I(PGLAZYFREED)] = "pglazyfreed", - [I(PGREFILL)] = "pgrefill", [I(PGREUSE)] = "pgreuse", - [I(PGSTEAL_KSWAPD)] = "pgsteal_kswapd", - [I(PGSTEAL_DIRECT)] = 
"pgsteal_direct", - [I(PGSTEAL_KHUGEPAGED)] = "pgsteal_khugepaged", - [I(PGSTEAL_PROACTIVE)] = "pgsteal_proactive", - [I(PGSCAN_KSWAPD)] = "pgscan_kswapd", - [I(PGSCAN_DIRECT)] = "pgscan_direct", - [I(PGSCAN_KHUGEPAGED)] = "pgscan_khugepaged", - [I(PGSCAN_PROACTIVE)] = "pgscan_proactive", [I(PGSCAN_DIRECT_THROTTLE)] = "pgscan_direct_throttle", - [I(PGSCAN_ANON)] = "pgscan_anon", - [I(PGSCAN_FILE)] = "pgscan_file", - [I(PGSTEAL_ANON)] = "pgsteal_anon", - [I(PGSTEAL_FILE)] = "pgsteal_file", #ifdef CONFIG_NUMA [I(PGSCAN_ZONE_RECLAIM_SUCCESS)] = "zone_reclaim_success", From e623b4ebee9d3a4b1e408b2c3e60cfc99b4e61ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miquel=20Sabat=C3=A9=20Sol=C3=A0?= Date: Fri, 20 Feb 2026 00:44:07 +0100 Subject: [PATCH 041/369] mm: fix typo in the comment of mod_zone_state() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the proper function name, followed by parenthesis as usual. Link: https://lkml.kernel.org/r/20260219234407.3261196-1-mssola@mssola.com Signed-off-by: Miquel Sabaté Solà Acked-by: SeongJae Park Reviewed-by: Lorenzo Stoakes (Oracle) Cc: David Hildenbrand Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmstat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 44bbb7752f11..667474773dbc 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -547,7 +547,7 @@ EXPORT_SYMBOL(__dec_node_page_state); #ifdef CONFIG_HAVE_CMPXCHG_LOCAL /* * If we have cmpxchg_local support then we do not need to incur the overhead - * that comes with local_irq_save/restore if we use this_cpu_cmpxchg. + * that comes with local_irq_save/restore if we use this_cpu_try_cmpxchg(). * * mod_state() modifies the zone counter state through atomic per cpu * operations. 
From eca4d01b982db4c104f6c83b9ac167c269c4310a Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Feb 2026 04:06:26 +0800 Subject: [PATCH 042/369] mm, swap: protect si->swap_file properly and use as a mount indicator Patch series "mm, swap: swap table phase III: remove swap_map", v3. This series removes the static swap_map and uses the swap table for the swap count directly. This saves about ~30% memory usage for the static swap metadata. For example, this saves 256MB of memory when mounting a 1TB swap device. Performance is slightly better too, since the double update of the swap table and swap_map is now gone. Test results: Mounting a swap device: ======================= Mount a 1TB brd device as SWAP, just to verify the memory save: `free -m` before: total used free shared buff/cache available Mem: 1465 1051 417 1 61 413 Swap: 1054435 0 1054435 `free -m` after: total used free shared buff/cache available Mem: 1465 795 672 1 62 670 Swap: 1054435 0 1054435 Idle memory usage is reduced by ~256MB just as expected. And following this design we should be able to save another ~512MB in a next phase. Build kernel test: ================== Test using ZSWAP with NVME SWAP, make -j48, defconfig, in a x86_64 VM with 5G RAM, under global pressure, avg of 32 test run: Before After: System time: 1038.97s 1013.75s (-2.4%) Test using ZRAM as SWAP, make -j12, tinyconfig, in a ARM64 VM with 1.5G RAM, under global pressure, avg of 32 test run: Before After: System time: 67.75s 66.65s (-1.6%) The result is slightly better. Redis / Valkey benchmark: ========================= Test using ZRAM as SWAP, in a ARM64 VM with 1.5G RAM, under global pressure, avg of 64 test run: Server: valkey-server --maxmemory 2560M Client: redis-benchmark -r 3000000 -n 3000000 -d 1024 -c 12 -P 32 -t get no persistence with BGSAVE Before: 472705.71 RPS 369451.68 RPS After: 481197.93 RPS (+1.8%) 374922.32 RPS (+1.5%) In conclusion, performance is better in all cases, and memory usage is much lower. 
The swap cgroup array will also be merged into the swap table in a later phase, saving the other ~60% part of the static swap metadata and making all the swap metadata dynamic. The improved API for swap operations also reduces the lock contention and makes more batching operations possible. This patch (of 12): /proc/swaps uses si->swap_map as the indicator to check if the swap device is mounted. swap_map will be removed soon, so change it to use si->swap_file instead because: - si->swap_file is exactly the only dynamic content that /proc/swaps is interested in. Previously, it was checking si->swap_map just to ensure si->swap_file is available. si->swap_map is set under mutex protection, and after si->swap_file is set, so having si->swap_map set guarantees si->swap_file is set. - Checking si->flags doesn't work here. SWP_WRITEOK is cleared during swapoff, but /proc/swaps is supposed to show the device under swapoff too to report the swapoff progress. And SWP_USED is set even if the device hasn't been properly set up. We can have another flag, but the easier way is to just check si->swap_file directly. So protect si->swap_file setting with a mutex, and set si->swap_file only when the swap device is truly enabled. /proc/swaps is only interested in si->swap_file and a few static data reads. Only si->swap_file needs protection. Reading other static fields is always fine. 
Link: https://lkml.kernel.org/r/20260218-swap-table-p3-v3-0-f4e34be021a7@tencent.com Link: https://lkml.kernel.org/r/20260218-swap-table-p3-v3-1-f4e34be021a7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Nhat Pham Cc: Kairui Song Cc: kernel test robot Signed-off-by: Andrew Morton --- mm/swapfile.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 90132b74d6a0..281ee2762358 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -110,6 +110,7 @@ struct swap_info_struct *swap_info[MAX_SWAPFILES]; static struct kmem_cache *swap_table_cachep; +/* Protects si->swap_file for /proc/swaps usage */ static DEFINE_MUTEX(swapon_mutex); static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); @@ -2532,7 +2533,8 @@ static void drain_mmlist(void) /* * Free all of a swapdev's extent information */ -static void destroy_swap_extents(struct swap_info_struct *sis) +static void destroy_swap_extents(struct swap_info_struct *sis, + struct file *swap_file) { while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { struct rb_node *rb = sis->swap_extent_root.rb_node; @@ -2543,7 +2545,6 @@ static void destroy_swap_extents(struct swap_info_struct *sis) } if (sis->flags & SWP_ACTIVATED) { - struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; sis->flags &= ~SWP_ACTIVATED; @@ -2626,9 +2627,9 @@ EXPORT_SYMBOL_GPL(add_swap_extent); * Typically it is in the 1-4 megabyte range. So we can have hundreds of * extents in the rbtree. - akpm. 
*/ -static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) +static int setup_swap_extents(struct swap_info_struct *sis, + struct file *swap_file, sector_t *span) { - struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; int ret; @@ -2646,7 +2647,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) sis->flags |= SWP_ACTIVATED; if ((sis->flags & SWP_FS_OPS) && sio_pool_init() != 0) { - destroy_swap_extents(sis); + destroy_swap_extents(sis, swap_file); return -ENOMEM; } return ret; @@ -2857,7 +2858,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) flush_work(&p->reclaim_work); flush_percpu_swap_cluster(p); - destroy_swap_extents(p); + destroy_swap_extents(p, p->swap_file); if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); @@ -2945,7 +2946,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) return SEQ_START_TOKEN; for (type = 0; (si = swap_type_to_info(type)); type++) { - if (!(si->flags & SWP_USED) || !si->swap_map) + if (!(si->swap_file)) continue; if (!--l) return si; @@ -2966,7 +2967,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) ++(*pos); for (; (si = swap_type_to_info(type)); type++) { - if (!(si->flags & SWP_USED) || !si->swap_map) + if (!(si->swap_file)) continue; return si; } @@ -3376,7 +3377,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; } - si->swap_file = swap_file; mapping = swap_file->f_mapping; dentry = swap_file->f_path.dentry; inode = mapping->host; @@ -3426,7 +3426,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) si->max = maxpages; si->pages = maxpages - 1; - nr_extents = setup_swap_extents(si, &span); + nr_extents = setup_swap_extents(si, swap_file, &span); if (nr_extents < 0) { error = nr_extents; goto bad_swap_unlock_inode; @@ -3535,6 +3535,8 @@ SYSCALL_DEFINE2(swapon, const char 
__user *, specialfile, int, swap_flags) prio = DEF_SWAP_PRIO; if (swap_flags & SWAP_FLAG_PREFER) prio = swap_flags & SWAP_FLAG_PRIO_MASK; + + si->swap_file = swap_file; enable_swap_info(si, prio, swap_map, cluster_info, zeromap); pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", @@ -3559,10 +3561,9 @@ bad_swap: kfree(si->global_cluster); si->global_cluster = NULL; inode = NULL; - destroy_swap_extents(si); + destroy_swap_extents(si, swap_file); swap_cgroup_swapoff(si->type); spin_lock(&swap_lock); - si->swap_file = NULL; si->flags = 0; spin_unlock(&swap_lock); vfree(swap_map); From 451c6326105b22a97d77669867be10370ff3b78e Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Feb 2026 04:06:27 +0800 Subject: [PATCH 043/369] mm, swap: clean up swapon process and locking Slightly clean up the swapon process. Add comments about what swap_lock protects, introduce and rename helpers that wrap swap_map and cluster_info setup, and do that setup outside of swap_lock. This lock protection is not needed for swap_map and cluster_info setup because all swap users must either hold the percpu ref or hold a stable allocated swap entry (e.g., locking a folio in the swap cache) before accessing. So before the swap device is exposed by enable_swap_info, nothing would use the swap device's map or cluster. So we are safe to allocate and set up swap data freely first, then expose the swap device and set the SWP_WRITEOK flag. 
Link: https://lkml.kernel.org/r/20260218-swap-table-p3-v3-2-f4e34be021a7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swapfile.c | 91 ++++++++++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 281ee2762358..a795cae82415 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -65,6 +65,13 @@ static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, enum swap_cluster_flags new_flags); +/* + * Protects the swap_info array, and the SWP_USED flag. swap_info contains + * lazily allocated & freed swap device info structs, and SWP_USED indicates + * which device is used; ~SWP_USED devices can be reused. + * + * Also protects swap_active_head, total_swap_pages, and the SWP_WRITEOK flag. 
+ */ static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; atomic_long_t nr_swap_pages; @@ -2657,8 +2664,6 @@ static int setup_swap_extents(struct swap_info_struct *sis, } static void setup_swap_info(struct swap_info_struct *si, int prio, - unsigned char *swap_map, - struct swap_cluster_info *cluster_info, unsigned long *zeromap) { si->prio = prio; @@ -2668,8 +2673,6 @@ static void setup_swap_info(struct swap_info_struct *si, int prio, */ si->list.prio = -si->prio; si->avail_list.prio = -si->prio; - si->swap_map = swap_map; - si->cluster_info = cluster_info; si->zeromap = zeromap; } @@ -2687,13 +2690,11 @@ static void _enable_swap_info(struct swap_info_struct *si) } static void enable_swap_info(struct swap_info_struct *si, int prio, - unsigned char *swap_map, - struct swap_cluster_info *cluster_info, - unsigned long *zeromap) + unsigned long *zeromap) { spin_lock(&swap_lock); spin_lock(&si->lock); - setup_swap_info(si, prio, swap_map, cluster_info, zeromap); + setup_swap_info(si, prio, zeromap); spin_unlock(&si->lock); spin_unlock(&swap_lock); /* @@ -2711,7 +2712,7 @@ static void reinsert_swap_info(struct swap_info_struct *si) { spin_lock(&swap_lock); spin_lock(&si->lock); - setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap); + setup_swap_info(si, si->prio, si->zeromap); _enable_swap_info(si); spin_unlock(&si->lock); spin_unlock(&swap_lock); @@ -2735,8 +2736,8 @@ static void wait_for_allocation(struct swap_info_struct *si) } } -static void free_cluster_info(struct swap_cluster_info *cluster_info, - unsigned long maxpages) +static void free_swap_cluster_info(struct swap_cluster_info *cluster_info, + unsigned long maxpages) { struct swap_cluster_info *ci; int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); @@ -2889,7 +2890,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->global_cluster = NULL; vfree(swap_map); kvfree(zeromap); - free_cluster_info(cluster_info, maxpages); + 
free_swap_cluster_info(cluster_info, maxpages); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); @@ -3236,10 +3237,15 @@ static unsigned long read_swap_header(struct swap_info_struct *si, static int setup_swap_map(struct swap_info_struct *si, union swap_header *swap_header, - unsigned char *swap_map, unsigned long maxpages) { unsigned long i; + unsigned char *swap_map; + + swap_map = vzalloc(maxpages); + si->swap_map = swap_map; + if (!swap_map) + return -ENOMEM; swap_map[0] = SWAP_MAP_BAD; /* omit header page */ for (i = 0; i < swap_header->info.nr_badpages; i++) { @@ -3260,9 +3266,9 @@ static int setup_swap_map(struct swap_info_struct *si, return 0; } -static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, - union swap_header *swap_header, - unsigned long maxpages) +static int setup_swap_clusters_info(struct swap_info_struct *si, + union swap_header *swap_header, + unsigned long maxpages) { unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); struct swap_cluster_info *cluster_info; @@ -3331,10 +3337,11 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, } } - return cluster_info; + si->cluster_info = cluster_info; + return 0; err: - free_cluster_info(cluster_info, maxpages); - return ERR_PTR(err); + free_swap_cluster_info(cluster_info, maxpages); + return err; } SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) @@ -3349,9 +3356,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) int nr_extents; sector_t span; unsigned long maxpages; - unsigned char *swap_map = NULL; unsigned long *zeromap = NULL; - struct swap_cluster_info *cluster_info = NULL; struct folio *folio = NULL; struct inode *inode = NULL; bool inced_nr_rotate_swap = false; @@ -3362,6 +3367,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + /* + * Allocate or reuse existing !SWP_USED 
swap_info. The returned + * si will stay in a dying status, so nothing will access its content + * until enable_swap_info resurrects its percpu ref and expose it. + */ si = alloc_swap_info(); if (IS_ERR(si)) return PTR_ERR(si); @@ -3439,18 +3449,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) maxpages = si->max; - /* OK, set up the swap map and apply the bad block list */ - swap_map = vzalloc(maxpages); - if (!swap_map) { - error = -ENOMEM; - goto bad_swap_unlock_inode; - } - - error = swap_cgroup_swapon(si->type, maxpages); + /* Setup the swap map and apply bad block */ + error = setup_swap_map(si, swap_header, maxpages); if (error) goto bad_swap_unlock_inode; - error = setup_swap_map(si, swap_header, swap_map, maxpages); + /* Set up the swap cluster info */ + error = setup_swap_clusters_info(si, swap_header, maxpages); + if (error) + goto bad_swap_unlock_inode; + + error = swap_cgroup_swapon(si->type, maxpages); if (error) goto bad_swap_unlock_inode; @@ -3478,13 +3487,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) inced_nr_rotate_swap = true; } - cluster_info = setup_clusters(si, swap_header, maxpages); - if (IS_ERR(cluster_info)) { - error = PTR_ERR(cluster_info); - cluster_info = NULL; - goto bad_swap_unlock_inode; - } - if ((swap_flags & SWAP_FLAG_DISCARD) && si->bdev && bdev_max_discard_sectors(si->bdev)) { /* @@ -3537,7 +3539,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) prio = swap_flags & SWAP_FLAG_PRIO_MASK; si->swap_file = swap_file; - enable_swap_info(si, prio, swap_map, cluster_info, zeromap); + + /* Sets SWP_WRITEOK, resurrect the percpu ref, expose the swap device */ + enable_swap_info(si, prio, zeromap); pr_info("Adding %uk swap on %s. 
Priority:%d extents:%d across:%lluk %s%s%s%s\n", K(si->pages), name->name, si->prio, nr_extents, @@ -3563,13 +3567,18 @@ bad_swap: inode = NULL; destroy_swap_extents(si, swap_file); swap_cgroup_swapoff(si->type); + vfree(si->swap_map); + si->swap_map = NULL; + free_swap_cluster_info(si->cluster_info, si->max); + si->cluster_info = NULL; + /* + * Clear the SWP_USED flag after all resources are freed so + * alloc_swap_info can reuse this si safely. + */ spin_lock(&swap_lock); si->flags = 0; spin_unlock(&swap_lock); - vfree(swap_map); kvfree(zeromap); - if (cluster_info) - free_cluster_info(cluster_info, maxpages); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) From 50f8c41928fd82a8062474e4f22ac1a6be15ddea Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Feb 2026 04:06:28 +0800 Subject: [PATCH 044/369] mm, swap: remove redundant arguments and locking for enabling a device There is no need to repeatedly pass zero map and priority values. zeromap is similar to cluster info and swap_map, which are only used once the swap device is exposed. And the prio values are currently read only once set, and only used for the list insertion upon expose or swap info display. 
Link: https://lkml.kernel.org/r/20260218-swap-table-p3-v3-3-f4e34be021a7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swapfile.c | 48 ++++++++++++++++++------------------------------ 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index a795cae82415..650eae8545f1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2663,19 +2663,6 @@ static int setup_swap_extents(struct swap_info_struct *sis, return generic_swapfile_activate(sis, swap_file, span); } -static void setup_swap_info(struct swap_info_struct *si, int prio, - unsigned long *zeromap) -{ - si->prio = prio; - /* - * the plist prio is negated because plist ordering is - * low-to-high, while swap ordering is high-to-low - */ - si->list.prio = -si->prio; - si->avail_list.prio = -si->prio; - si->zeromap = zeromap; -} - static void _enable_swap_info(struct swap_info_struct *si) { atomic_long_add(si->pages, &nr_swap_pages); @@ -2689,17 +2676,12 @@ static void _enable_swap_info(struct swap_info_struct *si) add_to_avail_list(si, true); } -static void enable_swap_info(struct swap_info_struct *si, int prio, - unsigned long *zeromap) +/* + * Called after the swap device is ready, resurrect its percpu ref, it's now + * safe to reference it. Add it to the list to expose it to the allocator. + */ +static void enable_swap_info(struct swap_info_struct *si) { - spin_lock(&swap_lock); - spin_lock(&si->lock); - setup_swap_info(si, prio, zeromap); - spin_unlock(&si->lock); - spin_unlock(&swap_lock); - /* - * Finished initializing swap device, now it's safe to reference it. 
- */ percpu_ref_resurrect(&si->users); spin_lock(&swap_lock); spin_lock(&si->lock); @@ -2712,7 +2694,6 @@ static void reinsert_swap_info(struct swap_info_struct *si) { spin_lock(&swap_lock); spin_lock(&si->lock); - setup_swap_info(si, si->prio, si->zeromap); _enable_swap_info(si); spin_unlock(&si->lock); spin_unlock(&swap_lock); @@ -3356,7 +3337,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) int nr_extents; sector_t span; unsigned long maxpages; - unsigned long *zeromap = NULL; struct folio *folio = NULL; struct inode *inode = NULL; bool inced_nr_rotate_swap = false; @@ -3467,9 +3447,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might * be above MAX_PAGE_ORDER incase of a large swap file. */ - zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long), - GFP_KERNEL | __GFP_ZERO); - if (!zeromap) { + si->zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long), + GFP_KERNEL | __GFP_ZERO); + if (!si->zeromap) { error = -ENOMEM; goto bad_swap_unlock_inode; } @@ -3538,10 +3518,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (swap_flags & SWAP_FLAG_PREFER) prio = swap_flags & SWAP_FLAG_PRIO_MASK; + /* + * The plist prio is negated because plist ordering is + * low-to-high, while swap ordering is high-to-low + */ + si->prio = prio; + si->list.prio = -si->prio; + si->avail_list.prio = -si->prio; si->swap_file = swap_file; /* Sets SWP_WRITEOK, resurrect the percpu ref, expose the swap device */ - enable_swap_info(si, prio, zeromap); + enable_swap_info(si); pr_info("Adding %uk swap on %s. 
Priority:%d extents:%d across:%lluk %s%s%s%s\n", K(si->pages), name->name, si->prio, nr_extents, @@ -3571,6 +3558,8 @@ bad_swap: si->swap_map = NULL; free_swap_cluster_info(si->cluster_info, si->max); si->cluster_info = NULL; + kvfree(si->zeromap); + si->zeromap = NULL; /* * Clear the SWP_USED flag after all resources are freed so * alloc_swap_info can reuse this si safely. @@ -3578,7 +3567,6 @@ bad_swap: spin_lock(&swap_lock); si->flags = 0; spin_unlock(&swap_lock); - kvfree(zeromap); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) From 0c7e6014b725cc9835b013d92b573a4d06a87f1f Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Feb 2026 04:06:29 +0800 Subject: [PATCH 045/369] mm, swap: consolidate bad slots setup and make it more robust In preparation for using the swap table to track bad slots directly, move the bad slot setup to one place, set up the swap_map mark, and cluster counter update together. While at it, provide more informative logs and a more robust fallback if any bad slot info looks incorrect. Fixes a potential issue that a malformed swap file may cause the cluster to be unusable upon swapon, and provides a more verbose warning on a malformed swap file Link: https://lkml.kernel.org/r/20260218-swap-table-p3-v3-4-f4e34be021a7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swapfile.c | 68 ++++++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 650eae8545f1..f195e90c6e8b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -743,13 +743,37 @@ static void relocate_cluster(struct swap_info_struct *si, * slot. The cluster will not be added to the free cluster list, and its * usage counter will be increased by 1. 
Only used for initialization. */ -static int swap_cluster_setup_bad_slot(struct swap_cluster_info *cluster_info, - unsigned long offset) +static int swap_cluster_setup_bad_slot(struct swap_info_struct *si, + struct swap_cluster_info *cluster_info, + unsigned int offset, bool mask) { unsigned long idx = offset / SWAPFILE_CLUSTER; struct swap_table *table; struct swap_cluster_info *ci; + /* si->max may got shrunk by swap swap_activate() */ + if (offset >= si->max && !mask) { + pr_debug("Ignoring bad slot %u (max: %u)\n", offset, si->max); + return 0; + } + /* + * Account it, skip header slot: si->pages is initiated as + * si->max - 1. Also skip the masking of last cluster, + * si->pages doesn't include that part. + */ + if (offset && !mask) + si->pages -= 1; + if (!si->pages) { + pr_warn("Empty swap-file\n"); + return -EINVAL; + } + /* Check for duplicated bad swap slots. */ + if (si->swap_map[offset]) { + pr_warn("Duplicated bad slot offset %d\n", offset); + return -EINVAL; + } + + si->swap_map[offset] = SWAP_MAP_BAD; ci = cluster_info + idx; if (!ci->table) { table = swap_table_alloc(GFP_KERNEL); @@ -3220,30 +3244,12 @@ static int setup_swap_map(struct swap_info_struct *si, union swap_header *swap_header, unsigned long maxpages) { - unsigned long i; unsigned char *swap_map; swap_map = vzalloc(maxpages); si->swap_map = swap_map; if (!swap_map) return -ENOMEM; - - swap_map[0] = SWAP_MAP_BAD; /* omit header page */ - for (i = 0; i < swap_header->info.nr_badpages; i++) { - unsigned int page_nr = swap_header->info.badpages[i]; - if (page_nr == 0 || page_nr > swap_header->info.last_page) - return -EINVAL; - if (page_nr < maxpages) { - swap_map[page_nr] = SWAP_MAP_BAD; - si->pages--; - } - } - - if (!si->pages) { - pr_warn("Empty swap-file\n"); - return -EINVAL; - } - return 0; } @@ -3273,26 +3279,28 @@ static int setup_swap_clusters_info(struct swap_info_struct *si, } /* - * Mark unusable pages as unavailable. 
The clusters aren't - * marked free yet, so no list operations are involved yet. - * - * See setup_swap_map(): header page, bad pages, - * and the EOF part of the last cluster. + * Mark unusable pages (header page, bad pages, and the EOF part of + * the last cluster) as unavailable. The clusters aren't marked free + * yet, so no list operations are involved yet. */ - err = swap_cluster_setup_bad_slot(cluster_info, 0); + err = swap_cluster_setup_bad_slot(si, cluster_info, 0, false); if (err) goto err; for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; - if (page_nr >= maxpages) - continue; - err = swap_cluster_setup_bad_slot(cluster_info, page_nr); + if (!page_nr || page_nr > swap_header->info.last_page) { + pr_warn("Bad slot offset is out of border: %d (last_page: %d)\n", + page_nr, swap_header->info.last_page); + err = -EINVAL; + goto err; + } + err = swap_cluster_setup_bad_slot(si, cluster_info, page_nr, false); if (err) goto err; } for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) { - err = swap_cluster_setup_bad_slot(cluster_info, i); + err = swap_cluster_setup_bad_slot(si, cluster_info, i, true); if (err) goto err; } From f3d652b0604375f66f631c467f8e8e2b1c16df78 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Feb 2026 04:06:30 +0800 Subject: [PATCH 046/369] mm/workingset: leave highest bits empty for anon shadow Swap table entry will need 4 bits reserved for swap count in the shadow, so the anon shadow should have its leading 4 bits remain 0. This should be OK for the foreseeable future. Take 52 bits of physical address space as an example: for 4K pages, there would be at most 40 bits for addressable pages. Currently, we have 36 bits available (64 - 1 - 16 - 10 - 1, where XA_VALUE takes 1 bit for marker, MEM_CGROUP_ID_SHIFT takes 16 bits, NODES_SHIFT takes <=10 bits, WORKINGSET flags takes 1 bit). 
So in the worst case, we previously need to pack the 40 bits of address in 36 bits fields using a 64K bucket (bucket_order = 4). After this, the bucket will be increased to 1M. Which should be fine, as on such large machines, the working set size will be way larger than the bucket size. And for MGLRU's gen number tracking, it should be even more than enough, MGLRU's gen number (max_seq) increment is much slower compared to the eviction counter (nonresident_age). And after all, either the refault distance or the gen distance is only a hint that can tolerate inaccuracy just fine. And the 4 bits can be shrunk to 3, or extended to a higher value if needed later. Link: https://lkml.kernel.org/r/20260218-swap-table-p3-v3-5-f4e34be021a7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swap_table.h | 4 ++++ mm/workingset.c | 49 ++++++++++++++++++++++++++++++------------------- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/mm/swap_table.h b/mm/swap_table.h index ea244a57a5b7..10e11d1f3b04 100644 --- a/mm/swap_table.h +++ b/mm/swap_table.h @@ -12,6 +12,7 @@ struct swap_table { }; #define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE) +#define SWP_TB_COUNT_BITS 4 /* * A swap table entry represents the status of a swap slot on a swap @@ -22,6 +23,9 @@ struct swap_table { * (shadow), or NULL. */ +/* Macro for shadow offset calculation */ +#define SWAP_COUNT_SHIFT SWP_TB_COUNT_BITS + /* * Helpers for casting one type of info into a swap table entry. 
*/ diff --git a/mm/workingset.c b/mm/workingset.c index 13422d304715..37a94979900f 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -16,6 +16,7 @@ #include #include #include +#include "swap_table.h" #include "internal.h" /* @@ -184,7 +185,9 @@ #define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ WORKINGSET_SHIFT + NODES_SHIFT + \ MEM_CGROUP_ID_SHIFT) +#define EVICTION_SHIFT_ANON (EVICTION_SHIFT + SWAP_COUNT_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) +#define EVICTION_MASK_ANON (~0UL >> EVICTION_SHIFT_ANON) /* * Eviction timestamps need to be able to cover the full range of @@ -194,12 +197,12 @@ * that case, we have to sacrifice granularity for distance, and group * evictions into coarser buckets by shaving off lower timestamp bits. */ -static unsigned int bucket_order __read_mostly; +static unsigned int bucket_order[ANON_AND_FILE] __read_mostly; static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, - bool workingset) + bool workingset, bool file) { - eviction &= EVICTION_MASK; + eviction &= file ? 
EVICTION_MASK : EVICTION_MASK_ANON; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; eviction = (eviction << WORKINGSET_SHIFT) | workingset; @@ -244,7 +247,8 @@ static void *lru_gen_eviction(struct folio *folio) struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); - BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); + BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > + BITS_PER_LONG - max(EVICTION_SHIFT, EVICTION_SHIFT_ANON)); lruvec = mem_cgroup_lruvec(memcg, pgdat); lrugen = &lruvec->lrugen; @@ -254,7 +258,7 @@ static void *lru_gen_eviction(struct folio *folio) hist = lru_hist_from_seq(min_seq); atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); - return pack_shadow(mem_cgroup_private_id(memcg), pgdat, token, workingset); + return pack_shadow(mem_cgroup_private_id(memcg), pgdat, token, workingset, type); } /* @@ -262,7 +266,7 @@ static void *lru_gen_eviction(struct folio *folio) * Fills in @lruvec, @token, @workingset with the values unpacked from shadow. */ static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, - unsigned long *token, bool *workingset) + unsigned long *token, bool *workingset, bool file) { int memcg_id; unsigned long max_seq; @@ -275,7 +279,7 @@ static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, *lruvec = mem_cgroup_lruvec(memcg, pgdat); max_seq = READ_ONCE((*lruvec)->lrugen.max_seq); - max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH; + max_seq &= (file ? 
EVICTION_MASK : EVICTION_MASK_ANON) >> LRU_REFS_WIDTH; return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS; } @@ -293,7 +297,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow) rcu_read_lock(); - recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset); + recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset, type); if (lruvec != folio_lruvec(folio)) goto unlock; @@ -331,7 +335,7 @@ static void *lru_gen_eviction(struct folio *folio) } static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, - unsigned long *token, bool *workingset) + unsigned long *token, bool *workingset, bool file) { return false; } @@ -381,6 +385,7 @@ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) { struct pglist_data *pgdat = folio_pgdat(folio); + int file = folio_is_file_lru(folio); unsigned long eviction; struct lruvec *lruvec; int memcgid; @@ -397,10 +402,10 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_private_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); - eviction >>= bucket_order; + eviction >>= bucket_order[file]; workingset_age_nonresident(lruvec, folio_nr_pages(folio)); return pack_shadow(memcgid, pgdat, eviction, - folio_test_workingset(folio)); + folio_test_workingset(folio), file); } /** @@ -431,14 +436,15 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, bool recent; rcu_read_lock(); - recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset); + recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, + workingset, file); rcu_read_unlock(); return recent; } rcu_read_lock(); unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); - eviction <<= bucket_order; + eviction <<= bucket_order[file]; /* * Look 
up the memcg associated with the stored ID. It might @@ -495,7 +501,8 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, * longest time, so the occasional inappropriate activation * leading to pressure on the active list is not a problem. */ - refault_distance = (refault - eviction) & EVICTION_MASK; + refault_distance = ((refault - eviction) & + (file ? EVICTION_MASK : EVICTION_MASK_ANON)); /* * Compare the distance to the existing workingset size. We @@ -780,8 +787,8 @@ static struct lock_class_key shadow_nodes_key; static int __init workingset_init(void) { + unsigned int timestamp_bits, timestamp_bits_anon; struct shrinker *workingset_shadow_shrinker; - unsigned int timestamp_bits; unsigned int max_order; int ret = -ENOMEM; @@ -794,11 +801,15 @@ static int __init workingset_init(void) * double the initial memory by using totalram_pages as-is. */ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; + timestamp_bits_anon = BITS_PER_LONG - EVICTION_SHIFT_ANON; max_order = fls_long(totalram_pages() - 1); - if (max_order > timestamp_bits) - bucket_order = max_order - timestamp_bits; - pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", - timestamp_bits, max_order, bucket_order); + if (max_order > (BITS_PER_LONG - EVICTION_SHIFT)) + bucket_order[WORKINGSET_FILE] = max_order - timestamp_bits; + if (max_order > timestamp_bits_anon) + bucket_order[WORKINGSET_ANON] = max_order - timestamp_bits_anon; + pr_info("workingset: timestamp_bits=%d (anon: %d) max_order=%d bucket_order=%u (anon: %d)\n", + timestamp_bits, timestamp_bits_anon, max_order, + bucket_order[WORKINGSET_FILE], bucket_order[WORKINGSET_ANON]); workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, From 62629ae49baa70362125a02a488d885b3c17eab7 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Feb 2026 04:06:31 +0800 Subject: [PATCH 047/369] mm, swap: implement helpers for reserving data in the swap table To prepare for using the swap 
table as the unified swap layer, introduce macros and helpers for storing multiple kinds of data in a swap table entry. From now on, we are storing PFN in the swap table to make space for extra counting bits (SWAP_COUNT). Shadows are still stored as they are, as the SWAP_COUNT is not used yet. Also, rename shadow_swp_to_tb to shadow_to_swp_tb. That's a spelling error, not really worth a separate fix. No behaviour change yet, just prepare the API. Link: https://lkml.kernel.org/r/20260218-swap-table-p3-v3-6-f4e34be021a7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swap_state.c | 6 +-- mm/swap_table.h | 133 ++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 125 insertions(+), 14 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 6d0eef7470be..e213ee35c1d2 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -148,7 +148,7 @@ void __swap_cache_add_folio(struct swap_cluster_info *ci, VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); - new_tb = folio_to_swp_tb(folio); + new_tb = folio_to_swp_tb(folio, 0); ci_start = swp_cluster_offset(entry); ci_off = ci_start; ci_end = ci_start + nr_pages; @@ -249,7 +249,7 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); si = __swap_entry_to_info(entry); - new_tb = shadow_swp_to_tb(shadow); + new_tb = shadow_to_swp_tb(shadow, 0); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; ci_off = ci_start; @@ -331,7 +331,7 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci, VM_WARN_ON_ONCE(!entry.val); /* Swap cache still stores N entries instead of a high-order entry */ - new_tb = folio_to_swp_tb(new); + new_tb = 
folio_to_swp_tb(new, 0); do { old_tb = __swap_table_xchg(ci, ci_off, new_tb); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old); diff --git a/mm/swap_table.h b/mm/swap_table.h index 10e11d1f3b04..10762ac5f4f5 100644 --- a/mm/swap_table.h +++ b/mm/swap_table.h @@ -12,17 +12,72 @@ struct swap_table { }; #define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE) -#define SWP_TB_COUNT_BITS 4 /* * A swap table entry represents the status of a swap slot on a swap * (physical or virtual) device. The swap table in each cluster is a * 1:1 map of the swap slots in this cluster. * - * Each swap table entry could be a pointer (folio), a XA_VALUE - * (shadow), or NULL. + * Swap table entry type and bits layouts: + * + * NULL: |---------------- 0 ---------------| - Free slot + * Shadow: | SWAP_COUNT |---- SHADOW_VAL ---|1| - Swapped out slot + * PFN: | SWAP_COUNT |------ PFN -------|10| - Cached slot + * Pointer: |----------- Pointer ----------|100| - (Unused) + * Bad: |------------- 1 -------------|1000| - Bad slot + * + * SWAP_COUNT is `SWP_TB_COUNT_BITS` long, each entry is an atomic long. + * + * Usages: + * + * - NULL: Swap slot is unused, could be allocated. + * + * - Shadow: Swap slot is used and not cached (usually swapped out). It reuses + * the XA_VALUE format to be compatible with working set shadows. SHADOW_VAL + * part might be all 0 if the working shadow info is absent. In such a case, + * we still want to keep the shadow format as a placeholder. + * + * Memcg ID is embedded in SHADOW_VAL. + * + * - PFN: Swap slot is in use, and cached. Memcg info is recorded on the page + * struct. + * + * - Pointer: Unused yet. `0b100` is reserved for potential pointer usage + * because only the lower three bits can be used as a marker for 8 bytes + * aligned pointers. + * + * - Bad: Swap slot is reserved, protects swap header or holes on swap devices. 
*/ +#if defined(MAX_POSSIBLE_PHYSMEM_BITS) +#define SWAP_CACHE_PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) +#elif defined(MAX_PHYSMEM_BITS) +#define SWAP_CACHE_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) +#else +#define SWAP_CACHE_PFN_BITS (BITS_PER_LONG - PAGE_SHIFT) +#endif + +/* NULL Entry, all 0 */ +#define SWP_TB_NULL 0UL + +/* Swapped out: shadow */ +#define SWP_TB_SHADOW_MARK 0b1UL + +/* Cached: PFN */ +#define SWP_TB_PFN_BITS (SWAP_CACHE_PFN_BITS + SWP_TB_PFN_MARK_BITS) +#define SWP_TB_PFN_MARK 0b10UL +#define SWP_TB_PFN_MARK_BITS 2 +#define SWP_TB_PFN_MARK_MASK (BIT(SWP_TB_PFN_MARK_BITS) - 1) + +/* SWAP_COUNT part for PFN or shadow, the width can be shrunk or extended */ +#define SWP_TB_COUNT_BITS min(4, BITS_PER_LONG - SWP_TB_PFN_BITS) +#define SWP_TB_COUNT_MASK (~((~0UL) >> SWP_TB_COUNT_BITS)) +#define SWP_TB_COUNT_SHIFT (BITS_PER_LONG - SWP_TB_COUNT_BITS) +#define SWP_TB_COUNT_MAX ((1 << SWP_TB_COUNT_BITS) - 1) + +/* Bad slot: ends with 0b1000 and rests of bits are all 1 */ +#define SWP_TB_BAD ((~0UL) << 3) + /* Macro for shadow offset calculation */ #define SWAP_COUNT_SHIFT SWP_TB_COUNT_BITS @@ -35,18 +90,47 @@ static inline unsigned long null_to_swp_tb(void) return 0; } -static inline unsigned long folio_to_swp_tb(struct folio *folio) +static inline unsigned long __count_to_swp_tb(unsigned char count) { - BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *)); - return (unsigned long)folio; + /* + * At least three values are needed to distinguish free (0), + * used (count > 0 && count < SWP_TB_COUNT_MAX), and + * overflow (count == SWP_TB_COUNT_MAX). 
+ */ + BUILD_BUG_ON(SWP_TB_COUNT_MAX < 2 || SWP_TB_COUNT_BITS < 2); + VM_WARN_ON(count > SWP_TB_COUNT_MAX); + return ((unsigned long)count) << SWP_TB_COUNT_SHIFT; } -static inline unsigned long shadow_swp_to_tb(void *shadow) +static inline unsigned long pfn_to_swp_tb(unsigned long pfn, unsigned int count) +{ + unsigned long swp_tb; + + BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *)); + BUILD_BUG_ON(SWAP_CACHE_PFN_BITS > + (BITS_PER_LONG - SWP_TB_PFN_MARK_BITS - SWP_TB_COUNT_BITS)); + + swp_tb = (pfn << SWP_TB_PFN_MARK_BITS) | SWP_TB_PFN_MARK; + VM_WARN_ON_ONCE(swp_tb & SWP_TB_COUNT_MASK); + + return swp_tb | __count_to_swp_tb(count); +} + +static inline unsigned long folio_to_swp_tb(struct folio *folio, unsigned int count) +{ + return pfn_to_swp_tb(folio_pfn(folio), count); +} + +static inline unsigned long shadow_to_swp_tb(void *shadow, unsigned int count) { BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) != BITS_PER_BYTE * sizeof(unsigned long)); + BUILD_BUG_ON((unsigned long)xa_mk_value(0) != SWP_TB_SHADOW_MARK); + VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow)); - return (unsigned long)shadow; + VM_WARN_ON_ONCE(shadow && ((unsigned long)shadow & SWP_TB_COUNT_MASK)); + + return (unsigned long)shadow | __count_to_swp_tb(count) | SWP_TB_SHADOW_MARK; } /* @@ -59,7 +143,7 @@ static inline bool swp_tb_is_null(unsigned long swp_tb) static inline bool swp_tb_is_folio(unsigned long swp_tb) { - return !xa_is_value((void *)swp_tb) && !swp_tb_is_null(swp_tb); + return ((swp_tb & SWP_TB_PFN_MARK_MASK) == SWP_TB_PFN_MARK); } static inline bool swp_tb_is_shadow(unsigned long swp_tb) @@ -67,19 +151,44 @@ static inline bool swp_tb_is_shadow(unsigned long swp_tb) return xa_is_value((void *)swp_tb); } +static inline bool swp_tb_is_bad(unsigned long swp_tb) +{ + return swp_tb == SWP_TB_BAD; +} + +static inline bool swp_tb_is_countable(unsigned long swp_tb) +{ + return (swp_tb_is_shadow(swp_tb) || swp_tb_is_folio(swp_tb) || + swp_tb_is_null(swp_tb)); +} + /* * Helpers for retrieving 
info from swap table. */ static inline struct folio *swp_tb_to_folio(unsigned long swp_tb) { VM_WARN_ON(!swp_tb_is_folio(swp_tb)); - return (void *)swp_tb; + return pfn_folio((swp_tb & ~SWP_TB_COUNT_MASK) >> SWP_TB_PFN_MARK_BITS); } static inline void *swp_tb_to_shadow(unsigned long swp_tb) { VM_WARN_ON(!swp_tb_is_shadow(swp_tb)); - return (void *)swp_tb; + /* No shift needed, xa_value is stored as it is in the lower bits. */ + return (void *)(swp_tb & ~SWP_TB_COUNT_MASK); +} + +static inline unsigned char __swp_tb_get_count(unsigned long swp_tb) +{ + VM_WARN_ON(!swp_tb_is_countable(swp_tb)); + return ((swp_tb & SWP_TB_COUNT_MASK) >> SWP_TB_COUNT_SHIFT); +} + +static inline int swp_tb_get_count(unsigned long swp_tb) +{ + if (swp_tb_is_countable(swp_tb)) + return __swp_tb_get_count(swp_tb); + return -EINVAL; } /* @@ -124,6 +233,8 @@ static inline unsigned long swap_table_get(struct swap_cluster_info *ci, atomic_long_t *table; unsigned long swp_tb; + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); + rcu_read_lock(); table = rcu_dereference(ci->table); swp_tb = table ? atomic_long_read(&table[off]) : null_to_swp_tb(); From 1307442b935bcac127dafb7fc54545e7e70aed96 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Feb 2026 04:06:32 +0800 Subject: [PATCH 048/369] mm, swap: mark bad slots in swap table directly In preparing the deprecating swap_map, mark bad slots in the swap table too when setting SWAP_MAP_BAD in swap_map. Also, refine the swap table sanity check on freeing to adapt to the bad slots change. For swapoff, the bad slots count must match the cluster usage count, as nothing should touch them, and they contribute to the cluster usage count on swapon. For ordinary swap table freeing, the swap table of clusters with bad slots should never be freed since the cluster usage count never reaches zero. 
Link: https://lkml.kernel.org/r/20260218-swap-table-p3-v3-7-f4e34be021a7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swapfile.c | 56 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index f195e90c6e8b..4ab970d97959 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -454,16 +454,37 @@ static void swap_table_free(struct swap_table *table) swap_table_free_folio_rcu_cb); } +/* + * Sanity check to ensure nothing leaked, and the specified range is empty. + * One special case is that bad slots can't be freed, so check the number of + * bad slots for swapoff, and non-swapoff path must never free bad slots. + */ +static void swap_cluster_assert_empty(struct swap_cluster_info *ci, bool swapoff) +{ + unsigned int ci_off = 0, ci_end = SWAPFILE_CLUSTER; + unsigned long swp_tb; + int bad_slots = 0; + + if (!IS_ENABLED(CONFIG_DEBUG_VM) && !swapoff) + return; + + do { + swp_tb = __swap_table_get(ci, ci_off); + if (swp_tb_is_bad(swp_tb)) + bad_slots++; + else + WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); + } while (++ci_off < ci_end); + + WARN_ON_ONCE(bad_slots != (swapoff ? 
ci->count : 0)); +} + static void swap_cluster_free_table(struct swap_cluster_info *ci) { - unsigned int ci_off; struct swap_table *table; /* Only empty cluster's table is allow to be freed */ lockdep_assert_held(&ci->lock); - VM_WARN_ON_ONCE(!cluster_is_empty(ci)); - for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) - VM_WARN_ON_ONCE(!swp_tb_is_null(__swap_table_get(ci, ci_off))); table = (void *)rcu_dereference_protected(ci->table, true); rcu_assign_pointer(ci->table, NULL); @@ -567,6 +588,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { + swap_cluster_assert_empty(ci, false); swap_cluster_free_table(ci); move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); ci->order = 0; @@ -747,9 +769,11 @@ static int swap_cluster_setup_bad_slot(struct swap_info_struct *si, struct swap_cluster_info *cluster_info, unsigned int offset, bool mask) { + unsigned int ci_off = offset % SWAPFILE_CLUSTER; unsigned long idx = offset / SWAPFILE_CLUSTER; - struct swap_table *table; struct swap_cluster_info *ci; + struct swap_table *table; + int ret = 0; /* si->max may got shrunk by swap swap_activate() */ if (offset >= si->max && !mask) { @@ -767,13 +791,7 @@ static int swap_cluster_setup_bad_slot(struct swap_info_struct *si, pr_warn("Empty swap-file\n"); return -EINVAL; } - /* Check for duplicated bad swap slots. */ - if (si->swap_map[offset]) { - pr_warn("Duplicated bad slot offset %d\n", offset); - return -EINVAL; - } - si->swap_map[offset] = SWAP_MAP_BAD; ci = cluster_info + idx; if (!ci->table) { table = swap_table_alloc(GFP_KERNEL); @@ -781,13 +799,21 @@ static int swap_cluster_setup_bad_slot(struct swap_info_struct *si, return -ENOMEM; rcu_assign_pointer(ci->table, table); } - - ci->count++; + spin_lock(&ci->lock); + /* Check for duplicated bad swap slots. 
*/ + if (__swap_table_xchg(ci, ci_off, SWP_TB_BAD) != SWP_TB_NULL) { + pr_warn("Duplicated bad slot offset %d\n", offset); + ret = -EINVAL; + } else { + si->swap_map[offset] = SWAP_MAP_BAD; + ci->count++; + } + spin_unlock(&ci->lock); WARN_ON(ci->count > SWAPFILE_CLUSTER); WARN_ON(ci->flags); - return 0; + return ret; } /* @@ -2754,7 +2780,7 @@ static void free_swap_cluster_info(struct swap_cluster_info *cluster_info, /* Cluster with bad marks count will have a remaining table */ spin_lock(&ci->lock); if (rcu_dereference_protected(ci->table, true)) { - ci->count = 0; + swap_cluster_assert_empty(ci, true); swap_cluster_free_table(ci); } spin_unlock(&ci->lock); From 5dc533f7aa504d85d63b6f2e83ff21f411ea04b8 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Feb 2026 04:06:33 +0800 Subject: [PATCH 049/369] mm, swap: simplify swap table sanity range check The newly introduced helper, which checks bad slots and emptiness of a cluster, can cover the older sanity check just fine, with a more rigorous condition check. So merge them. Link: https://lkml.kernel.org/r/20260218-swap-table-p3-v3-8-f4e34be021a7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swapfile.c | 35 +++++++++-------------------------- 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 4ab970d97959..54a19ebce540 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -459,9 +459,11 @@ static void swap_table_free(struct swap_table *table) * One special case is that bad slots can't be freed, so check the number of * bad slots for swapoff, and non-swapoff path must never free bad slots. 
*/ -static void swap_cluster_assert_empty(struct swap_cluster_info *ci, bool swapoff) +static void swap_cluster_assert_empty(struct swap_cluster_info *ci, + unsigned int ci_off, unsigned int nr, + bool swapoff) { - unsigned int ci_off = 0, ci_end = SWAPFILE_CLUSTER; + unsigned int ci_end = ci_off + nr; unsigned long swp_tb; int bad_slots = 0; @@ -588,7 +590,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { - swap_cluster_assert_empty(ci, false); + swap_cluster_assert_empty(ci, 0, SWAPFILE_CLUSTER, false); swap_cluster_free_table(ci); move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); ci->order = 0; @@ -898,26 +900,6 @@ static bool cluster_scan_range(struct swap_info_struct *si, return true; } -/* - * Currently, the swap table is not used for count tracking, just - * do a sanity check here to ensure nothing leaked, so the swap - * table should be empty upon freeing. 
- */ -static void swap_cluster_assert_table_empty(struct swap_cluster_info *ci, - unsigned int start, unsigned int nr) -{ - unsigned int ci_off = start % SWAPFILE_CLUSTER; - unsigned int ci_end = ci_off + nr; - unsigned long swp_tb; - - if (IS_ENABLED(CONFIG_DEBUG_VM)) { - do { - swp_tb = __swap_table_get(ci, ci_off); - VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); - } while (++ci_off < ci_end); - } -} - static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, struct folio *folio, @@ -943,13 +925,14 @@ static bool cluster_alloc_range(struct swap_info_struct *si, if (likely(folio)) { order = folio_order(folio); nr_pages = 1 << order; + swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, nr_pages, false); __swap_cache_add_folio(ci, folio, swp_entry(si->type, offset)); } else if (IS_ENABLED(CONFIG_HIBERNATION)) { order = 0; nr_pages = 1; WARN_ON_ONCE(si->swap_map[offset]); si->swap_map[offset] = 1; - swap_cluster_assert_table_empty(ci, offset, 1); + swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, 1, false); } else { /* Allocation without folio is only possible with hibernation */ WARN_ON_ONCE(1); @@ -1768,7 +1751,7 @@ void swap_entries_free(struct swap_info_struct *si, mem_cgroup_uncharge_swap(entry, nr_pages); swap_range_free(si, offset, nr_pages); - swap_cluster_assert_table_empty(ci, offset, nr_pages); + swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, nr_pages, false); if (!ci->count) free_cluster(si, ci); @@ -2780,7 +2763,7 @@ static void free_swap_cluster_info(struct swap_cluster_info *cluster_info, /* Cluster with bad marks count will have a remaining table */ spin_lock(&ci->lock); if (rcu_dereference_protected(ci->table, true)) { - swap_cluster_assert_empty(ci, true); + swap_cluster_assert_empty(ci, 0, SWAPFILE_CLUSTER, true); swap_cluster_free_table(ci); } spin_unlock(&ci->lock); From 0d6af9bcf383bcdf601e670bb605861b01e318e7 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Feb 2026 04:06:34 +0800 
Subject: [PATCH 050/369] mm, swap: use the swap table to track the swap count Now all the infrastructures are ready, switch to using the swap table only. This is unfortunately a large patch because the whole old counting mechanism, especially SWP_CONTINUED, has to be gone and switch to the new mechanism together, with no intermediate steps available. The swap table is capable of holding up to SWP_TB_COUNT_MAX - 1 counts in the higher bits of each table entry, so using that, the swap_map can be completely dropped. swap_map also had a limit of SWAP_CONT_MAX. Any value beyond that limit will require a COUNT_CONTINUED page. COUNT_CONTINUED is a bit complex to maintain, so for the swap table, a simpler approach is used: when the count goes beyond SWP_TB_COUNT_MAX - 1, the cluster will have an extend_table allocated, which is a swap cluster-sized array of unsigned int. The counting is basically offloaded there until the count drops below SWP_TB_COUNT_MAX again. Both the swap table and the extend table are cluster-based, so they exhibit good performance and sparsity. To make the switch from swap_map to swap table clean, this commit cleans up and introduces a new set of functions based on the swap table design, for manipulating swap counts: - __swap_cluster_dup_entry, __swap_cluster_put_entry, __swap_cluster_alloc_entry, __swap_cluster_free_entry: Increase/decrease the count of a swap slot, or alloc / free a swap slot. This is the internal routine that does the counting work based on the swap table and handles all the complexities. The caller will need to lock the cluster before calling them. All swap count-related update operations are wrapped by these four helpers. - swap_dup_entries_cluster, swap_put_entries_cluster: Increase/decrease the swap count of one or a set of swap slots in the same cluster range. These two helpers serve as the common routines for folio_dup_swap & swap_dup_entry_direct, or folio_put_swap & swap_put_entries_direct. 
And use these helpers to replace all existing callers. This helps to simplify the count tracking by a lot, and the swap_map is gone. [ryncsn@gmail.com: fix build] Link: https://lkml.kernel.org/r/aZWuLZi-vYi3vAWe@KASONG-MC4 Link: https://lkml.kernel.org/r/20260218-swap-table-p3-v3-9-f4e34be021a7@tencent.com Signed-off-by: Kairui Song Suggested-by: Chris Li Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Nhat Pham Signed-off-by: Andrew Morton --- include/linux/swap.h | 28 +- mm/memory.c | 2 +- mm/swap.h | 14 +- mm/swap_state.c | 53 ++- mm/swap_table.h | 5 + mm/swapfile.c | 794 ++++++++++++++++--------------------------- 6 files changed, 336 insertions(+), 560 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 62fc7499b408..0effe3cc50f5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -208,7 +208,6 @@ enum { SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ - SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */ @@ -223,16 +222,6 @@ enum { #define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10) #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX -/* Bit flag in swap_map */ -#define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */ - -/* Special value in first swap_map */ -#define SWAP_MAP_MAX 0x3e /* Max count */ -#define SWAP_MAP_BAD 0x3f /* Note page is bad */ - -/* Special value in each swap_map continuation */ -#define SWAP_CONT_MAX 0x7f /* Max count */ - /* * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. 
This also prevents the @@ -264,8 +253,7 @@ struct swap_info_struct { signed short prio; /* swap priority of this type */ struct plist_node list; /* entry in swap_active_head */ signed char type; /* strange name for an index */ - unsigned int max; /* extent of the swap_map */ - unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + unsigned int max; /* size of this swap device */ unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ struct list_head free_clusters; /* free clusters list */ @@ -284,18 +272,14 @@ struct swap_info_struct { struct completion comp; /* seldom referenced */ spinlock_t lock; /* * protect map scan related fields like - * swap_map, inuse_pages and all cluster - * lists. other fields are only changed + * inuse_pages and all cluster lists. + * Other fields are only changed * at swapon/swapoff, so are protected * by swap_lock. changing flags need * hold this lock and swap_lock. If * both locks need hold, hold swap_lock * first. */ - spinlock_t cont_lock; /* - * protect swap count continuation page - * list. 
- */ struct work_struct discard_work; /* discard worker */ struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ @@ -451,7 +435,6 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -extern int add_swap_count_continuation(swp_entry_t, gfp_t); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); @@ -517,11 +500,6 @@ static inline void free_swap_cache(struct folio *folio) { } -static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) -{ - return 0; -} - static inline int swap_dup_entry_direct(swp_entry_t ent) { return 0; diff --git a/mm/memory.c b/mm/memory.c index 2f815a34d924..7084c426f933 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1346,7 +1346,7 @@ again: if (ret == -EIO) { VM_WARN_ON_ONCE(!entry.val); - if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) { + if (swap_retry_table_alloc(entry, GFP_KERNEL) < 0) { ret = -ENOMEM; goto out; } diff --git a/mm/swap.h b/mm/swap.h index bfafa637c458..0a91e21e92b1 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -37,6 +37,7 @@ struct swap_cluster_info { u8 flags; u8 order; atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */ + unsigned int *extend_table; /* For large swap count, protected by ci->lock */ struct list_head list; }; @@ -183,6 +184,8 @@ static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) spin_unlock_irq(&ci->lock); } +extern int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp); + /* * Below are the core routines for doing swap for a folio. 
* All helpers requires the folio to be locked, and a locked folio @@ -206,9 +209,9 @@ int folio_dup_swap(struct folio *folio, struct page *subpage); void folio_put_swap(struct folio *folio, struct page *subpage); /* For internal use */ -extern void swap_entries_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset, unsigned int nr_pages); +extern void __swap_cluster_free_entries(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned int ci_off, unsigned int nr_pages); /* linux/mm/page_io.c */ int sio_pool_init(void); @@ -446,6 +449,11 @@ static inline int swap_writeout(struct folio *folio, return 0; } +static inline int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp) +{ + return -EINVAL; +} + static inline bool swap_cache_has_folio(swp_entry_t entry) { return false; diff --git a/mm/swap_state.c b/mm/swap_state.c index e213ee35c1d2..e7618ffe6d70 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -140,21 +140,20 @@ void *swap_cache_get_shadow(swp_entry_t entry) void __swap_cache_add_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry) { - unsigned long new_tb; - unsigned int ci_start, ci_off, ci_end; + unsigned int ci_off = swp_cluster_offset(entry), ci_end; unsigned long nr_pages = folio_nr_pages(folio); + unsigned long pfn = folio_pfn(folio); + unsigned long old_tb; VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); - new_tb = folio_to_swp_tb(folio, 0); - ci_start = swp_cluster_offset(entry); - ci_off = ci_start; - ci_end = ci_start + nr_pages; + ci_end = ci_off + nr_pages; do { - VM_WARN_ON_ONCE(swp_tb_is_folio(__swap_table_get(ci, ci_off))); - __swap_table_set(ci, ci_off, new_tb); + old_tb = __swap_table_get(ci, ci_off); + VM_WARN_ON_ONCE(swp_tb_is_folio(old_tb)); + __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb))); } while 
(++ci_off < ci_end); folio_ref_add(folio, nr_pages); @@ -183,14 +182,13 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, unsigned long old_tb; struct swap_info_struct *si; struct swap_cluster_info *ci; - unsigned int ci_start, ci_off, ci_end, offset; + unsigned int ci_start, ci_off, ci_end; unsigned long nr_pages = folio_nr_pages(folio); si = __swap_entry_to_info(entry); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; ci_off = ci_start; - offset = swp_offset(entry); ci = swap_cluster_lock(si, swp_offset(entry)); if (unlikely(!ci->table)) { err = -ENOENT; @@ -202,13 +200,12 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, err = -EEXIST; goto failed; } - if (unlikely(!__swap_count(swp_entry(swp_type(entry), offset)))) { + if (unlikely(!__swp_tb_get_count(old_tb))) { err = -ENOENT; goto failed; } if (swp_tb_is_shadow(old_tb)) shadow = swp_tb_to_shadow(old_tb); - offset++; } while (++ci_off < ci_end); __swap_cache_add_folio(ci, folio, entry); swap_cluster_unlock(ci); @@ -237,8 +234,9 @@ failed: void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow) { + int count; + unsigned long old_tb; struct swap_info_struct *si; - unsigned long old_tb, new_tb; unsigned int ci_start, ci_off, ci_end; bool folio_swapped = false, need_free = false; unsigned long nr_pages = folio_nr_pages(folio); @@ -249,20 +247,20 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); si = __swap_entry_to_info(entry); - new_tb = shadow_to_swp_tb(shadow, 0); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; ci_off = ci_start; do { - /* If shadow is NULL, we sets an empty shadow */ - old_tb = __swap_table_xchg(ci, ci_off, new_tb); + old_tb = __swap_table_get(ci, ci_off); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != folio); - if 
(__swap_count(swp_entry(si->type, - swp_offset(entry) + ci_off - ci_start))) + count = __swp_tb_get_count(old_tb); + if (count) folio_swapped = true; else need_free = true; + /* If shadow is NULL, we set an empty shadow. */ + __swap_table_set(ci, ci_off, shadow_to_swp_tb(shadow, count)); } while (++ci_off < ci_end); folio->swap.val = 0; @@ -271,13 +269,13 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); if (!folio_swapped) { - swap_entries_free(si, ci, swp_offset(entry), nr_pages); + __swap_cluster_free_entries(si, ci, ci_start, nr_pages); } else if (need_free) { + ci_off = ci_start; do { - if (!__swap_count(entry)) - swap_entries_free(si, ci, swp_offset(entry), 1); - entry.val++; - } while (--nr_pages); + if (!__swp_tb_get_count(__swap_table_get(ci, ci_off))) + __swap_cluster_free_entries(si, ci, ci_off, 1); + } while (++ci_off < ci_end); } } @@ -324,17 +322,18 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci, unsigned long nr_pages = folio_nr_pages(new); unsigned int ci_off = swp_cluster_offset(entry); unsigned int ci_end = ci_off + nr_pages; - unsigned long old_tb, new_tb; + unsigned long pfn = folio_pfn(new); + unsigned long old_tb; VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new)); VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new)); VM_WARN_ON_ONCE(!entry.val); /* Swap cache still stores N entries instead of a high-order entry */ - new_tb = folio_to_swp_tb(new, 0); do { - old_tb = __swap_table_xchg(ci, ci_off, new_tb); + old_tb = __swap_table_get(ci, ci_off); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old); + __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb))); } while (++ci_off < ci_end); /* @@ -368,7 +367,7 @@ void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents) ci_end = ci_off + nr_ents; do { old = __swap_table_xchg(ci, ci_off, null_to_swp_tb()); -
WARN_ON_ONCE(swp_tb_is_folio(old)); + WARN_ON_ONCE(swp_tb_is_folio(old) || swp_tb_get_count(old)); } while (++ci_off < ci_end); } diff --git a/mm/swap_table.h b/mm/swap_table.h index 10762ac5f4f5..8415ffbe2b9c 100644 --- a/mm/swap_table.h +++ b/mm/swap_table.h @@ -191,6 +191,11 @@ static inline int swp_tb_get_count(unsigned long swp_tb) return -EINVAL; } +static inline unsigned long __swp_tb_mk_count(unsigned long swp_tb, int count) +{ + return ((swp_tb & ~SWP_TB_COUNT_MASK) | __count_to_swp_tb(count)); +} + /* * Helpers for accessing or modifying the swap table of a cluster, * the swap cluster must be locked. diff --git a/mm/swapfile.c b/mm/swapfile.c index 54a19ebce540..cf976ecae8a8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -51,15 +51,8 @@ #include "swap_table.h" #include "swap.h" -static bool swap_count_continued(struct swap_info_struct *, pgoff_t, - unsigned char); -static void free_swap_count_continuations(struct swap_info_struct *); static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); -static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr); -static void swap_put_entry_locked(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset); static bool folio_swapcache_freeable(struct folio *folio); static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, @@ -182,22 +175,19 @@ static long swap_usage_in_pages(struct swap_info_struct *si) /* Reclaim the swap entry if swap is getting full */ #define TTRS_FULL 0x4 -static bool swap_only_has_cache(struct swap_info_struct *si, - struct swap_cluster_info *ci, +static bool swap_only_has_cache(struct swap_cluster_info *ci, unsigned long offset, int nr_pages) { unsigned int ci_off = offset % SWAPFILE_CLUSTER; - unsigned char *map = si->swap_map + offset; - unsigned char *map_end = map + nr_pages; + unsigned int ci_end = ci_off + nr_pages; unsigned long swp_tb; do { swp_tb = 
__swap_table_get(ci, ci_off); VM_WARN_ON_ONCE(!swp_tb_is_folio(swp_tb)); - if (*map) + if (swp_tb_get_count(swp_tb)) return false; - ++ci_off; - } while (++map < map_end); + } while (++ci_off < ci_end); return true; } @@ -256,7 +246,7 @@ again: * reference or pending writeback, and can't be allocated to others. */ ci = swap_cluster_lock(si, offset); - need_reclaim = swap_only_has_cache(si, ci, offset, nr_pages); + need_reclaim = swap_only_has_cache(ci, offset, nr_pages); swap_cluster_unlock(ci); if (!need_reclaim) goto out_unlock; @@ -479,6 +469,7 @@ static void swap_cluster_assert_empty(struct swap_cluster_info *ci, } while (++ci_off < ci_end); WARN_ON_ONCE(bad_slots != (swapoff ? ci->count : 0)); + WARN_ON_ONCE(nr == SWAPFILE_CLUSTER && ci->extend_table); } static void swap_cluster_free_table(struct swap_cluster_info *ci) @@ -807,7 +798,6 @@ static int swap_cluster_setup_bad_slot(struct swap_info_struct *si, pr_warn("Duplicated bad slot offset %d\n", offset); ret = -EINVAL; } else { - si->swap_map[offset] = SWAP_MAP_BAD; ci->count++; } spin_unlock(&ci->lock); @@ -829,18 +819,16 @@ static bool cluster_reclaim_range(struct swap_info_struct *si, { unsigned int nr_pages = 1 << order; unsigned long offset = start, end = start + nr_pages; - unsigned char *map = si->swap_map; unsigned long swp_tb; spin_unlock(&ci->lock); do { - if (READ_ONCE(map[offset])) - break; swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); - if (swp_tb_is_folio(swp_tb)) { + if (swp_tb_get_count(swp_tb)) + break; + if (swp_tb_is_folio(swp_tb)) if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY) < 0) break; - } } while (++offset < end); spin_lock(&ci->lock); @@ -864,7 +852,7 @@ static bool cluster_reclaim_range(struct swap_info_struct *si, */ for (offset = start; offset < end; offset++) { swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); - if (map[offset] || !swp_tb_is_null(swp_tb)) + if (!swp_tb_is_null(swp_tb)) return false; } @@ -876,37 +864,35 @@ static bool 
cluster_scan_range(struct swap_info_struct *si, unsigned long offset, unsigned int nr_pages, bool *need_reclaim) { - unsigned long end = offset + nr_pages; - unsigned char *map = si->swap_map; + unsigned int ci_off = offset % SWAPFILE_CLUSTER; + unsigned int ci_end = ci_off + nr_pages; unsigned long swp_tb; - if (cluster_is_empty(ci)) - return true; - do { - if (map[offset]) - return false; - swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); - if (swp_tb_is_folio(swp_tb)) { + swp_tb = __swap_table_get(ci, ci_off); + if (swp_tb_is_null(swp_tb)) + continue; + if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) { if (!vm_swap_full()) return false; *need_reclaim = true; - } else { - /* A entry with no count and no cache must be null */ - VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); + continue; } - } while (++offset < end); + /* Slot with zero count can only be NULL or folio */ + VM_WARN_ON(!swp_tb_get_count(swp_tb)); + return false; + } while (++ci_off < ci_end); return true; } -static bool cluster_alloc_range(struct swap_info_struct *si, - struct swap_cluster_info *ci, - struct folio *folio, - unsigned int offset) +static bool __swap_cluster_alloc_entries(struct swap_info_struct *si, + struct swap_cluster_info *ci, + struct folio *folio, + unsigned int ci_off) { - unsigned long nr_pages; unsigned int order; + unsigned long nr_pages; lockdep_assert_held(&ci->lock); @@ -925,14 +911,15 @@ static bool cluster_alloc_range(struct swap_info_struct *si, if (likely(folio)) { order = folio_order(folio); nr_pages = 1 << order; - swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, nr_pages, false); - __swap_cache_add_folio(ci, folio, swp_entry(si->type, offset)); + swap_cluster_assert_empty(ci, ci_off, nr_pages, false); + __swap_cache_add_folio(ci, folio, swp_entry(si->type, + ci_off + cluster_offset(si, ci))); } else if (IS_ENABLED(CONFIG_HIBERNATION)) { order = 0; nr_pages = 1; - WARN_ON_ONCE(si->swap_map[offset]); - si->swap_map[offset] = 1; - 
swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, 1, false); + swap_cluster_assert_empty(ci, ci_off, 1, false); + /* Sets a fake shadow as placeholder */ + __swap_table_set(ci, ci_off, shadow_to_swp_tb(NULL, 1)); } else { /* Allocation without folio is only possible with hibernation */ WARN_ON_ONCE(1); @@ -983,7 +970,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, if (!ret) continue; } - if (!cluster_alloc_range(si, ci, folio, offset)) + if (!__swap_cluster_alloc_entries(si, ci, folio, offset % SWAPFILE_CLUSTER)) break; found = offset; offset += nr_pages; @@ -1030,7 +1017,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) long to_scan = 1; unsigned long offset, end; struct swap_cluster_info *ci; - unsigned char *map = si->swap_map; + unsigned long swp_tb; int nr_reclaim; if (force) @@ -1042,8 +1029,8 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) to_scan--; while (offset < end) { - if (!READ_ONCE(map[offset]) && - swp_tb_is_folio(swap_table_get(ci, offset % SWAPFILE_CLUSTER))) { + swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); + if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) { spin_unlock(&ci->lock); nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); @@ -1452,40 +1439,127 @@ start_over: return false; } +static int swap_extend_table_alloc(struct swap_info_struct *si, + struct swap_cluster_info *ci, gfp_t gfp) +{ + void *table; + + table = kzalloc(sizeof(ci->extend_table[0]) * SWAPFILE_CLUSTER, gfp); + if (!table) + return -ENOMEM; + + spin_lock(&ci->lock); + if (!ci->extend_table) + ci->extend_table = table; + else + kfree(table); + spin_unlock(&ci->lock); + return 0; +} + +int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp) +{ + int ret; + struct swap_info_struct *si; + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + + si = get_swap_device(entry); + if (!si) + return 0; + + ci = 
__swap_offset_to_cluster(si, offset); + ret = swap_extend_table_alloc(si, ci, gfp); + + put_swap_device(si); + return ret; +} + +static void swap_extend_table_try_free(struct swap_cluster_info *ci) +{ + unsigned long i; + bool can_free = true; + + if (!ci->extend_table) + return; + + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + if (ci->extend_table[i]) + can_free = false; + } + + if (can_free) { + kfree(ci->extend_table); + ci->extend_table = NULL; + } +} + +/* Decrease the swap count of one slot, without freeing it */ +static void __swap_cluster_put_entry(struct swap_cluster_info *ci, + unsigned int ci_off) +{ + int count; + unsigned long swp_tb; + + lockdep_assert_held(&ci->lock); + swp_tb = __swap_table_get(ci, ci_off); + count = __swp_tb_get_count(swp_tb); + + VM_WARN_ON_ONCE(count <= 0); + VM_WARN_ON_ONCE(count > SWP_TB_COUNT_MAX); + + if (count == SWP_TB_COUNT_MAX) { + count = ci->extend_table[ci_off]; + /* Overflow starts with SWP_TB_COUNT_MAX */ + VM_WARN_ON_ONCE(count < SWP_TB_COUNT_MAX); + count--; + if (count == (SWP_TB_COUNT_MAX - 1)) { + ci->extend_table[ci_off] = 0; + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count)); + swap_extend_table_try_free(ci); + } else { + ci->extend_table[ci_off] = count; + } + } else { + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, --count)); + } +} + /** - * swap_put_entries_cluster - Decrease the swap count of a set of slots. + * swap_put_entries_cluster - Decrease the swap count of slots within one cluster * @si: The swap device. - * @start: start offset of slots. + * @offset: start offset of slots. * @nr: number of slots. - * @reclaim_cache: if true, also reclaim the swap cache. + * @reclaim_cache: if true, also reclaim the swap cache if slots are freed. * * This helper decreases the swap count of a set of slots and tries to * batch free them. Also reclaims the swap cache if @reclaim_cache is true. 
- * Context: The caller must ensure that all slots belong to the same - * cluster and their swap count doesn't go underflow. + * + * Context: The specified slots must be pinned by existing swap count or swap + * cache reference, so they won't be released until this helper returns. */ static void swap_put_entries_cluster(struct swap_info_struct *si, - unsigned long start, int nr, + pgoff_t offset, int nr, bool reclaim_cache) { - unsigned long offset = start, end = start + nr; - unsigned long batch_start = SWAP_ENTRY_INVALID; struct swap_cluster_info *ci; + unsigned int ci_off, ci_end; + pgoff_t end = offset + nr; bool need_reclaim = false; unsigned int nr_reclaimed; unsigned long swp_tb; - unsigned int count; + int ci_batch = -1; ci = swap_cluster_lock(si, offset); + ci_off = offset % SWAPFILE_CLUSTER; + ci_end = ci_off + nr; do { - swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); - count = si->swap_map[offset]; - VM_WARN_ON(count < 1 || count == SWAP_MAP_BAD); - if (count == 1) { + swp_tb = __swap_table_get(ci, ci_off); + if (swp_tb_get_count(swp_tb) == 1) { /* count == 1 and non-cached slots will be batch freed. */ if (!swp_tb_is_folio(swp_tb)) { - if (!batch_start) - batch_start = offset; + if (ci_batch == -1) + ci_batch = ci_off; continue; } /* count will be 0 after put, slot can be reclaimed */ @@ -1497,21 +1571,20 @@ static void swap_put_entries_cluster(struct swap_info_struct *si, * slots will be freed when folio is removed from swap cache * (__swap_cache_del_folio). 
*/ - swap_put_entry_locked(si, ci, offset); - if (batch_start) { - swap_entries_free(si, ci, batch_start, offset - batch_start); - batch_start = SWAP_ENTRY_INVALID; + __swap_cluster_put_entry(ci, ci_off); + if (ci_batch != -1) { + __swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch); + ci_batch = -1; } - } while (++offset < end); + } while (++ci_off < ci_end); - if (batch_start) - swap_entries_free(si, ci, batch_start, offset - batch_start); + if (ci_batch != -1) + __swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch); swap_cluster_unlock(ci); if (!need_reclaim || !reclaim_cache) return; - offset = start; do { nr_reclaimed = __try_to_reclaim_swap(si, offset, TTRS_UNMAPPED | TTRS_FULL); @@ -1521,6 +1594,92 @@ static void swap_put_entries_cluster(struct swap_info_struct *si, } while (offset < end); } +/* Increase the swap count of one slot. */ +static int __swap_cluster_dup_entry(struct swap_cluster_info *ci, + unsigned int ci_off) +{ + int count; + unsigned long swp_tb; + + lockdep_assert_held(&ci->lock); + swp_tb = __swap_table_get(ci, ci_off); + /* Bad or special slots can't be handled */ + if (WARN_ON_ONCE(swp_tb_is_bad(swp_tb))) + return -EINVAL; + count = __swp_tb_get_count(swp_tb); + /* Must be either cached or have a count already */ + if (WARN_ON_ONCE(!count && !swp_tb_is_folio(swp_tb))) + return -ENOENT; + + if (likely(count < (SWP_TB_COUNT_MAX - 1))) { + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count + 1)); + VM_WARN_ON_ONCE(ci->extend_table && ci->extend_table[ci_off]); + } else if (count == (SWP_TB_COUNT_MAX - 1)) { + if (ci->extend_table) { + VM_WARN_ON_ONCE(ci->extend_table[ci_off]); + ci->extend_table[ci_off] = SWP_TB_COUNT_MAX; + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, SWP_TB_COUNT_MAX)); + } else { + return -ENOMEM; + } + } else if (count == SWP_TB_COUNT_MAX) { + VM_WARN_ON_ONCE(ci->extend_table[ci_off] >= + type_max(typeof(ci->extend_table[0]))); + ++ci->extend_table[ci_off]; + } else { + /* Never 
happens unless counting went wrong */ + WARN_ON_ONCE(1); + } + + return 0; +} + +/** + * swap_dup_entries_cluster: Increase the swap count of slots within one cluster. + * @si: The swap device. + * @offset: start offset of slots. + * @nr: number of slots. + * + * Context: The specified slots must be pinned by existing swap count or swap + * cache reference, so they won't be released until this helper returns. + * Return: 0 on success. -ENOMEM if the swap count maxed out (SWP_TB_COUNT_MAX) + * and failed to allocate an extended table, -EINVAL if any entry is bad entry. + */ +static int swap_dup_entries_cluster(struct swap_info_struct *si, + pgoff_t offset, int nr) +{ + int err; + struct swap_cluster_info *ci; + unsigned int ci_start, ci_off, ci_end; + + ci_start = offset % SWAPFILE_CLUSTER; + ci_end = ci_start + nr; + ci_off = ci_start; + ci = swap_cluster_lock(si, offset); +restart: + do { + err = __swap_cluster_dup_entry(ci, ci_off); + if (unlikely(err)) { + if (err == -ENOMEM) { + spin_unlock(&ci->lock); + err = swap_extend_table_alloc(si, ci, GFP_ATOMIC); + spin_lock(&ci->lock); + if (!err) + goto restart; + } + goto failed; + } + } while (++ci_off < ci_end); + swap_cluster_unlock(ci); + return 0; +failed: + while (ci_off-- > ci_start) + __swap_cluster_put_entry(ci, ci_off); + swap_extend_table_try_free(ci); + swap_cluster_unlock(ci); + return err; +} + /** * folio_alloc_swap - allocate swap space for a folio * @folio: folio we want to move to swap @@ -1589,13 +1748,10 @@ again: * Context: Caller must ensure the folio is locked and in the swap cache. * NOTE: The caller also has to ensure there is no raced call to * swap_put_entries_direct on its swap entry before this helper returns, or - * the swap map may underflow. Currently, we only accept @subpage == NULL - * for shmem due to the limitation of swap continuation: shmem always - * duplicates the swap entry only once, so there is no such issue for it. + * the swap count may underflow. 
*/ int folio_dup_swap(struct folio *folio, struct page *subpage) { - int err = 0; swp_entry_t entry = folio->swap; unsigned long nr_pages = folio_nr_pages(folio); @@ -1607,10 +1763,8 @@ int folio_dup_swap(struct folio *folio, struct page *subpage) nr_pages = 1; } - while (!err && __swap_duplicate(entry, 1, nr_pages) == -ENOMEM) - err = add_swap_count_continuation(entry, GFP_ATOMIC); - - return err; + return swap_dup_entries_cluster(swap_entry_to_info(entry), + swp_offset(entry), nr_pages); } /** @@ -1639,28 +1793,6 @@ void folio_put_swap(struct folio *folio, struct page *subpage) swap_put_entries_cluster(si, swp_offset(entry), nr_pages, false); } -static void swap_put_entry_locked(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset) -{ - unsigned char count; - - count = si->swap_map[offset]; - if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { - if (count == COUNT_CONTINUED) { - if (swap_count_continued(si, offset, count)) - count = SWAP_MAP_MAX | COUNT_CONTINUED; - else - count = SWAP_MAP_MAX; - } else - count--; - } - - WRITE_ONCE(si->swap_map[offset], count); - if (!count && !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))) - swap_entries_free(si, ci, offset, 1); -} - /* * When we get a swap entry, if there aren't some other ways to * prevent swapoff, such as the folio in swap cache is locked, RCU @@ -1727,31 +1859,30 @@ put_out: } /* - * Drop the last ref of swap entries, caller have to ensure all entries - * belong to the same cgroup and cluster. + * Free a set of swap slots after their swap count dropped to zero, or will be + * zero after putting the last ref (saves one __swap_cluster_put_entry call). 
*/ -void swap_entries_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset, unsigned int nr_pages) +void __swap_cluster_free_entries(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned int ci_start, unsigned int nr_pages) { - swp_entry_t entry = swp_entry(si->type, offset); - unsigned char *map = si->swap_map + offset; - unsigned char *map_end = map + nr_pages; + unsigned long old_tb; + unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages; + unsigned long offset = cluster_offset(si, ci) + ci_start; - /* It should never free entries across different clusters */ - VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1)); - VM_BUG_ON(cluster_is_empty(ci)); - VM_BUG_ON(ci->count < nr_pages); + VM_WARN_ON(ci->count < nr_pages); ci->count -= nr_pages; do { - VM_WARN_ON(*map > 1); - *map = 0; - } while (++map < map_end); + old_tb = __swap_table_get(ci, ci_off); + /* Release the last ref, or after swap cache is dropped */ + VM_WARN_ON(!swp_tb_is_shadow(old_tb) || __swp_tb_get_count(old_tb) > 1); + __swap_table_set(ci, ci_off, null_to_swp_tb()); + } while (++ci_off < ci_end); - mem_cgroup_uncharge_swap(entry, nr_pages); + mem_cgroup_uncharge_swap(swp_entry(si->type, offset), nr_pages); swap_range_free(si, offset, nr_pages); - swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, nr_pages, false); + swap_cluster_assert_empty(ci, ci_start, nr_pages, false); if (!ci->count) free_cluster(si, ci); @@ -1761,10 +1892,10 @@ void swap_entries_free(struct swap_info_struct *si, int __swap_count(swp_entry_t entry) { - struct swap_info_struct *si = __swap_entry_to_info(entry); - pgoff_t offset = swp_offset(entry); + struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); + unsigned int ci_off = swp_cluster_offset(entry); - return si->swap_map[offset]; + return swp_tb_get_count(__swap_table_get(ci, ci_off)); } /** @@ -1776,81 +1907,62 @@ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t 
entry) { pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; - int count; + unsigned long swp_tb; ci = swap_cluster_lock(si, offset); - count = si->swap_map[offset]; + swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); swap_cluster_unlock(ci); - return count && count != SWAP_MAP_BAD; + return swp_tb_get_count(swp_tb) > 0; } /* * How many references to @entry are currently swapped out? - * This considers COUNT_CONTINUED so it returns exact answer. + * This returns exact answer. */ int swp_swapcount(swp_entry_t entry) { - int count, tmp_count, n; struct swap_info_struct *si; struct swap_cluster_info *ci; - struct page *page; - pgoff_t offset; - unsigned char *map; + unsigned long swp_tb; + int count; si = get_swap_device(entry); if (!si) return 0; - offset = swp_offset(entry); - - ci = swap_cluster_lock(si, offset); - - count = si->swap_map[offset]; - if (!(count & COUNT_CONTINUED)) - goto out; - - count &= ~COUNT_CONTINUED; - n = SWAP_MAP_MAX + 1; - - page = vmalloc_to_page(si->swap_map + offset); - offset &= ~PAGE_MASK; - VM_BUG_ON(page_private(page) != SWP_CONTINUED); - - do { - page = list_next_entry(page, lru); - map = kmap_local_page(page); - tmp_count = map[offset]; - kunmap_local(map); - - count += (tmp_count & ~COUNT_CONTINUED) * n; - n *= (SWAP_CONT_MAX + 1); - } while (tmp_count & COUNT_CONTINUED); -out: + ci = swap_cluster_lock(si, swp_offset(entry)); + swp_tb = __swap_table_get(ci, swp_cluster_offset(entry)); + count = swp_tb_get_count(swp_tb); + if (count == SWP_TB_COUNT_MAX) + count = ci->extend_table[swp_cluster_offset(entry)]; swap_cluster_unlock(ci); put_swap_device(si); - return count; + + return count < 0 ? 
0 : count; } static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, swp_entry_t entry, int order) { struct swap_cluster_info *ci; - unsigned char *map = si->swap_map; unsigned int nr_pages = 1 << order; unsigned long roffset = swp_offset(entry); unsigned long offset = round_down(roffset, nr_pages); + unsigned int ci_off; int i; bool ret = false; ci = swap_cluster_lock(si, offset); if (nr_pages == 1) { - if (map[roffset]) + ci_off = roffset % SWAPFILE_CLUSTER; + if (swp_tb_get_count(__swap_table_get(ci, ci_off))) ret = true; goto unlock_out; } for (i = 0; i < nr_pages; i++) { - if (map[offset + i]) { + ci_off = (offset + i) % SWAPFILE_CLUSTER; + if (swp_tb_get_count(__swap_table_get(ci, ci_off))) { ret = true; break; } @@ -2016,7 +2128,8 @@ void swap_free_hibernation_slot(swp_entry_t entry) return; ci = swap_cluster_lock(si, offset); - swap_put_entry_locked(si, ci, offset); + __swap_cluster_put_entry(ci, offset % SWAPFILE_CLUSTER); + __swap_cluster_free_entries(si, ci, offset % SWAPFILE_CLUSTER, 1); swap_cluster_unlock(ci); /* In theory readahead might add it to the swap cache by accident */ @@ -2242,13 +2355,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned int type) { pte_t *pte = NULL; - struct swap_info_struct *si; - si = swap_info[type]; do { struct folio *folio; - unsigned long offset; - unsigned char swp_count; + unsigned long swp_tb; softleaf_t entry; int ret; pte_t ptent; @@ -2267,7 +2377,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (swp_type(entry) != type) continue; - offset = swp_offset(entry); pte_unmap(pte); pte = NULL; @@ -2284,8 +2393,9 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, &vmf); } if (!folio) { - swp_count = READ_ONCE(si->swap_map[offset]); - if (swp_count == 0 || swp_count == SWAP_MAP_BAD) + swp_tb = swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); + if (swp_tb_get_count(swp_tb) <= 0) continue; return -ENOMEM; } 
@@ -2413,7 +2523,7 @@ unlock: } /* - * Scan swap_map from current position to next entry still in use. + * Scan swap table from current position to next entry still in use. * Return 0 if there are no inuse entries after prev till end of * the map. */ @@ -2422,7 +2532,6 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, { unsigned int i; unsigned long swp_tb; - unsigned char count; /* * No need for swap_lock here: we're just looking @@ -2431,12 +2540,9 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, * allocations from this area (while holding swap_lock). */ for (i = prev + 1; i < si->max; i++) { - count = READ_ONCE(si->swap_map[i]); swp_tb = swap_table_get(__swap_offset_to_cluster(si, i), i % SWAPFILE_CLUSTER); - if (count == SWAP_MAP_BAD) - continue; - if (count || swp_tb_is_folio(swp_tb)) + if (!swp_tb_is_null(swp_tb) && !swp_tb_is_bad(swp_tb)) break; if ((i % LATENCY_LIMIT) == 0) cond_resched(); @@ -2796,7 +2902,6 @@ static void flush_percpu_swap_cluster(struct swap_info_struct *si) SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; - unsigned char *swap_map; unsigned long *zeromap; struct swap_cluster_info *cluster_info; struct file *swap_file, *victim; @@ -2874,8 +2979,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) flush_percpu_swap_cluster(p); destroy_swap_extents(p, p->swap_file); - if (p->flags & SWP_CONTINUED) - free_swap_count_continuations(p); if (!(p->flags & SWP_SOLIDSTATE)) atomic_dec(&nr_rotate_swap); @@ -2887,8 +2990,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_file = p->swap_file; p->swap_file = NULL; - swap_map = p->swap_map; - p->swap_map = NULL; zeromap = p->zeromap; p->zeromap = NULL; maxpages = p->max; @@ -2902,7 +3003,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&swapon_mutex); kfree(p->global_cluster); p->global_cluster = NULL; - vfree(swap_map); kvfree(zeromap); 
free_swap_cluster_info(cluster_info, maxpages); /* Destroy swap account information */ @@ -3122,7 +3222,6 @@ static struct swap_info_struct *alloc_swap_info(void) kvfree(defer); } spin_lock_init(&p->lock); - spin_lock_init(&p->cont_lock); atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT); init_completion(&p->comp); @@ -3249,19 +3348,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si, return maxpages; } -static int setup_swap_map(struct swap_info_struct *si, - union swap_header *swap_header, - unsigned long maxpages) -{ - unsigned char *swap_map; - - swap_map = vzalloc(maxpages); - si->swap_map = swap_map; - if (!swap_map) - return -ENOMEM; - return 0; -} - static int setup_swap_clusters_info(struct swap_info_struct *si, union swap_header *swap_header, unsigned long maxpages) @@ -3446,11 +3532,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) maxpages = si->max; - /* Setup the swap map and apply bad block */ - error = setup_swap_map(si, swap_header, maxpages); - if (error) - goto bad_swap_unlock_inode; - /* Set up the swap cluster info */ error = setup_swap_clusters_info(si, swap_header, maxpages); if (error) @@ -3571,8 +3652,6 @@ bad_swap: inode = NULL; destroy_swap_extents(si, swap_file); swap_cgroup_swapoff(si->type); - vfree(si->swap_map); - si->swap_map = NULL; free_swap_cluster_info(si->cluster_info, si->max); si->cluster_info = NULL; kvfree(si->zeromap); @@ -3614,67 +3693,20 @@ void si_swapinfo(struct sysinfo *val) } /* - * Verify that nr swap entries are valid and increment their swap map counts. + * swap_dup_entry_direct() - Increase reference count of a swap entry by one. + * @entry: first swap entry from which we want to increase the refcount. * - * Returns error code in following case. - * - success -> 0 - * - swp_entry is invalid -> EINVAL - * - swap-mapped reference is requested but the entry is not used. -> ENOENT - * - swap-mapped reference requested but needs continued swap count. 
-> ENOMEM + * Returns 0 for success, or -ENOMEM if the extend table is required + * but could not be atomically allocated. Returns -EINVAL if the swap + * entry is invalid, which might occur if a page table entry has got + * corrupted. + * + * Context: Caller must ensure there is no race condition on the reference + * owner. e.g., locking the PTL of a PTE containing the entry being increased. */ -static int swap_dup_entries(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset, - unsigned char usage, int nr) +int swap_dup_entry_direct(swp_entry_t entry) { - int i; - unsigned char count; - - for (i = 0; i < nr; i++) { - count = si->swap_map[offset + i]; - /* - * For swapin out, allocator never allocates bad slots. for - * swapin, readahead is guarded by swap_entry_swapped. - */ - if (WARN_ON(count == SWAP_MAP_BAD)) - return -ENOENT; - /* - * Swap count duplication must be guarded by either swap cache folio (from - * folio_dup_swap) or external lock of existing entry (from swap_dup_entry_direct). - */ - if (WARN_ON(!count && - !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER)))) - return -ENOENT; - if (WARN_ON((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)) - return -EINVAL; - } - - for (i = 0; i < nr; i++) { - count = si->swap_map[offset + i]; - if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) - count += usage; - else if (swap_count_continued(si, offset + i, count)) - count = COUNT_CONTINUED; - else { - /* - * Don't need to rollback changes, because if - * usage == 1, there must be nr == 1. 
- */ - return -ENOMEM; - } - - WRITE_ONCE(si->swap_map[offset + i], count); - } - - return 0; -} - -static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) -{ - int err; struct swap_info_struct *si; - struct swap_cluster_info *ci; - unsigned long offset = swp_offset(entry); si = swap_entry_to_info(entry); if (WARN_ON_ONCE(!si)) { @@ -3682,253 +3714,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) return -EINVAL; } - VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - ci = swap_cluster_lock(si, offset); - err = swap_dup_entries(si, ci, offset, usage, nr); - swap_cluster_unlock(ci); - return err; -} - -/* - * swap_dup_entry_direct() - Increase reference count of a swap entry by one. - * @entry: first swap entry from which we want to increase the refcount. - * - * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required - * but could not be atomically allocated. Returns 0, just as if it succeeded, - * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which - * might occur if a page table entry has got corrupted. - * - * Context: Caller must ensure there is no race condition on the reference - * owner. e.g., locking the PTL of a PTE containing the entry being increased. - */ -int swap_dup_entry_direct(swp_entry_t entry) -{ - int err = 0; - while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) - err = add_swap_count_continuation(entry, GFP_ATOMIC); - return err; -} - -/* - * add_swap_count_continuation - called when a swap count is duplicated - * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's - * page of the original vmalloc'ed swap_map, to hold the continuation count - * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called - * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. 
- * - * These continuation pages are seldom referenced: the common paths all work - * on the original swap_map, only referring to a continuation page when the - * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. - * - * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding - * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) - * can be called after dropping locks. - */ -int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) -{ - struct swap_info_struct *si; - struct swap_cluster_info *ci; - struct page *head; - struct page *page; - struct page *list_page; - pgoff_t offset; - unsigned char count; - int ret = 0; - - /* - * When debugging, it's easier to use __GFP_ZERO here; but it's better - * for latency not to zero a page while GFP_ATOMIC and holding locks. - */ - page = alloc_page(gfp_mask | __GFP_HIGHMEM); - - si = get_swap_device(entry); - if (!si) { - /* - * An acceptable race has occurred since the failing - * __swap_duplicate(): the swap device may be swapoff - */ - goto outer; - } - - offset = swp_offset(entry); - - ci = swap_cluster_lock(si, offset); - - count = si->swap_map[offset]; - - if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { - /* - * The higher the swap count, the more likely it is that tasks - * will race to add swap count continuation: we need to avoid - * over-provisioning. - */ - goto out; - } - - if (!page) { - ret = -ENOMEM; - goto out; - } - - head = vmalloc_to_page(si->swap_map + offset); - offset &= ~PAGE_MASK; - - spin_lock(&si->cont_lock); - /* - * Page allocation does not initialize the page's lru field, - * but it does always reset its private field. 
- */ - if (!page_private(head)) { - BUG_ON(count & COUNT_CONTINUED); - INIT_LIST_HEAD(&head->lru); - set_page_private(head, SWP_CONTINUED); - si->flags |= SWP_CONTINUED; - } - - list_for_each_entry(list_page, &head->lru, lru) { - unsigned char *map; - - /* - * If the previous map said no continuation, but we've found - * a continuation page, free our allocation and use this one. - */ - if (!(count & COUNT_CONTINUED)) - goto out_unlock_cont; - - map = kmap_local_page(list_page) + offset; - count = *map; - kunmap_local(map); - - /* - * If this continuation count now has some space in it, - * free our allocation and use this one. - */ - if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) - goto out_unlock_cont; - } - - list_add_tail(&page->lru, &head->lru); - page = NULL; /* now it's attached, don't free it */ -out_unlock_cont: - spin_unlock(&si->cont_lock); -out: - swap_cluster_unlock(ci); - put_swap_device(si); -outer: - if (page) - __free_page(page); - return ret; -} - -/* - * swap_count_continued - when the original swap_map count is incremented - * from SWAP_MAP_MAX, check if there is already a continuation page to carry - * into, carry if so, or else fail until a new continuation page is allocated; - * when the original swap_map count is decremented from 0 with continuation, - * borrow from the continuation and report whether it still holds more. - * Called while __swap_duplicate() or caller of swap_put_entry_locked() - * holds cluster lock. 
- */ -static bool swap_count_continued(struct swap_info_struct *si, - pgoff_t offset, unsigned char count) -{ - struct page *head; - struct page *page; - unsigned char *map; - bool ret; - - head = vmalloc_to_page(si->swap_map + offset); - if (page_private(head) != SWP_CONTINUED) { - BUG_ON(count & COUNT_CONTINUED); - return false; /* need to add count continuation */ - } - - spin_lock(&si->cont_lock); - offset &= ~PAGE_MASK; - page = list_next_entry(head, lru); - map = kmap_local_page(page) + offset; - - if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ - goto init_map; /* jump over SWAP_CONT_MAX checks */ - - if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ - /* - * Think of how you add 1 to 999 - */ - while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { - kunmap_local(map); - page = list_next_entry(page, lru); - BUG_ON(page == head); - map = kmap_local_page(page) + offset; - } - if (*map == SWAP_CONT_MAX) { - kunmap_local(map); - page = list_next_entry(page, lru); - if (page == head) { - ret = false; /* add count continuation */ - goto out; - } - map = kmap_local_page(page) + offset; -init_map: *map = 0; /* we didn't zero the page */ - } - *map += 1; - kunmap_local(map); - while ((page = list_prev_entry(page, lru)) != head) { - map = kmap_local_page(page) + offset; - *map = COUNT_CONTINUED; - kunmap_local(map); - } - ret = true; /* incremented */ - - } else { /* decrementing */ - /* - * Think of how you subtract 1 from 1000 - */ - BUG_ON(count != COUNT_CONTINUED); - while (*map == COUNT_CONTINUED) { - kunmap_local(map); - page = list_next_entry(page, lru); - BUG_ON(page == head); - map = kmap_local_page(page) + offset; - } - BUG_ON(*map == 0); - *map -= 1; - if (*map == 0) - count = 0; - kunmap_local(map); - while ((page = list_prev_entry(page, lru)) != head) { - map = kmap_local_page(page) + offset; - *map = SWAP_CONT_MAX | count; - count = COUNT_CONTINUED; - kunmap_local(map); - } - ret = count == COUNT_CONTINUED; - } -out: - 
spin_unlock(&si->cont_lock); - return ret; -} - -/* - * free_swap_count_continuations - swapoff free all the continuation pages - * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. - */ -static void free_swap_count_continuations(struct swap_info_struct *si) -{ - pgoff_t offset; - - for (offset = 0; offset < si->max; offset += PAGE_SIZE) { - struct page *head; - head = vmalloc_to_page(si->swap_map + offset); - if (page_private(head)) { - struct page *page, *next; - - list_for_each_entry_safe(page, next, &head->lru, lru) { - list_del(&page->lru); - __free_page(page); - } - } - } + return swap_dup_entries_cluster(si, swp_offset(entry), 1); } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) From 45711d446b743da099b7a795ce91ca581d5981a3 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Feb 2026 04:06:35 +0800 Subject: [PATCH 051/369] mm, swap: no need to truncate the scan border swap_map had a static flexible size, so the last cluster won't be fully covered, hence the allocator needs to check the scan border to avoid OOB. But the swap table has a fixed-sized swap table for each cluster, and the slots beyond the device size are marked as bad slots. The allocator can simply scan all slots as usual, and any bad slots will be skipped. 
Link: https://lkml.kernel.org/r/20260218-swap-table-p3-v3-10-f4e34be021a7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swap.h | 2 +- mm/swapfile.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index 0a91e21e92b1..cc410b94e91a 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -85,7 +85,7 @@ static inline struct swap_cluster_info *__swap_offset_to_cluster( struct swap_info_struct *si, pgoff_t offset) { VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ - VM_WARN_ON_ONCE(offset >= si->max); + VM_WARN_ON_ONCE(offset >= roundup(si->max, SWAPFILE_CLUSTER)); return &si->cluster_info[offset / SWAPFILE_CLUSTER]; } diff --git a/mm/swapfile.c b/mm/swapfile.c index cf976ecae8a8..4442c9040764 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -945,8 +945,8 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, { unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER); - unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); unsigned int order = likely(folio) ? folio_order(folio) : 0; + unsigned long end = start + SWAPFILE_CLUSTER; unsigned int nr_pages = 1 << order; bool need_reclaim, ret, usable; From a0f79916e125f75cf665f5b3ff6ccc1ff60b1a10 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Feb 2026 04:06:36 +0800 Subject: [PATCH 052/369] mm, swap: simplify checking if a folio is swapped Clean up and simplify how we check if a folio is swapped. The helper already requires the folio to be in swap cache and locked. That's enough to pin the swap cluster from being freed, so there is no need to lock anything else to avoid UAF. 
And besides, we have cleaned up and defined the swap operation to be mostly folio based, and now the only place a folio will have any of its swap slots' count increased from 0 to 1 is folio_dup_swap, which also requires the folio lock. So as we are holding the folio lock here, a folio can't change its swap status from not swapped (all swap slots have a count of 0) to swapped (any slot has a swap count larger than 0). So there won't be any false negatives of this helper if we simply depend on the folio lock to stabilize the cluster. We are only using this helper to determine if we can and should release the swap cache. So false positives are completely harmless, and also already exist before. Depending on the timing, previously, it's also possible that a racing thread releases the swap count right after releasing the ci lock and before this helper returns. In any case, the worst that could happen is we leave a clean swap cache. It will still be reclaimed when under pressure just fine. So, in conclusion, we can simplify and make the check much simpler and lockless. Also, rename it to folio_maybe_swapped to reflect the design. Link: https://lkml.kernel.org/r/20260218-swap-table-p3-v3-11-f4e34be021a7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swap.h | 5 +-- mm/swapfile.c | 84 ++++++++++++++++++++++++++++----------------------- 2 files changed, 49 insertions(+), 40 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index cc410b94e91a..9728e6a944b2 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -195,12 +195,13 @@ extern int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp); * * folio_alloc_swap(): the entry point for a folio to be swapped * out. It allocates swap slots and pins the slots with swap cache. - * The slots start with a swap count of zero. 
+ * The slots start with a swap count of zero. The slots are pinned + * by swap cache reference which doesn't contribute to swap count. * * folio_dup_swap(): increases the swap count of a folio, usually * during it gets unmapped and a swap entry is installed to replace * it (e.g., swap entry in page table). A swap slot with swap - * count == 0 should only be increasd by this helper. + * count == 0 can only be increased by this helper. * * folio_put_swap(): does the opposite thing of folio_dup_swap(). */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 4442c9040764..f9ba89cb290c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1743,7 +1743,11 @@ again: * @subpage: if not NULL, only increase the swap count of this subpage. * * Typically called when the folio is unmapped and have its swap entry to - * take its palce. + * take its place: Swap entries allocated to a folio has count == 0 and pinned + * by swap cache. The swap cache pin doesn't increase the swap count. This + * helper sets the initial count == 1 and increases the count as the folio is + * unmapped and swap entries referencing the slots are generated to replace + * the folio. * * Context: Caller must ensure the folio is locked and in the swap cache. * NOTE: The caller also has to ensure there is no raced call to @@ -1942,49 +1946,44 @@ int swp_swapcount(swp_entry_t entry) return count < 0 ? 
0 : count; } -static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, - swp_entry_t entry, int order) -{ - struct swap_cluster_info *ci; - unsigned int nr_pages = 1 << order; - unsigned long roffset = swp_offset(entry); - unsigned long offset = round_down(roffset, nr_pages); - unsigned int ci_off; - int i; - bool ret = false; - - ci = swap_cluster_lock(si, offset); - if (nr_pages == 1) { - ci_off = roffset % SWAPFILE_CLUSTER; - if (swp_tb_get_count(__swap_table_get(ci, ci_off))) - ret = true; - goto unlock_out; - } - for (i = 0; i < nr_pages; i++) { - ci_off = (offset + i) % SWAPFILE_CLUSTER; - if (swp_tb_get_count(__swap_table_get(ci, ci_off))) { - ret = true; - break; - } - } -unlock_out: - swap_cluster_unlock(ci); - return ret; -} - -static bool folio_swapped(struct folio *folio) +/* + * folio_maybe_swapped - Test if a folio covers any swap slot with count > 0. + * + * Check if a folio is swapped. Holding the folio lock ensures the folio won't + * go from not-swapped to swapped because the initial swap count increment can + * only be done by folio_dup_swap, which also locks the folio. But a concurrent + * decrease of swap count is possible through swap_put_entries_direct, so this + * may return a false positive. + * + * Context: Caller must ensure the folio is locked and in the swap cache. 
+ */ +static bool folio_maybe_swapped(struct folio *folio) { swp_entry_t entry = folio->swap; - struct swap_info_struct *si; + struct swap_cluster_info *ci; + unsigned int ci_off, ci_end; + bool ret = false; VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); - si = __swap_entry_to_info(entry); - if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) - return swap_entry_swapped(si, entry); + ci = __swap_entry_to_cluster(entry); + ci_off = swp_cluster_offset(entry); + ci_end = ci_off + folio_nr_pages(folio); + /* + * Extra locking not needed, folio lock ensures its swap entries + * won't be released, the backing data won't be gone either. + */ + rcu_read_lock(); + do { + if (__swp_tb_get_count(__swap_table_get(ci, ci_off))) { + ret = true; + break; + } + } while (++ci_off < ci_end); + rcu_read_unlock(); - return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); + return ret; } static bool folio_swapcache_freeable(struct folio *folio) @@ -2030,7 +2029,7 @@ bool folio_free_swap(struct folio *folio) { if (!folio_swapcache_freeable(folio)) return false; - if (folio_swapped(folio)) + if (folio_maybe_swapped(folio)) return false; swap_cache_del_folio(folio); @@ -3703,6 +3702,8 @@ void si_swapinfo(struct sysinfo *val) * * Context: Caller must ensure there is no race condition on the reference * owner. e.g., locking the PTL of a PTE containing the entry being increased. + * Also the swap entry must have a count >= 1. Otherwise folio_dup_swap should + * be used. */ int swap_dup_entry_direct(swp_entry_t entry) { @@ -3714,6 +3715,13 @@ int swap_dup_entry_direct(swp_entry_t entry) return -EINVAL; } + /* + * The caller must be increasing the swap count from a direct + * reference of the swap slot (e.g. a swap entry in page table). + * So the swap count must be >= 1. 
+ */ + VM_WARN_ON_ONCE(!swap_entry_swapped(si, entry)); + return swap_dup_entries_cluster(si, swp_offset(entry), 1); } From 1df1a1b950863e64c00d48df718ed7ed28db3ea3 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Feb 2026 04:06:37 +0800 Subject: [PATCH 053/369] mm, swap: no need to clear the shadow explicitly Since we no longer bypass the swap cache, every swap-in will clear the swap shadow by inserting the folio into the swap table. The only place we may seem to need to free the swap shadow is when the swap slots are freed directly without a folio (swap_put_entries_direct). But with the swap table, that is not needed either. Freeing a slot in the swap table will set the table entry to NULL, which erases the shadow just fine. So just delete all explicit shadow clearing, it's no longer needed. Also, rearrange the freeing. Link: https://lkml.kernel.org/r/20260218-swap-table-p3-v3-12-f4e34be021a7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kairui Song Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swap.h | 1 - mm/swap_state.c | 21 --------------------- mm/swapfile.c | 2 -- 3 files changed, 24 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index 9728e6a944b2..a77016f2423b 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -290,7 +290,6 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow); void __swap_cache_replace_folio(struct swap_cluster_info *ci, struct folio *old, struct folio *new); -void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents); void show_swap_cache_info(void); void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); diff --git a/mm/swap_state.c b/mm/swap_state.c index e7618ffe6d70..32d9d877bda8 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -350,27 +350,6 @@ void __swap_cache_replace_folio(struct 
swap_cluster_info *ci, } } -/** - * __swap_cache_clear_shadow - Clears a set of shadows in the swap cache. - * @entry: The starting index entry. - * @nr_ents: How many slots need to be cleared. - * - * Context: Caller must ensure the range is valid, all in one single cluster, - * not occupied by any folio, and lock the cluster. - */ -void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents) -{ - struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); - unsigned int ci_off = swp_cluster_offset(entry), ci_end; - unsigned long old; - - ci_end = ci_off + nr_ents; - do { - old = __swap_table_xchg(ci, ci_off, null_to_swp_tb()); - WARN_ON_ONCE(swp_tb_is_folio(old) || swp_tb_get_count(old)); - } while (++ci_off < ci_end); -} - /* * If we are the only user, then try to free up the swap cache. * diff --git a/mm/swapfile.c b/mm/swapfile.c index f9ba89cb290c..915bc93964db 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1287,7 +1287,6 @@ static void swap_range_alloc(struct swap_info_struct *si, static void swap_range_free(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { - unsigned long begin = offset; unsigned long end = offset + nr_entries - 1; void (*swap_slot_free_notify)(struct block_device *, unsigned long); unsigned int i; @@ -1312,7 +1311,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, swap_slot_free_notify(si->bdev, offset); offset++; } - __swap_cache_clear_shadow(swp_entry(si->type, begin), nr_entries); /* * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 From 1beb9b7223d2a1f1872f76a3d29b0a4a3cee4171 Mon Sep 17 00:00:00 2001 From: "Pratyush Yadav (Google)" Date: Mon, 16 Feb 2026 19:59:32 +0100 Subject: [PATCH 054/369] memfd: export memfd_{add,get}_seals() Patch series "mm: memfd_luo: preserve file seals", v2. This series adds support for preserving file seals when preserving a memfd using LUO. 
Patch 1 exports some memfd seal manipulation functions and patch 2 adds support for preserving them. Since it makes changes to the serialized data structure for memfd, it also bumps the version number. This patch (of 2): Support for preserving file seals will be added to memfd preservation using the Live Update Orchestrator (LUO). Export memfd_{add,get}_seals)() so memfd_luo can use them to manipulate the seals. Link: https://lkml.kernel.org/r/20260216185946.1215770-1-pratyush@kernel.org Link: https://lkml.kernel.org/r/20260216185946.1215770-2-pratyush@kernel.org Signed-off-by: Pratyush Yadav (Google) Acked-by: Mike Rapoport (Microsoft) Tested-by: Samiullah Khawaja Cc: Alexander Graf Cc: Baolin Wang Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- include/linux/memfd.h | 12 ++++++++++++ mm/memfd.c | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/include/linux/memfd.h b/include/linux/memfd.h index c328a7b356d0..b4fda09dab9f 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -18,6 +18,8 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); */ int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr); struct file *memfd_alloc_file(const char *name, unsigned int flags); +int memfd_get_seals(struct file *file); +int memfd_add_seals(struct file *file, unsigned int seals); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { @@ -37,6 +39,16 @@ static inline struct file *memfd_alloc_file(const char *name, unsigned int flags { return ERR_PTR(-EINVAL); } + +static inline int memfd_get_seals(struct file *file) +{ + return -EINVAL; +} + +static inline int memfd_add_seals(struct file *file, unsigned int seals) +{ + return -EINVAL; +} #endif #endif /* __LINUX_MEMFD_H */ diff --git a/mm/memfd.c b/mm/memfd.c index 919c2a53eb96..fb425f4e315f 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -227,7 +227,7 @@ static unsigned int 
*memfd_file_seals_ptr(struct file *file) F_SEAL_WRITE | \ F_SEAL_FUTURE_WRITE) -static int memfd_add_seals(struct file *file, unsigned int seals) +int memfd_add_seals(struct file *file, unsigned int seals) { struct inode *inode = file_inode(file); unsigned int *file_seals; @@ -309,7 +309,7 @@ unlock: return error; } -static int memfd_get_seals(struct file *file) +int memfd_get_seals(struct file *file) { unsigned int *seals = memfd_file_seals_ptr(file); From 8a552d68a86ef0e6fb2ff4af13031a5e82c0f1d0 Mon Sep 17 00:00:00 2001 From: "Pratyush Yadav (Google)" Date: Mon, 16 Feb 2026 19:59:33 +0100 Subject: [PATCH 055/369] mm: memfd_luo: preserve file seals File seals are used on memfd for making shared memory communication with untrusted peers safer and simpler. Seals provide a guarantee that certain operations won't be allowed on the file such as writes or truncations. Maintaining these guarantees across a live update will help keeping such use cases secure. These guarantees will also be needed for IOMMUFD preservation with LUO. Normally when IOMMUFD maps a memfd, it pins all its pages to make sure any truncation operations on the memfd don't lead to IOMMUFD using freed memory. This doesn't work with LUO since the preserved memfd might have completely different pages after a live update, and mapping them back to the IOMMUFD will cause all sorts of problems. Using and preserving the seals allows IOMMUFD preservation logic to trust the memfd. Since the uABI defines seals as an int, preserve them by introducing a new u32 field. There are currently only 6 possible seals, so the extra bits are unused and provide room for future expansion. Since the seals are uABI, it is safe to use them directly in the ABI. While at it, also add a u32 flags field. It makes sure the struct is nicely aligned, and can be used later to support things like MFD_CLOEXEC. Since the serialization structure is changed, bump the version number to "memfd-v2". 
It is important to note that the memfd-v2 version only supports seals that existed when this version was defined. This set is defined by MEMFD_LUO_ALL_SEALS. Any new seal might bring a completely different semantic with it and the parser for memfd-v2 cannot be expected to deal with that. If there are any future seals added, they will need another version bump. Link: https://lkml.kernel.org/r/20260216185946.1215770-3-pratyush@kernel.org Signed-off-by: Pratyush Yadav (Google) Tested-by: Samiullah Khawaja Cc: Alexander Graf Cc: Baolin Wang Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Mike Rapoport (Microsoft) Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- include/linux/kho/abi/memfd.h | 18 +++++++++++++++++- mm/memfd_luo.c | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/include/linux/kho/abi/memfd.h b/include/linux/kho/abi/memfd.h index 68cb6303b846..08b10fea2afc 100644 --- a/include/linux/kho/abi/memfd.h +++ b/include/linux/kho/abi/memfd.h @@ -56,10 +56,24 @@ struct memfd_luo_folio_ser { u64 index; } __packed; +/* + * The set of seals this version supports preserving. If support for any new + * seals is needed, add it here and bump version. + */ +#define MEMFD_LUO_ALL_SEALS (F_SEAL_SEAL | \ + F_SEAL_SHRINK | \ + F_SEAL_GROW | \ + F_SEAL_WRITE | \ + F_SEAL_FUTURE_WRITE | \ + F_SEAL_EXEC) + /** * struct memfd_luo_ser - Main serialization structure for a memfd. * @pos: The file's current position (f_pos). * @size: The total size of the file in bytes (i_size). + * @seals: The seals present on the memfd. The seals are uABI so it is safe + * to directly use them in the ABI. + * @flags: Flags for the file. Unused flag bits must be set to 0. * @nr_folios: Number of folios in the folios array. * @folios: KHO vmalloc descriptor pointing to the array of * struct memfd_luo_folio_ser. 
@@ -67,11 +81,13 @@ struct memfd_luo_folio_ser { struct memfd_luo_ser { u64 pos; u64 size; + u32 seals; + u32 flags; u64 nr_folios; struct kho_vmalloc folios; } __packed; /* The compatibility string for memfd file handler */ -#define MEMFD_LUO_FH_COMPATIBLE "memfd-v1" +#define MEMFD_LUO_FH_COMPATIBLE "memfd-v2" #endif /* _LINUX_KHO_ABI_MEMFD_H */ diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c index b8edb9f981d7..bc7f4f045edf 100644 --- a/mm/memfd_luo.c +++ b/mm/memfd_luo.c @@ -79,6 +79,8 @@ #include #include #include +#include + #include "internal.h" static int memfd_luo_preserve_folios(struct file *file, @@ -259,7 +261,7 @@ static int memfd_luo_preserve(struct liveupdate_file_op_args *args) struct memfd_luo_folio_ser *folios_ser; struct memfd_luo_ser *ser; u64 nr_folios; - int err = 0; + int err = 0, seals; inode_lock(inode); shmem_freeze(inode, true); @@ -271,8 +273,21 @@ static int memfd_luo_preserve(struct liveupdate_file_op_args *args) goto err_unlock; } + seals = memfd_get_seals(args->file); + if (seals < 0) { + err = seals; + goto err_free_ser; + } + + /* Make sure the file only has the seals supported by this version. */ + if (seals & ~MEMFD_LUO_ALL_SEALS) { + err = -EOPNOTSUPP; + goto err_free_ser; + } + ser->pos = args->file->f_pos; ser->size = i_size_read(inode); + ser->seals = seals; err = memfd_luo_preserve_folios(args->file, &ser->folios, &folios_ser, &nr_folios); @@ -486,13 +501,29 @@ static int memfd_luo_retrieve(struct liveupdate_file_op_args *args) if (!ser) return -EINVAL; - file = memfd_alloc_file("", 0); + /* Make sure the file only has seals supported by this version. */ + if (ser->seals & ~MEMFD_LUO_ALL_SEALS) { + err = -EOPNOTSUPP; + goto free_ser; + } + + /* + * The seals are preserved. Allow sealing here so they can be added + * later. 
+ */ + file = memfd_alloc_file("", MFD_ALLOW_SEALING); if (IS_ERR(file)) { pr_err("failed to setup file: %pe\n", file); err = PTR_ERR(file); goto free_ser; } + err = memfd_add_seals(file, ser->seals); + if (err) { + pr_err("failed to add seals: %pe\n", ERR_PTR(err)); + goto put_file; + } + vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE); file->f_inode->i_size = ser->size; From c9cb94c6b85a2854ae03c874331b0880ee735441 Mon Sep 17 00:00:00 2001 From: Asier Gutierrez Date: Fri, 13 Feb 2026 14:50:32 +0000 Subject: [PATCH 056/369] mm/damon: remove unused target param of get_scheme_score() damon_target is not used by get_scheme_score operations, nor with virtual neither with physical addresses. Link: https://lkml.kernel.org/r/20260213145032.1740407-1-gutierrez.asier@huawei-partners.com Signed-off-by: Asier Gutierrez Reviewed-by: SeongJae Park Cc: Kefeng Wang Cc: Quanmin Yan Cc: ze zuo Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 +-- mm/damon/core.c | 10 +++++----- mm/damon/paddr.c | 3 +-- mm/damon/vaddr.c | 3 +-- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index be3d198043ff..60e6da3012fa 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -647,8 +647,7 @@ struct damon_operations { void (*prepare_access_checks)(struct damon_ctx *context); unsigned int (*check_accesses)(struct damon_ctx *context); int (*get_scheme_score)(struct damon_ctx *context, - struct damon_target *t, struct damon_region *r, - struct damos *scheme); + struct damon_region *r, struct damos *scheme); unsigned long (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme, unsigned long *sz_filter_passed); diff --git a/mm/damon/core.c b/mm/damon/core.c index 3e1890d64d06..0e5ada441b05 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1689,15 +1689,15 @@ static bool __damos_valid_target(struct damon_region *r, struct damos *s) r->age <= 
s->pattern.max_age_region; } -static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, - struct damon_region *r, struct damos *s) +static bool damos_valid_target(struct damon_ctx *c, struct damon_region *r, + struct damos *s) { bool ret = __damos_valid_target(r, s); if (!ret || !s->quota.esz || !c->ops.get_scheme_score) return ret; - return c->ops.get_scheme_score(c, t, r, s) >= s->quota.min_score; + return c->ops.get_scheme_score(c, r, s) >= s->quota.min_score; } /* @@ -2021,7 +2021,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, s->max_nr_snapshots <= s->stat.nr_snapshots) continue; - if (damos_valid_target(c, t, r, s)) + if (damos_valid_target(c, r, s)) damos_apply_scheme(c, t, r, s); if (damon_is_last_region(r, t)) @@ -2319,7 +2319,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) damon_for_each_region(r, t) { if (!__damos_valid_target(r, s)) continue; - score = c->ops.get_scheme_score(c, t, r, s); + score = c->ops.get_scheme_score(c, r, s); c->regions_score_histogram[score] += damon_sz_region(r); if (score > max_score) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 9bfe48826840..5cdcc5037cbc 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -343,8 +343,7 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, } static int damon_pa_scheme_score(struct damon_ctx *context, - struct damon_target *t, struct damon_region *r, - struct damos *scheme) + struct damon_region *r, struct damos *scheme) { switch (scheme->action) { case DAMOS_PAGEOUT: diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 729b7ffd3565..4d6d8251d419 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -985,8 +985,7 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, } static int damon_va_scheme_score(struct damon_ctx *context, - struct damon_target *t, struct damon_region *r, - struct damos *scheme) + struct damon_region *r, struct damos *scheme) { switch (scheme->action) { From 
37cb8cd043cbcbe5c617340cff1684cf2f68fb58 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 13 Feb 2026 18:03:32 +0800 Subject: [PATCH 057/369] memcg: consolidate private id refcount get/put helpers We currently have two different sets of helpers for getting or putting the private IDs' refcount for order 0 and large folios. This is redundant. Just use one and always acquire the refcount of the swapout folio size unless it's zero, and put the refcount using the folio size if the charge failed, since the folio size can't change. Then there is no need to update the refcount for tail pages. Same for freeing, then only one pair of get/put helper is needed now. The performance might be slightly better, too: both "inc unless zero" and "add unless zero" use the same cmpxchg implementation. For large folios, we saved an atomic operation. And for both order 0 and large folios, we saved a branch. Link: https://lkml.kernel.org/r/20260213-memcg-privid-v1-1-d8cb7afcf831@tencent.com Signed-off-by: Kairui Song Acked-by: Johannes Weiner Reviewed-by: Chen Ridong Acked-by: Shakeel Butt Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol-v1.c | 5 +---- mm/memcontrol-v1.h | 4 ++-- mm/memcontrol.c | 29 +++++++---------------------- 3 files changed, 10 insertions(+), 28 deletions(-) diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 597af8a80163..437cd25784fe 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -635,11 +635,8 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) * have an ID allocated to it anymore, charge the closest online * ancestor for the swap instead and transfer the memory+swap charge. 
*/ - swap_memcg = mem_cgroup_private_id_get_online(memcg); nr_entries = folio_nr_pages(folio); - /* Get references for the tail pages, too */ - if (nr_entries > 1) - mem_cgroup_private_id_get_many(swap_memcg, nr_entries - 1); + swap_memcg = mem_cgroup_private_id_get_online(memcg, nr_entries); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), entry); diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h index eb3c3c105657..1b969294ea6a 100644 --- a/mm/memcontrol-v1.h +++ b/mm/memcontrol-v1.h @@ -27,8 +27,8 @@ void drain_all_stock(struct mem_cgroup *root_memcg); unsigned long memcg_events(struct mem_cgroup *memcg, int event); int memory_stat_show(struct seq_file *m, void *v); -void mem_cgroup_private_id_get_many(struct mem_cgroup *memcg, unsigned int n); -struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg); +struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg, + unsigned int n); /* Cgroup v1-specific declarations */ #ifdef CONFIG_MEMCG_V1 diff --git a/mm/memcontrol.c b/mm/memcontrol.c index af75f10150a8..823ac6a05bf3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3634,13 +3634,7 @@ static void mem_cgroup_private_id_remove(struct mem_cgroup *memcg) } } -void __maybe_unused mem_cgroup_private_id_get_many(struct mem_cgroup *memcg, - unsigned int n) -{ - refcount_add(n, &memcg->id.ref); -} - -static void mem_cgroup_private_id_put_many(struct mem_cgroup *memcg, unsigned int n) +static inline void mem_cgroup_private_id_put(struct mem_cgroup *memcg, unsigned int n) { if (refcount_sub_and_test(n, &memcg->id.ref)) { mem_cgroup_private_id_remove(memcg); @@ -3650,14 +3644,9 @@ static void mem_cgroup_private_id_put_many(struct mem_cgroup *memcg, unsigned in } } -static inline void mem_cgroup_private_id_put(struct mem_cgroup *memcg) +struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg, unsigned int n) { - 
mem_cgroup_private_id_put_many(memcg, 1); -} - -struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg) -{ - while (!refcount_inc_not_zero(&memcg->id.ref)) { + while (!refcount_add_not_zero(n, &memcg->id.ref)) { /* * The root cgroup cannot be destroyed, so it's refcount must * always be >= 1. @@ -3957,7 +3946,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) drain_all_stock(memcg); - mem_cgroup_private_id_put(memcg); + mem_cgroup_private_id_put(memcg, 1); } static void mem_cgroup_css_released(struct cgroup_subsys_state *css) @@ -5247,19 +5236,15 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) return 0; } - memcg = mem_cgroup_private_id_get_online(memcg); + memcg = mem_cgroup_private_id_get_online(memcg, nr_pages); if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); - mem_cgroup_private_id_put(memcg); + mem_cgroup_private_id_put(memcg, nr_pages); return -ENOMEM; } - - /* Get references for the tail pages, too */ - if (nr_pages > 1) - mem_cgroup_private_id_get_many(memcg, nr_pages - 1); mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); swap_cgroup_record(folio, mem_cgroup_private_id(memcg), entry); @@ -5288,7 +5273,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) page_counter_uncharge(&memcg->swap, nr_pages); } mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); - mem_cgroup_private_id_put_many(memcg, nr_pages); + mem_cgroup_private_id_put(memcg, nr_pages); } rcu_read_unlock(); } From 5ad41a38c36474ff59545cb514801d90719555de Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Fri, 13 Feb 2026 15:18:22 +0800 Subject: [PATCH 058/369] mm: zswap: add per-memcg stat for incompressible pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: zswap: add per-memcg stat for incompressible pages", v3. 
In containerized environments, knowing which cgroup is contributing incompressible pages to zswap is essential for effective resource management. This series adds a new per-memcg stat 'zswap_incomp' to track incompressible pages, along with a selftest. This patch (of 2): The global zswap_stored_incompressible_pages counter was added in commit dca4437a5861 ("mm/zswap: store Acked-by: Nhat Pham Acked-by: Shakeel Butt Reviewed-by: Yosry Ahmed Reviewed-by: SeongJae Park Cc: Johannes Weiner Cc: Chengming Zhou Cc: Jonathan Corbet Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shuah Khan Cc: Tejun Heo Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 5 +++++ include/linux/memcontrol.h | 1 + mm/memcontrol.c | 6 ++++++ 3 files changed, 12 insertions(+) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 91beaa6798ce..8ad0b2781317 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1734,6 +1734,11 @@ The following nested keys are defined. zswpwb Number of pages written from zswap to swap. + zswap_incomp + Number of incompressible pages currently stored in zswap + without compression. These pages could not be compressed to + a size smaller than PAGE_SIZE, so they are stored as-is. + thp_fault_alloc (npn) Number of transparent hugepages which were allocated to satisfy a page fault. 
This counter is not present when CONFIG_TRANSPARENT_HUGEPAGE diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 70b685a85bf4..5695776f32c8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -39,6 +39,7 @@ enum memcg_stat_item { MEMCG_KMEM, MEMCG_ZSWAP_B, MEMCG_ZSWAPPED, + MEMCG_ZSWAP_INCOMP, MEMCG_NR_STAT, }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 823ac6a05bf3..75df24ffdf25 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -356,6 +356,7 @@ static const unsigned int memcg_stat_items[] = { MEMCG_KMEM, MEMCG_ZSWAP_B, MEMCG_ZSWAPPED, + MEMCG_ZSWAP_INCOMP, }; #define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items) @@ -1368,6 +1369,7 @@ static const struct memory_stat memory_stats[] = { #ifdef CONFIG_ZSWAP { "zswap", MEMCG_ZSWAP_B }, { "zswapped", MEMCG_ZSWAPPED }, + { "zswap_incomp", MEMCG_ZSWAP_INCOMP }, #endif { "file_mapped", NR_FILE_MAPPED }, { "file_dirty", NR_FILE_DIRTY }, @@ -5520,6 +5522,8 @@ void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) memcg = obj_cgroup_memcg(objcg); mod_memcg_state(memcg, MEMCG_ZSWAP_B, size); mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1); + if (size == PAGE_SIZE) + mod_memcg_state(memcg, MEMCG_ZSWAP_INCOMP, 1); rcu_read_unlock(); } @@ -5543,6 +5547,8 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) memcg = obj_cgroup_memcg(objcg); mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size); mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1); + if (size == PAGE_SIZE) + mod_memcg_state(memcg, MEMCG_ZSWAP_INCOMP, -1); rcu_read_unlock(); } From 4e89004eebc559595e58d75fd7f7f8ecd5aa600d Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Fri, 13 Feb 2026 15:18:23 +0800 Subject: [PATCH 059/369] selftests/cgroup: add test for zswap incompressible pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add test_zswap_incompressible() to verify that the zswap_incomp memcg stat correctly tracks incompressible pages. 
The test allocates memory filled with random data from /dev/urandom, which cannot be effectively compressed by zswap. When this data is swapped out to zswap, it should be stored as-is and tracked by the zswap_incomp counter. The test verifies that: 1. Pages are swapped out to zswap (zswpout increases) 2. Incompressible pages are tracked (zswap_incomp increases) test: dd if=/dev/zero of=/swapfile bs=1M count=2048 chmod 600 /swapfile mkswap /swapfile swapon /swapfile echo Y > /sys/module/zswap/parameters/enabled ./test_zswap TAP version 13 1..8 ok 1 test_zswap_usage ok 2 test_swapin_nozswap ok 3 test_zswapin ok 4 test_zswap_writeback_enabled ok 5 test_zswap_writeback_disabled ok 6 test_no_kmem_bypass ok 7 test_no_invasive_cgroup_shrink ok 8 test_zswap_incompressible Totals: pass:8 fail:0 xfail:0 xpass:0 skip:0 error:0 Link: https://lkml.kernel.org/r/20260213071827.5688-3-jiayuan.chen@linux.dev Signed-off-by: Jiayuan Chen Acked-by: Shakeel Butt Acked-by: Nhat Pham Reviewed-by: SeongJae Park Cc: Chengming Zhou Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shuah Khan Cc: Tejun Heo Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 136 ++++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 64ebc3f3f203..a7bdcdd09d62 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include #include @@ -574,6 +576,139 @@ out: return ret; } +struct incomp_child_args { + size_t size; + int pipefd[2]; + int madvise_ret; + int madvise_errno; +}; + +static int allocate_random_and_wait(const char *cgroup, void *arg) +{ + struct incomp_child_args *values = arg; + size_t size = values->size; + char *mem; + int fd; + ssize_t n; + + 
close(values->pipefd[0]); + + mem = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) + return -1; + + /* Fill with random data from /dev/urandom - incompressible */ + fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) { + munmap(mem, size); + return -1; + } + + for (size_t i = 0; i < size; ) { + n = read(fd, mem + i, size - i); + if (n <= 0) + break; + i += n; + } + close(fd); + + /* Touch all pages to ensure they're faulted in */ + for (size_t i = 0; i < size; i += PAGE_SIZE) + mem[i] = mem[i]; + + /* Use MADV_PAGEOUT to push pages into zswap */ + values->madvise_ret = madvise(mem, size, MADV_PAGEOUT); + values->madvise_errno = errno; + + /* Notify parent that allocation and pageout are done */ + write(values->pipefd[1], "x", 1); + close(values->pipefd[1]); + + /* Keep memory alive for parent to check stats */ + pause(); + munmap(mem, size); + return 0; +} + +static long get_zswap_incomp(const char *cgroup) +{ + return cg_read_key_long(cgroup, "memory.stat", "zswap_incomp "); +} + +/* + * Test that incompressible pages (random data) are tracked by zswap_incomp. + * + * The child process allocates random data within memory.max, then uses + * MADV_PAGEOUT to push pages into zswap. The parent waits on a pipe for + * the child to finish, then checks the zswap_incomp stat before the child + * exits (zswap_incomp is a gauge that decreases on free). 
+ */ +static int test_zswap_incompressible(const char *root) +{ + int ret = KSFT_FAIL; + struct incomp_child_args *values; + char *test_group; + long zswap_incomp; + pid_t child_pid; + int child_status; + char buf; + + values = mmap(0, sizeof(struct incomp_child_args), PROT_READ | + PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (values == MAP_FAILED) + return KSFT_FAIL; + + if (pipe(values->pipefd)) { + munmap(values, sizeof(struct incomp_child_args)); + return KSFT_FAIL; + } + + test_group = cg_name(root, "zswap_incompressible_test"); + if (!test_group) + goto out; + if (cg_create(test_group)) + goto out; + if (cg_write(test_group, "memory.max", "32M")) + goto out; + + values->size = MB(4); + child_pid = cg_run_nowait(test_group, allocate_random_and_wait, values); + if (child_pid < 0) + goto out; + + close(values->pipefd[1]); + + /* Wait for child to finish allocating and pageout */ + read(values->pipefd[0], &buf, 1); + close(values->pipefd[0]); + + zswap_incomp = get_zswap_incomp(test_group); + if (zswap_incomp <= 0) { + long zswpout = get_zswpout(test_group); + long zswapped = cg_read_key_long(test_group, "memory.stat", "zswapped "); + long zswap_b = cg_read_key_long(test_group, "memory.stat", "zswap "); + + ksft_print_msg("zswap_incomp not increased: %ld\n", zswap_incomp); + ksft_print_msg("debug: zswpout=%ld zswapped=%ld zswap_b=%ld\n", + zswpout, zswapped, zswap_b); + ksft_print_msg("debug: madvise ret=%d errno=%d\n", + values->madvise_ret, values->madvise_errno); + goto out_kill; + } + + ret = KSFT_PASS; + +out_kill: + kill(child_pid, SIGTERM); + waitpid(child_pid, &child_status, 0); +out: + cg_destroy(test_group); + free(test_group); + munmap(values, sizeof(struct incomp_child_args)); + return ret; +} + #define T(x) { x, #x } struct zswap_test { int (*fn)(const char *root); @@ -586,6 +721,7 @@ struct zswap_test { T(test_zswap_writeback_disabled), T(test_no_kmem_bypass), T(test_no_invasive_cgroup_shrink), + T(test_zswap_incompressible), }; #undef T 
From c5c48345135ff04e039377020df23294d59aa59a Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Wed, 11 Feb 2026 16:54:47 -0500 Subject: [PATCH 060/369] mm: name the anonymous MMOP enum as enum mmop Give the MMOP enum (MMOP_OFFLINE, MMOP_ONLINE, etc) a proper type name so the compiler can help catch invalid values being assigned to variables of this type. Leave the existing functions returning int alone to allow for value-or-error pattern to remain unchanged without churn. mmop_default_online_type is left as int because it uses the -1 sentinal value to signal it hasn't been initialized yet. Keep the uint8_t buffer in offline_and_remove_memory() as-is for space efficiency, with an explicit cast when we consume the value. Move the enum definition before the CONFIG_MEMORY_HOTPLUG guard so it is unconditionally available for struct memory_block in memory.h. No functional change. Link: https://lore.kernel.org/linux-mm/3424eba7-523b-4351-abd0-3a888a3e5e61@kernel.org/ Link: https://lkml.kernel.org/r/20260211215447.2194189-1-gourry@gourry.net Signed-off-by: Gregory Price Suggested-by: Jonathan Cameron Suggested-by: "David Hildenbrand (arm)" Reviewed-by: Ben Cheatham Acked-by: David Hildenbrand (Arm) Reviewed-by: Dave Jiang Reviewed-by: Davidlohr Bueso Reviewed-by: Jonathan Cameron Cc: Danilo Krummrich Cc: Greg Kroah-Hartman Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/base/memory.c | 2 +- include/linux/memory.h | 3 ++- include/linux/memory_hotplug.h | 16 ++++++++-------- mm/memory_hotplug.c | 10 +++++----- 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index a3091924918b..5380050b16b7 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -452,7 +452,7 @@ static ssize_t phys_device_show(struct device *dev, static int print_allowed_zone(char *buf, int len, int nid, struct 
memory_group *group, unsigned long start_pfn, unsigned long nr_pages, - int online_type, struct zone *default_zone) + enum mmop online_type, struct zone *default_zone) { struct zone *zone; diff --git a/include/linux/memory.h b/include/linux/memory.h index faeaa921e55b..5bb5599c6b2b 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -19,6 +19,7 @@ #include #include #include +#include #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS) @@ -77,7 +78,7 @@ enum memory_block_state { struct memory_block { unsigned long start_section_nr; enum memory_block_state state; /* serialized by the dev->lock */ - int online_type; /* for passing data to online routine */ + enum mmop online_type; /* for passing data to online routine */ int nid; /* NID for this memory block */ /* * The single zone of this memory block if all PFNs of this memory block diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index f2f16cdd73ee..e77ef3d7ff73 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -16,11 +16,8 @@ struct resource; struct vmem_altmap; struct dev_pagemap; -#ifdef CONFIG_MEMORY_HOTPLUG -struct page *pfn_to_online_page(unsigned long pfn); - /* Types for control the zone type of onlined and offlined memory */ -enum { +enum mmop { /* Offline the memory. */ MMOP_OFFLINE = 0, /* Online the memory. Zone depends, see default_zone_for_pfn(). */ @@ -31,6 +28,9 @@ enum { MMOP_ONLINE_MOVABLE, }; +#ifdef CONFIG_MEMORY_HOTPLUG +struct page *pfn_to_online_page(unsigned long pfn); + /* Flags for add_memory() and friends to specify memory hotplug details. */ typedef int __bitwise mhp_t; @@ -286,8 +286,8 @@ static inline void __remove_memory(u64 start, u64 size) {} #ifdef CONFIG_MEMORY_HOTPLUG /* Default online_type (MMOP_*) when new memory blocks are added. 
*/ -extern int mhp_get_default_online_type(void); -extern void mhp_set_default_online_type(int online_type); +extern enum mmop mhp_get_default_online_type(void); +extern void mhp_set_default_online_type(enum mmop online_type); extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat); extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); @@ -310,8 +310,8 @@ extern void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern struct zone *zone_for_pfn_range(int online_type, int nid, - struct memory_group *group, unsigned long start_pfn, +extern struct zone *zone_for_pfn_range(enum mmop online_type, + int nid, struct memory_group *group, unsigned long start_pfn, unsigned long nr_pages); extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index bc805029da51..a602310bdf33 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -221,7 +221,7 @@ void put_online_mems(void) bool movable_node_enabled = false; static int mhp_default_online_type = -1; -int mhp_get_default_online_type(void) +enum mmop mhp_get_default_online_type(void) { if (mhp_default_online_type >= 0) return mhp_default_online_type; @@ -240,7 +240,7 @@ int mhp_get_default_online_type(void) return mhp_default_online_type; } -void mhp_set_default_online_type(int online_type) +void mhp_set_default_online_type(enum mmop online_type) { mhp_default_online_type = online_type; } @@ -1046,7 +1046,7 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn return movable_node_enabled ? 
movable_zone : kernel_zone; } -struct zone *zone_for_pfn_range(int online_type, int nid, +struct zone *zone_for_pfn_range(enum mmop online_type, int nid, struct memory_group *group, unsigned long start_pfn, unsigned long nr_pages) { @@ -2305,7 +2305,7 @@ EXPORT_SYMBOL_GPL(remove_memory); static int try_offline_memory_block(struct memory_block *mem, void *arg) { - uint8_t online_type = MMOP_ONLINE_KERNEL; + enum mmop online_type = MMOP_ONLINE_KERNEL; uint8_t **online_types = arg; struct page *page; int rc; @@ -2338,7 +2338,7 @@ static int try_reonline_memory_block(struct memory_block *mem, void *arg) int rc; if (**online_types != MMOP_OFFLINE) { - mem->online_type = **online_types; + mem->online_type = (enum mmop)**online_types; rc = device_online(&mem->dev); if (rc < 0) pr_warn("%s: Failed to re-online memory: %d", From 7498bddab9455b3750b655c27cdcc0a2c48af318 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 11 Feb 2026 22:33:23 +0800 Subject: [PATCH 061/369] mm/shmem: remove unnecessary restrain unmask of swap gfp flags The comment makes it look like copy-paste leftovers from shmem_replace_folio. The first try of the swap doesn't always have a limited zone. So don't drop the restraint, which should make the GFP more accurate. 
Link: https://lkml.kernel.org/r/20260211-shmem-swap-gfp-v1-1-e9781099a861@tencent.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/shmem.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index b40f3cd48961..cfed6c3ff853 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2044,14 +2044,8 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, struct shmem_inode_info *info = SHMEM_I(inode); struct folio *new, *swapcache; int nr_pages = 1 << order; - gfp_t alloc_gfp; + gfp_t alloc_gfp = gfp; - /* - * We have arrived here because our zones are constrained, so don't - * limit chance of success with further cpuset and node constraints. - */ - gfp &= ~GFP_CONSTRAINT_MASK; - alloc_gfp = gfp; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (WARN_ON_ONCE(order)) return ERR_PTR(-EINVAL); From 652d12bc74a075f345f228f8945e05517a38874d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 11 Feb 2026 12:31:38 +0200 Subject: [PATCH 062/369] mm: don't special case !MMU for is_zero_pfn() and my_zero_pfn() Patch series "arch, mm: consolidate empty_zero_page", v3. These patches cleanup handling of ZERO_PAGE() and zero_pfn. This patch (of 4): nommu architectures have empty_zero_page and define ZERO_PAGE() and although they don't really use it to populate page tables, there is no reason to hardwire !MMU implementation of is_zero_pfn() and my_zero_pfn() to 0. Drop #ifdef CONFIG_MMU around implementations of is_zero_pfn() and my_zero_pfn() and remove !MMU version. While on it, make zero_pfn __ro_after_init. Link: https://lkml.kernel.org/r/20260211103141.3215197-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20260211103141.3215197-2-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand (Arm) Acked-by: Liam R. Howlett Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Cc: Christophe Leroy (CS GROUP) Cc: Dave Hansen Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 14 +------------- mm/memory.c | 13 ------------- mm/mm_init.c | 10 ++++++++++ 3 files changed, 11 insertions(+), 26 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index a50df42a893f..5e772599d9a5 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1917,7 +1917,6 @@ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot); } -#ifdef CONFIG_MMU #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { @@ -1940,18 +1939,7 @@ static inline unsigned long my_zero_pfn(unsigned long addr) extern unsigned long zero_pfn; return zero_pfn; } -#endif -#else -static inline int is_zero_pfn(unsigned long pfn) -{ - return 0; -} - -static inline unsigned long my_zero_pfn(unsigned long addr) -{ - return 0; -} -#endif /* CONFIG_MMU */ +#endif /* __HAVE_COLOR_ZERO_PAGE */ #ifdef CONFIG_MMU diff --git a/mm/memory.c b/mm/memory.c index 7084c426f933..6b504fc5e815 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -162,21 +162,8 @@ static int __init disable_randmaps(char *s) } __setup("norandmaps", disable_randmaps); -unsigned long zero_pfn __read_mostly; -EXPORT_SYMBOL(zero_pfn); - unsigned long highest_memmap_pfn __read_mostly; -/* - * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() - */ -static int __init init_zero_pfn(void) -{ - zero_pfn = page_to_pfn(ZERO_PAGE(0)); - return 0; 
-} -early_initcall(init_zero_pfn); - void mm_trace_rss_stat(struct mm_struct *mm, int member) { trace_rss_stat(mm, member); diff --git a/mm/mm_init.c b/mm/mm_init.c index df34797691bd..f3755a66b9d0 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -53,6 +53,9 @@ EXPORT_SYMBOL(mem_map); void *high_memory; EXPORT_SYMBOL(high_memory); +unsigned long zero_pfn __ro_after_init; +EXPORT_SYMBOL(zero_pfn); + #ifdef CONFIG_DEBUG_MEMORY_INIT int __meminitdata mminit_loglevel; @@ -2672,6 +2675,13 @@ static void __init mem_init_print_info(void) ); } +static int __init init_zero_pfn(void) +{ + zero_pfn = page_to_pfn(ZERO_PAGE(0)); + return 0; +} +early_initcall(init_zero_pfn); + void __init __weak arch_mm_preinit(void) { } From 9a1d0c738b45ea8da4e6897099c708e89f43daad Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 11 Feb 2026 12:31:39 +0200 Subject: [PATCH 063/369] mm: rename my_zero_pfn() to zero_pfn() my_zero_pfn() is a silly name. Rename zero_pfn variable to zero_page_pfn and my_zero_pfn() function to zero_pfn(). While on it, move extern declarations of zero_page_pfn outside the functions that use it and add a comment about what ZERO_PAGE is. Link: https://lkml.kernel.org/r/20260211103141.3215197-3-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand (Arm) Acked-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Christophe Leroy (CS GROUP) Cc: Dave Hansen Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/kvm/mmu/spte.h | 2 +- fs/dax.c | 2 +- fs/proc/vmcore.c | 2 +- include/linux/pgtable.h | 28 ++++++++++++++++++++-------- mm/huge_memory.c | 2 +- mm/memory.c | 2 +- mm/migrate.c | 2 +- mm/mm_init.c | 10 +++++----- mm/userfaultfd.c | 4 ++-- 9 files changed, 33 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 91ce29fd6f1b..8c0ffa2cded6 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -248,7 +248,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; static inline hpa_t kvm_mmu_get_dummy_root(void) { - return my_zero_pfn(0) << PAGE_SHIFT; + return zero_pfn(0) << PAGE_SHIFT; } static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page) diff --git a/fs/dax.c b/fs/dax.c index 289e6254aa30..b78cff9c91b3 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1360,7 +1360,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf, { struct inode *inode = iter->inode; unsigned long vaddr = vmf->address; - unsigned long pfn = my_zero_pfn(vaddr); + unsigned long pfn = zero_pfn(vaddr); vm_fault_t ret; *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index f188bd900eb2..44d15436439f 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -525,7 +525,7 @@ static int remap_oldmem_pfn_checked(struct vm_area_struct *vma, { unsigned long map_size; unsigned long pos_start, pos_end, pos; - unsigned long zeropage_pfn = my_zero_pfn(0); + unsigned long 
zeropage_pfn = zero_pfn(0); size_t len = 0; pos_start = pfn; diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5e772599d9a5..c3a56f6b1ea5 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1917,27 +1917,39 @@ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot); } +/* + * ZERO_PAGE() is global shared page(s) that is always zero. It is used for + * zero-mapped memory areas, CoW etc. + * + * On architectures that __HAVE_COLOR_ZERO_PAGE there are several such pages + * for different ranges in the virtual address space. + * + * zero_page_pfn identifies the first (or the only) pfn for these pages. + */ #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { - extern unsigned long zero_pfn; - unsigned long offset_from_zero_pfn = pfn - zero_pfn; + extern unsigned long zero_page_pfn; + unsigned long offset_from_zero_pfn = pfn - zero_page_pfn; + return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); } -#define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) +#define zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) #else static inline int is_zero_pfn(unsigned long pfn) { - extern unsigned long zero_pfn; - return pfn == zero_pfn; + extern unsigned long zero_page_pfn; + + return pfn == zero_page_pfn; } -static inline unsigned long my_zero_pfn(unsigned long addr) +static inline unsigned long zero_pfn(unsigned long addr) { - extern unsigned long zero_pfn; - return zero_pfn; + extern unsigned long zero_page_pfn; + + return zero_page_pfn; } #endif /* __HAVE_COLOR_ZERO_PAGE */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b298cba853ab..a132fb98ed5d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2972,7 +2972,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { pte_t entry; - entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot); + entry = 
pfn_pte(zero_pfn(addr), vma->vm_page_prot); entry = pte_mkspecial(entry); if (pmd_uffd_wp(old_pmd)) entry = pte_mkuffd_wp(entry); diff --git a/mm/memory.c b/mm/memory.c index 6b504fc5e815..af26a697562b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5224,7 +5224,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { - entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), + entry = pte_mkspecial(pfn_pte(zero_pfn(vmf->address), vma->vm_page_prot)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); diff --git a/mm/migrate.c b/mm/migrate.c index 2c3d489ecf51..6cc654858da6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -321,7 +321,7 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw, if (!pages_identical(page, ZERO_PAGE(0))) return false; - newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address), + newpte = pte_mkspecial(pfn_pte(zero_pfn(pvmw->address), pvmw->vma->vm_page_prot)); if (pte_swp_soft_dirty(old_pte)) diff --git a/mm/mm_init.c b/mm/mm_init.c index f3755a66b9d0..ab6578516dd6 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -53,8 +53,8 @@ EXPORT_SYMBOL(mem_map); void *high_memory; EXPORT_SYMBOL(high_memory); -unsigned long zero_pfn __ro_after_init; -EXPORT_SYMBOL(zero_pfn); +unsigned long zero_page_pfn __ro_after_init; +EXPORT_SYMBOL(zero_page_pfn); #ifdef CONFIG_DEBUG_MEMORY_INIT int __meminitdata mminit_loglevel; @@ -2675,12 +2675,12 @@ static void __init mem_init_print_info(void) ); } -static int __init init_zero_pfn(void) +static int __init init_zero_page_pfn(void) { - zero_pfn = page_to_pfn(ZERO_PAGE(0)); + zero_page_pfn = page_to_pfn(ZERO_PAGE(0)); return 0; } -early_initcall(init_zero_pfn); +early_initcall(init_zero_page_pfn); void __init __weak arch_mm_preinit(void) { diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 927086bb4a3c..e19872e51878 100644 --- a/mm/userfaultfd.c 
+++ b/mm/userfaultfd.c @@ -357,7 +357,7 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, if (mm_forbids_zeropage(dst_vma->vm_mm)) return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr); - _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), + _dst_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), dst_vma->vm_page_prot)); ret = -EAGAIN; dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl); @@ -1229,7 +1229,7 @@ static int move_zeropage_pte(struct mm_struct *mm, return -EAGAIN; } - zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), + zero_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), dst_vma->vm_page_prot)); ptep_clear_flush(src_vma, src_addr, src_pte); set_pte_at(mm, dst_addr, dst_pte, zero_pte); From 6215d9f4470fbb48245ffdfade821685e2728c65 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 11 Feb 2026 12:31:40 +0200 Subject: [PATCH 064/369] arch, mm: consolidate empty_zero_page Reduce 22 declarations of empty_zero_page to 3 and 23 declarations of ZERO_PAGE() to 4. Every architecture defines empty_zero_page that way or another, but for the most of them it is always a page aligned page in BSS and most definitions of ZERO_PAGE do virt_to_page(empty_zero_page). Move Linus vetted x86 definition of empty_zero_page and ZERO_PAGE() to the core MM and drop these definitions in architectures that do not implement colored zero page (MIPS and s390). ZERO_PAGE() remains a macro because turning it to a wrapper for a static inline causes severe pain in header dependencies. For the most part the change is mechanical, with these being noteworthy: * alpha: aliased empty_zero_page with ZERO_PGE that was also used for boot parameters. Switching to a generic empty_zero_page removes the aliasing and keeps ZERO_PGE for boot parameters only * arm64: uses __pa_symbol() in ZERO_PAGE() so that definition of ZERO_PAGE() is kept intact. 
* m68k/parisc/um: allocated empty_zero_page from memblock, although they do not support zero page coloring and having it in BSS will work fine. * sparc64 can have empty_zero_page in BSS rather than allocate it, but it can't use virt_to_page() for BSS. Keep its definition of ZERO_PAGE() but instead of allocating it, make mem_map_zero point to empty_zero_page. * sh: used empty_zero_page for boot parameters at the very early boot. Rename the parameters page to boot_params_page and let sh use the generic empty_zero_page. * hexagon: had an amusing comment about empty_zero_page /* A handy thing to have if one has the RAM. Declared in head.S */ that unfortunately had to go :) Link: https://lkml.kernel.org/r/20260211103141.3215197-4-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Helge Deller [parisc] Tested-by: Helge Deller [parisc] Reviewed-by: Christophe Leroy (CS GROUP) Acked-by: Dave Hansen Acked-by: Catalin Marinas Acked-by: Magnus Lindholm [alpha] Acked-by: Dinh Nguyen [nios2] Acked-by: Andreas Larsson [sparc] Acked-by: David Hildenbrand (Arm) Acked-by: Liam R. Howlett Cc: "Borislav Petkov (AMD)" Cc: David S. 
Miller Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/alpha/include/asm/pgtable.h | 6 ------ arch/arc/include/asm/pgtable.h | 3 --- arch/arc/mm/init.c | 2 -- arch/arm/include/asm/pgtable.h | 9 --------- arch/arm/mm/mmu.c | 7 ------- arch/arm/mm/nommu.c | 7 ------- arch/arm64/include/asm/pgtable.h | 1 - arch/arm64/mm/mmu.c | 7 ------- arch/csky/include/asm/pgtable.h | 3 --- arch/csky/mm/init.c | 3 --- arch/hexagon/include/asm/pgtable.h | 6 ------ arch/hexagon/kernel/head.S | 5 ----- arch/hexagon/kernel/hexagon_ksyms.c | 1 - arch/loongarch/include/asm/pgtable.h | 9 --------- arch/loongarch/mm/init.c | 3 --- arch/m68k/include/asm/pgtable_mm.h | 9 --------- arch/m68k/include/asm/pgtable_no.h | 7 ------- arch/m68k/mm/init.c | 9 --------- arch/m68k/mm/mcfmmu.c | 2 -- arch/m68k/mm/motorola.c | 6 ------ arch/m68k/mm/sun3mmu.c | 2 -- arch/microblaze/include/asm/pgtable.h | 10 ---------- arch/microblaze/kernel/head.S | 4 ---- arch/microblaze/kernel/microblaze_ksyms.c | 2 -- arch/nios2/include/asm/pgtable.h | 7 ------- arch/nios2/kernel/head.S | 10 ---------- arch/nios2/kernel/nios2_ksyms.c | 1 - arch/openrisc/include/asm/pgtable.h | 4 ---- arch/openrisc/kernel/head.S | 3 --- arch/openrisc/kernel/or32_ksyms.c | 1 - arch/openrisc/mm/init.c | 3 --- arch/parisc/include/asm/pgtable.h | 11 ----------- arch/parisc/mm/init.c | 6 ------ arch/powerpc/include/asm/pgtable.h | 6 ------ arch/powerpc/mm/mem.c | 3 --- arch/riscv/include/asm/pgtable.h | 7 ------- arch/riscv/mm/init.c | 4 ---- arch/sh/include/asm/pgtable.h | 8 -------- arch/sh/include/asm/setup.h | 3 ++- arch/sh/kernel/head_32.S | 4 ++-- 
arch/sh/kernel/sh_ksyms_32.c | 1 - arch/sh/mm/init.c | 1 - arch/sparc/include/asm/pgtable_32.h | 8 -------- arch/sparc/include/asm/setup.h | 2 -- arch/sparc/kernel/head_32.S | 7 ------- arch/sparc/mm/init_32.c | 4 ---- arch/sparc/mm/init_64.c | 11 ++++------- arch/um/include/asm/pgtable.h | 9 --------- arch/um/include/shared/kern_util.h | 1 - arch/um/kernel/mem.c | 16 ---------------- arch/um/kernel/um_arch.c | 1 - arch/x86/include/asm/pgtable.h | 8 -------- arch/x86/kernel/head_32.S | 4 ---- arch/x86/kernel/head_64.S | 7 ------- arch/xtensa/include/asm/pgtable.h | 4 ---- arch/xtensa/kernel/head.S | 3 --- arch/xtensa/kernel/xtensa_ksyms.c | 2 -- include/linux/pgtable.h | 10 ++++++++++ mm/mm_init.c | 5 +++++ 59 files changed, 23 insertions(+), 285 deletions(-) diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index e9368c54be45..268ddde33617 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -126,12 +126,6 @@ struct vm_area_struct; */ #define pgprot_noncached(prot) (prot) -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. 
- */ -#define ZERO_PAGE(vaddr) (virt_to_page(ZERO_PGE)) - /* * On certain platforms whose physical address space can overlap KSEG, * namely EV6 and above, we must re-twiddle the physaddr to restore the diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index bd580e2b62d7..0fdaea81b5fa 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -21,9 +21,6 @@ #ifndef __ASSEMBLER__ -extern char empty_zero_page[PAGE_SIZE]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - extern pgd_t swapper_pg_dir[] __aligned(PAGE_SIZE); /* to cope with aliasing VIPT cache */ diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index a5e92f46e5d1..d6b5c27a0098 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -19,8 +19,6 @@ #include pgd_t swapper_pg_dir[PTRS_PER_PGD] __aligned(PAGE_SIZE); -char empty_zero_page[PAGE_SIZE] __aligned(PAGE_SIZE); -EXPORT_SYMBOL(empty_zero_page); static const unsigned long low_mem_start = CONFIG_LINUX_RAM_BASE; static unsigned long low_mem_sz; diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index 6fa9acd6a7f5..982795cf4563 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -10,15 +10,6 @@ #include #include -#ifndef __ASSEMBLY__ -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) -#endif - #include #ifndef CONFIG_MMU diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 518def8314e7..23b87b5ef7f1 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -41,13 +41,6 @@ extern unsigned long __atags_pointer; -/* - * empty_zero_page is a special page that is used for - * zero-initialized data and COW. 
- */ -unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; -EXPORT_SYMBOL(empty_zero_page); - /* * The pmd table for the upper-most set of pages. */ diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index 7e42d8accec6..040ea43cce32 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -27,13 +27,6 @@ unsigned long vectors_base; -/* - * empty_zero_page is a special page that is used for - * zero-initialized data and COW. - */ -unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; -EXPORT_SYMBOL(empty_zero_page); - #ifdef CONFIG_ARM_MPU struct mpu_rgn_info mpu_rgn_info; #endif diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index b3e58735c49b..769570e43c18 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -110,7 +110,6 @@ static inline void arch_leave_lazy_mmu_mode(void) * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define ZERO_PAGE(vaddr) phys_to_page(__pa_symbol(empty_zero_page)) #define pte_ERROR(e) \ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a12ea8776c32..ec932f6ccddc 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -64,13 +64,6 @@ static bool rodata_is_rw __ro_after_init = true; */ long __section(".mmuoff.data.write") __early_cpu_boot_status; -/* - * Empty_zero_page is a special page that is used for zero-initialized data - * and COW. 
- */ -unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; -EXPORT_SYMBOL(empty_zero_page); - static DEFINE_SPINLOCK(swapper_pgdir_lock); static DEFINE_MUTEX(fixmap_lock); diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index d606afbabce1..bafcd5823531 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -76,9 +76,6 @@ #define MAX_SWAPFILES_CHECK() \ BUILD_BUG_ON(MAX_SWAPFILES_SHIFT != 5) -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - extern void load_pgd(unsigned long pg_dir); extern pte_t invalid_pte_table[PTRS_PER_PTE]; diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c index 573da66b2543..fa16015ea1c0 100644 --- a/arch/csky/mm/init.c +++ b/arch/csky/mm/init.c @@ -38,9 +38,6 @@ pte_t invalid_pte_table[PTRS_PER_PTE] __page_aligned_bss; pte_t kernel_pte_tables[PTRS_KERN_TABLE] __page_aligned_bss; EXPORT_SYMBOL(invalid_pte_table); -unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] - __page_aligned_bss; -EXPORT_SYMBOL(empty_zero_page); void free_initmem(void) { diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index fbf24d1d1ca6..27b269e2870d 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -14,9 +14,6 @@ #include #include -/* A handy thing to have if one has the RAM. Declared in head.S */ -extern unsigned long empty_zero_page; - /* * The PTE model described here is that of the Hexagon Virtual Machine, * which autonomously walks 2-level page tables. At a lower level, we @@ -348,9 +345,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) return (unsigned long)__va(pmd_val(pmd) & PAGE_MASK); } -/* ZERO_PAGE - returns the globally shared zero page */ -#define ZERO_PAGE(vaddr) (virt_to_page(&empty_zero_page)) - /* * Encode/decode swap entries and swap PTEs. 
Swap PTEs are all PTEs that * are !pte_none() && !pte_present(). diff --git a/arch/hexagon/kernel/head.S b/arch/hexagon/kernel/head.S index 0b016308cc79..908ffece9132 100644 --- a/arch/hexagon/kernel/head.S +++ b/arch/hexagon/kernel/head.S @@ -216,8 +216,3 @@ __head_s_vaddr_target: .p2align PAGE_SHIFT ENTRY(external_cmdline_buffer) .fill _PAGE_SIZE,1,0 - -.data -.p2align PAGE_SHIFT -ENTRY(empty_zero_page) - .fill _PAGE_SIZE,1,0 diff --git a/arch/hexagon/kernel/hexagon_ksyms.c b/arch/hexagon/kernel/hexagon_ksyms.c index 36a80e31d187..81bc6f81e200 100644 --- a/arch/hexagon/kernel/hexagon_ksyms.c +++ b/arch/hexagon/kernel/hexagon_ksyms.c @@ -17,7 +17,6 @@ EXPORT_SYMBOL(raw_copy_to_user); EXPORT_SYMBOL(__vmgetie); EXPORT_SYMBOL(__vmsetie); EXPORT_SYMBOL(__vmyield); -EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memset); diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index c33b3bcb733e..a244de27a03e 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -74,15 +74,6 @@ struct mm_struct; struct vm_area_struct; -/* - * ZERO_PAGE is a global shared page that is always zero; used - * for zero-mapped memory areas etc.. 
- */ - -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; - -#define ZERO_PAGE(vaddr) virt_to_page(empty_zero_page) - #ifdef CONFIG_32BIT #define VMALLOC_START (vm_map_base + PCI_IOSIZE + (2 * PAGE_SIZE)) diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index c331bf69d2ec..00f3822b6e47 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -36,9 +36,6 @@ #include #include -unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; -EXPORT_SYMBOL(empty_zero_page); - void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h index bba64a9c49ac..7501ff030c63 100644 --- a/arch/m68k/include/asm/pgtable_mm.h +++ b/arch/m68k/include/asm/pgtable_mm.h @@ -110,15 +110,6 @@ extern unsigned long m68k_vmalloc_end; #define VMALLOC_END KMAP_START #endif -/* zero page used for uninitialized stuff */ -extern void *empty_zero_page; - -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - extern void kernel_set_cachemode(void *addr, unsigned long size, int cmode); /* diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h index 1a86c15b9008..11751807a3f3 100644 --- a/arch/m68k/include/asm/pgtable_no.h +++ b/arch/m68k/include/asm/pgtable_no.h @@ -30,13 +30,6 @@ #define swapper_pg_dir ((pgd_t *) 0) -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -extern void *empty_zero_page; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - /* * All 32bit addresses are effectively valid for vmalloc... * Sort of meaningless for non-VM targets. 
diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 53b71f786c27..3b88c0dd1616 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -33,13 +33,6 @@ #include #include -/* - * ZERO_PAGE is a special page that is used for zero-initialized - * data and COW. - */ -void *empty_zero_page; -EXPORT_SYMBOL(empty_zero_page); - void __init arch_zone_limits_init(unsigned long *max_zone_pfns) { max_zone_pfns[ZONE_DMA] = PFN_DOWN(memblock_end_of_DRAM()); @@ -71,8 +64,6 @@ void __init paging_init(void) unsigned long end_mem = memory_end & PAGE_MASK; high_memory = (void *) end_mem; - - empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); } #endif /* CONFIG_MMU */ diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c index 3418fd864237..4924f2ff8ef8 100644 --- a/arch/m68k/mm/mcfmmu.c +++ b/arch/m68k/mm/mcfmmu.c @@ -41,8 +41,6 @@ void __init paging_init(void) unsigned long next_pgtable; int i; - empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); - pg_dir = swapper_pg_dir; memset(swapper_pg_dir, 0, sizeof(swapper_pg_dir)); diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index 127a3fa69f4c..b30aa69a73a6 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -498,12 +498,6 @@ void __init paging_init(void) early_memtest(min_addr, max_addr); - /* - * initialize the bad page table and bad page to point - * to a couple of allocated pages - */ - empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); - /* * Set up SFC/DFC registers */ diff --git a/arch/m68k/mm/sun3mmu.c b/arch/m68k/mm/sun3mmu.c index c801677f7df8..f139cc15753a 100644 --- a/arch/m68k/mm/sun3mmu.c +++ b/arch/m68k/mm/sun3mmu.c @@ -43,8 +43,6 @@ void __init paging_init(void) unsigned long bootmem_end; unsigned long size; - empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); - address = PAGE_OFFSET; pg_dir = swapper_pg_dir; memset (swapper_pg_dir, 0, sizeof (swapper_pg_dir)); diff --git a/arch/microblaze/include/asm/pgtable.h 
b/arch/microblaze/include/asm/pgtable.h index 4eb76de6be4a..ea72291de553 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -207,16 +207,6 @@ extern pte_t *va_to_pte(unsigned long address); * Also, write permissions imply read permissions. */ -#ifndef __ASSEMBLER__ -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -extern unsigned long empty_zero_page[1024]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - -#endif /* __ASSEMBLER__ */ - #define pte_none(pte) ((pte_val(pte) & ~_PTE_NONE_MASK) == 0) #define pte_present(pte) (pte_val(pte) & _PAGE_PRESENT) #define pte_clear(mm, addr, ptep) \ diff --git a/arch/microblaze/kernel/head.S b/arch/microblaze/kernel/head.S index ec2fcb545e64..808019c3b7ac 100644 --- a/arch/microblaze/kernel/head.S +++ b/arch/microblaze/kernel/head.S @@ -39,10 +39,6 @@ #include .section .data -.global empty_zero_page -.align 12 -empty_zero_page: - .space PAGE_SIZE .global swapper_pg_dir swapper_pg_dir: .space PAGE_SIZE diff --git a/arch/microblaze/kernel/microblaze_ksyms.c b/arch/microblaze/kernel/microblaze_ksyms.c index a8553f54152b..ad7596d7ba07 100644 --- a/arch/microblaze/kernel/microblaze_ksyms.c +++ b/arch/microblaze/kernel/microblaze_ksyms.c @@ -33,8 +33,6 @@ EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memmove); #endif -EXPORT_SYMBOL(empty_zero_page); - EXPORT_SYMBOL(mbc); extern void __divsi3(void); diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 844dce55569f..d389aa9ca57c 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -65,13 +65,6 @@ struct mm_struct; #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. 
- */ -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern pte_t invalid_pte_table[PAGE_SIZE/sizeof(pte_t)]; diff --git a/arch/nios2/kernel/head.S b/arch/nios2/kernel/head.S index 372ce4a33018..613212e1a63a 100644 --- a/arch/nios2/kernel/head.S +++ b/arch/nios2/kernel/head.S @@ -23,16 +23,6 @@ #include #include -/* - * ZERO_PAGE is a special page that is used for zero-initialized - * data and COW. - */ -.data -.global empty_zero_page -.align 12 -empty_zero_page: - .space PAGE_SIZE - /* * This global variable is used as an extension to the nios' * STATUS register to emulate a user/supervisor mode. diff --git a/arch/nios2/kernel/nios2_ksyms.c b/arch/nios2/kernel/nios2_ksyms.c index 54f7b23df1bf..c40aa39e8658 100644 --- a/arch/nios2/kernel/nios2_ksyms.c +++ b/arch/nios2/kernel/nios2_ksyms.c @@ -20,7 +20,6 @@ EXPORT_SYMBOL(memmove); /* memory management */ -EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(flush_icache_range); /* diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index b218050e2f6d..6b89996d0b62 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -179,10 +179,6 @@ extern void paging_init(void); __pgprot(_PAGE_ALL | _PAGE_SRE | _PAGE_SWE \ | _PAGE_SHARED | _PAGE_DIRTY | _PAGE_EXEC | _PAGE_CI) -/* zero page used for uninitialized stuff */ -extern unsigned long empty_zero_page[2048]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - #define pte_none(x) (!pte_val(x)) #define pte_present(x) (pte_val(x) & _PAGE_PRESENT) #define pte_clear(mm, addr, xp) do { pte_val(*(xp)) = 0; } while (0) diff --git a/arch/openrisc/kernel/head.S b/arch/openrisc/kernel/head.S index bd760066f1cd..45890393947d 100644 --- a/arch/openrisc/kernel/head.S +++ b/arch/openrisc/kernel/head.S @@ -1563,9 +1563,6 @@ _string_nl: */ .section .data,"aw" .align 8192 - .global 
empty_zero_page -empty_zero_page: - .space 8192 .global swapper_pg_dir swapper_pg_dir: diff --git a/arch/openrisc/kernel/or32_ksyms.c b/arch/openrisc/kernel/or32_ksyms.c index 212e5f85004c..84a937a64e2a 100644 --- a/arch/openrisc/kernel/or32_ksyms.c +++ b/arch/openrisc/kernel/or32_ksyms.c @@ -40,7 +40,6 @@ DECLARE_EXPORT(__ashldi3); DECLARE_EXPORT(__lshrdi3); DECLARE_EXPORT(__ucmpdi2); -EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(__copy_tofrom_user); EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(memset); diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 78fb0734cdbc..89d8c6df8855 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -188,9 +188,6 @@ void __init mem_init(void) { BUG_ON(!mem_map); - /* clear the zero-page */ - memset((void *)empty_zero_page, 0, PAGE_SIZE); - printk("mem_init_done ...........................................\n"); mem_init_done = 1; return; diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 17afe7a59edf..f6fb99cb94d9 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -262,17 +262,6 @@ extern pgd_t swapper_pg_dir[]; /* declared in init_task.c */ extern pte_t pg0[]; -/* zero page used for uninitialized stuff */ - -extern unsigned long *empty_zero_page; - -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. 
- */ - -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - #define pte_none(x) (pte_val(x) == 0) #define pte_present(x) (pte_val(x) & _PAGE_PRESENT) #define pte_user(x) (pte_val(x) & _PAGE_USER) diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 6a39e031e5ff..be3380c9bcda 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -604,9 +604,6 @@ void __init mem_init(void) #endif } -unsigned long *empty_zero_page __ro_after_init; -EXPORT_SYMBOL(empty_zero_page); - /* * pagetable_init() sets up the page tables * @@ -639,9 +636,6 @@ static void __init pagetable_init(void) initrd_end - initrd_start, PAGE_KERNEL, 0); } #endif - - empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); - } static void __init gateway_init(void) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index dcd3a88caaf6..b27d94c06d0e 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -76,12 +76,6 @@ static inline const void *pmd_page_vaddr(pmd_t pmd) } #define pmd_page_vaddr pmd_page_vaddr #endif -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. 
- */ -extern unsigned long empty_zero_page[]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) extern pgd_t swapper_pg_dir[]; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index b7982d0243d4..648d0c5602ec 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -38,9 +38,6 @@ unsigned long long memory_limit __initdata; -unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; -EXPORT_SYMBOL(empty_zero_page); - pgprot_t __phys_mem_access_prot(unsigned long pfn, unsigned long size, pgprot_t vma_prot) { diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 08d1ca047104..ab4ce1cc9d9c 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -1284,13 +1284,6 @@ extern u64 satp_mode; void paging_init(void); void misc_mem_init(void); -/* - * ZERO_PAGE is a global shared page that is always zero, - * used for zero-mapped memory areas, etc. - */ -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - /* * Use set_p*_safe(), and elide TLB flushing, when confident that *no* * TLB flush will be required as a result of the "set". 
For example, use diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 811e03786c56..017bad735d47 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -69,10 +69,6 @@ unsigned long vmemmap_start_pfn __ro_after_init; EXPORT_SYMBOL(vmemmap_start_pfn); #endif -unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] - __page_aligned_bss; -EXPORT_SYMBOL(empty_zero_page); - extern char _start[]; void *_dtb_early_va __initdata; uintptr_t _dtb_early_pa __initdata; diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index 10fa8f2bb8d1..d5ce0950a323 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -20,14 +20,6 @@ #ifndef __ASSEMBLER__ #include #include - -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - #endif /* !__ASSEMBLER__ */ /* diff --git a/arch/sh/include/asm/setup.h b/arch/sh/include/asm/setup.h index 84bb23a771f3..63c9efc06348 100644 --- a/arch/sh/include/asm/setup.h +++ b/arch/sh/include/asm/setup.h @@ -7,7 +7,8 @@ /* * This is set up by the setup-routine at boot-time */ -#define PARAM ((unsigned char *)empty_zero_page) +extern unsigned char *boot_params_page; +#define PARAM boot_params_page #define MOUNT_ROOT_RDONLY (*(unsigned long *) (PARAM+0x000)) #define RAMDISK_FLAGS (*(unsigned long *) (PARAM+0x004)) diff --git a/arch/sh/kernel/head_32.S b/arch/sh/kernel/head_32.S index b603b7968b38..0b91bb85d40a 100644 --- a/arch/sh/kernel/head_32.S +++ b/arch/sh/kernel/head_32.S @@ -26,7 +26,7 @@ #endif .section .empty_zero_page, "aw" -ENTRY(empty_zero_page) +ENTRY(boot_params_page) .long 1 /* MOUNT_ROOT_RDONLY */ .long 0 /* RAMDISK_FLAGS */ .long 0x0200 /* ORIG_ROOT_DEV */ @@ -39,7 +39,7 @@ ENTRY(empty_zero_page) .long 0x53453f00 + 29 /* "SE?" 
= 29 bit */ #endif 1: - .skip PAGE_SIZE - empty_zero_page - 1b + .skip PAGE_SIZE - boot_params_page - 1b __HEAD diff --git a/arch/sh/kernel/sh_ksyms_32.c b/arch/sh/kernel/sh_ksyms_32.c index 5858936cb431..041191002e2e 100644 --- a/arch/sh/kernel/sh_ksyms_32.c +++ b/arch/sh/kernel/sh_ksyms_32.c @@ -20,7 +20,6 @@ EXPORT_SYMBOL(csum_partial); EXPORT_SYMBOL(csum_partial_copy_generic); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(__clear_user); -EXPORT_SYMBOL(empty_zero_page); #ifdef CONFIG_FLATMEM /* need in pfn_valid macro */ EXPORT_SYMBOL(min_low_pfn); diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 464a3a63e2fa..4e40d5e96be9 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -332,7 +332,6 @@ void __init mem_init(void) cpu_cache_init(); /* clear the zero-page */ - memset(empty_zero_page, 0, PAGE_SIZE); __flush_wback_region(empty_zero_page, PAGE_SIZE); vsyscall_init(); diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index a9f802d1dd64..f89b1250661d 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -71,14 +71,6 @@ extern unsigned long ptr_in_current_pgd; extern unsigned long phys_base; extern unsigned long pfn_base; -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; - -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - /* * In general all page table modifications should use the V8 atomic * swap instruction. 
This insures the mmu and the cpu are in sync diff --git a/arch/sparc/include/asm/setup.h b/arch/sparc/include/asm/setup.h index 72205684e51e..21bed5514028 100644 --- a/arch/sparc/include/asm/setup.h +++ b/arch/sparc/include/asm/setup.h @@ -17,8 +17,6 @@ extern char reboot_command[]; */ extern unsigned char boot_cpu_id; -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; - extern int serial_console; static inline int con_is_present(void) { diff --git a/arch/sparc/kernel/head_32.S b/arch/sparc/kernel/head_32.S index 38345460d542..8c320fa25a67 100644 --- a/arch/sparc/kernel/head_32.S +++ b/arch/sparc/kernel/head_32.S @@ -57,13 +57,6 @@ sun4e_notsup: .align PAGE_SIZE -/* This was the only reasonable way I could think of to properly align - * these page-table data structures. - */ - .globl empty_zero_page -empty_zero_page: .skip PAGE_SIZE -EXPORT_SYMBOL(empty_zero_page) - .global root_flags .global ram_flags .global root_dev diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index fdc93dd12c3e..e0e66f91ceeb 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -246,10 +246,6 @@ void __init arch_mm_preinit(void) prom_halt(); } - - /* Saves us work later. 
*/ - memset((void *)empty_zero_page, 0, PAGE_SIZE); - i = last_valid_pfn >> ((20 - PAGE_SHIFT) + 5); i += 1; sparc_valid_addr_bitmap = (unsigned long *) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index f46394c46a76..748790998ff5 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2492,6 +2492,9 @@ static void __init register_page_bootmem_info(void) } void __init mem_init(void) { + phys_addr_t zero_page_pa = kern_base + + ((unsigned long)&empty_zero_page[0] - KERNBASE); + /* * Must be done after boot memory is put on freelist, because here we * might set fields in deferred struct pages that have not yet been @@ -2504,13 +2507,7 @@ void __init mem_init(void) * Set up the zero page, mark it reserved, so that page count * is not manipulated when freeing the page from user ptes. */ - mem_map_zero = alloc_pages(GFP_KERNEL|__GFP_ZERO, 0); - if (mem_map_zero == NULL) { - prom_printf("paging_init: Cannot alloc zero page.\n"); - prom_halt(); - } - mark_page_reserved(mem_map_zero); - + mem_map_zero = pfn_to_page(PHYS_PFN(zero_page_pa)); if (tlb_type == cheetah || tlb_type == cheetah_plus) cheetah_ecache_flush_init(); diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index 3b42b0f45bf6..19e0608fb649 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -34,9 +34,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; -/* zero page used for uninitialized stuff */ -extern unsigned long *empty_zero_page; - /* Just any arbitrary offset to the start of the vmalloc VM area: the * current 8MB value just means that there will be a 8MB "hole" after the * physical memory until the kernel virtual memory starts. That means that @@ -74,12 +71,6 @@ extern unsigned long *empty_zero_page; * get.. */ -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. 
- */ -#define ZERO_PAGE(vaddr) virt_to_page(empty_zero_page) - #define pte_clear(mm, addr, xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEEDSYNC)) #define pmd_none(x) (!((unsigned long)pmd_val(x) & ~_PAGE_NEEDSYNC)) diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h index 38321188c04c..9812efd14ec0 100644 --- a/arch/um/include/shared/kern_util.h +++ b/arch/um/include/shared/kern_util.h @@ -38,7 +38,6 @@ extern void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs extern void uml_pm_wake(void); extern int start_uml(void); -extern void paging_init(void); extern void uml_cleanup(void); extern void do_uml_exitcalls(void); diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 89c8c8b94a79..1eef0e42ef5d 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -44,10 +44,6 @@ __section(".kasan_init") __used = kasan_init; #endif -/* allocated in paging_init, zeroed in mem_init, and unchanged thereafter */ -unsigned long *empty_zero_page = NULL; -EXPORT_SYMBOL(empty_zero_page); - /* * Initialized during boot, and readonly for initializing page tables * afterwards @@ -65,9 +61,6 @@ void __init arch_mm_preinit(void) /* Safe to call after jump_label_init(). Enables KASAN. */ kasan_init_generic(); - /* clear the zero-page */ - memset(empty_zero_page, 0, PAGE_SIZE); - /* Map in the area just after the brk now that kmalloc is about * to be turned on. */ @@ -89,15 +82,6 @@ void __init arch_zone_limits_init(unsigned long *max_zone_pfns) max_zone_pfns[ZONE_NORMAL] = high_physmem >> PAGE_SHIFT; } -void __init paging_init(void) -{ - empty_zero_page = (unsigned long *) memblock_alloc_low(PAGE_SIZE, - PAGE_SIZE); - if (!empty_zero_page) - panic("%s: Failed to allocate %lu bytes align=%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); -} - /* * This can't do anything because nothing in the kernel image can be freed * since it's not in kernel physical memory. 
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index e2b24e1ecfa6..2141f5f1f5a2 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -413,7 +413,6 @@ void __init setup_arch(char **cmdline_p) uml_dtb_init(); read_initrd(); - paging_init(); strscpy(boot_command_line, command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; setup_hostinfo(host_info, sizeof host_info); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1662c5a8f445..54289f4587a4 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -47,14 +47,6 @@ void ptdump_walk_user_pgd_level_checkwx(void); #define debug_checkwx_user() do { } while (0) #endif -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] - __visible; -#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page)) - extern spinlock_t pgd_lock; extern struct list_head pgd_list; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 80ef5d386b03..5171cb746444 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -441,10 +441,6 @@ initial_pg_fixmap: swapper_pg_dir: .fill 1024,4,0 .fill PTI_USER_PGD_FILL,4,0 -.globl empty_zero_page -empty_zero_page: - .fill 4096,1,0 -EXPORT_SYMBOL(empty_zero_page) /* * This starts the data section. 
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 85d4a5094f6b..7ed5520dd52e 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -684,10 +684,3 @@ SYM_PIC_ALIAS(phys_base); EXPORT_SYMBOL(phys_base) #include "../xen/xen-head.S" - - __PAGE_ALIGNED_BSS -SYM_DATA_START_PAGE_ALIGNED(empty_zero_page) - .skip PAGE_SIZE -SYM_DATA_END(empty_zero_page) -EXPORT_SYMBOL(empty_zero_page) - diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 50a136213b2b..61f07d981a94 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -209,10 +209,6 @@ #define pgd_ERROR(e) \ printk("%s:%d: bad pgd entry %08lx.\n", __FILE__, __LINE__, pgd_val(e)) -extern unsigned long empty_zero_page[1024]; - -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - #ifdef CONFIG_MMU extern pgd_t swapper_pg_dir[PAGE_SIZE/sizeof(pgd_t)]; extern void paging_init(void); diff --git a/arch/xtensa/kernel/head.S b/arch/xtensa/kernel/head.S index 8484294bc623..4b0c5c5e685a 100644 --- a/arch/xtensa/kernel/head.S +++ b/arch/xtensa/kernel/head.S @@ -381,6 +381,3 @@ ENTRY(swapper_pg_dir) .fill PAGE_SIZE, 1, 0 END(swapper_pg_dir) #endif -ENTRY(empty_zero_page) - .fill PAGE_SIZE, 1, 0 -END(empty_zero_page) diff --git a/arch/xtensa/kernel/xtensa_ksyms.c b/arch/xtensa/kernel/xtensa_ksyms.c index 62d81e76e18e..ced335b4df5f 100644 --- a/arch/xtensa/kernel/xtensa_ksyms.c +++ b/arch/xtensa/kernel/xtensa_ksyms.c @@ -15,8 +15,6 @@ #include #include -EXPORT_SYMBOL(empty_zero_page); - unsigned int __sync_fetch_and_and_4(volatile void *p, unsigned int v) { BUG(); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index c3a56f6b1ea5..2a05c3885f85 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1925,6 +1925,9 @@ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) * for different ranges in the virtual address space. 
* * zero_page_pfn identifies the first (or the only) pfn for these pages. + * + * For architectures that don't __HAVE_COLOR_ZERO_PAGE the zero page lives in + * empty_zero_page in BSS. */ #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) @@ -1951,6 +1954,13 @@ static inline unsigned long zero_pfn(unsigned long addr) return zero_page_pfn; } + +extern uint8_t empty_zero_page[PAGE_SIZE]; + +#ifndef ZERO_PAGE +#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page)) +#endif + #endif /* __HAVE_COLOR_ZERO_PAGE */ #ifdef CONFIG_MMU diff --git a/mm/mm_init.c b/mm/mm_init.c index ab6578516dd6..a0472d496c91 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -56,6 +56,11 @@ EXPORT_SYMBOL(high_memory); unsigned long zero_page_pfn __ro_after_init; EXPORT_SYMBOL(zero_page_pfn); +#ifndef __HAVE_COLOR_ZERO_PAGE +uint8_t empty_zero_page[PAGE_SIZE] __page_aligned_bss; +EXPORT_SYMBOL(empty_zero_page); +#endif + #ifdef CONFIG_DEBUG_MEMORY_INIT int __meminitdata mminit_loglevel; From 26513781d1b3a1e8b4b576ed62751d604a69b374 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 11 Feb 2026 12:31:41 +0200 Subject: [PATCH 065/369] mm: cache struct page for empty_zero_page and return it from ZERO_PAGE() For most architectures every invocation of ZERO_PAGE() does virt_to_page(empty_zero_page). But empty_zero_page is in BSS and it is enough to get its struct page once at initialization time and then use it whenever a zero page should be accessed. Add yet another __zero_page variable that will be initialized as virt_to_page(empty_zero_page) for most architectures in a weak arch_setup_zero_pages() function. For architectures that use colored zero pages (MIPS and s390) rename their setup_zero_pages() to arch_setup_zero_pages() and make it global rather than static. For architectures that cannot use virt_to_page() for BSS (arm64 and sparc64) add override of arch_setup_zero_pages(). 
Link: https://lkml.kernel.org/r/20260211103141.3215197-5-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Catalin Marinas Acked-by: David Hildenbrand (Arm) Acked-by: Liam R. Howlett Cc: Andreas Larsson Cc: "Borislav Petkov (AMD)" Cc: Christophe Leroy (CS GROUP) Cc: Dave Hansen Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Lorenzo Stoakes Cc: Madhavan Srinivasan Cc: Magnus Lindholm Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russell King Cc: Stafford Horne Cc: Suren Baghdasaryan Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 6 ------ arch/arm64/mm/init.c | 5 +++++ arch/mips/mm/init.c | 11 +---------- arch/s390/mm/init.c | 4 +--- arch/sparc/include/asm/pgtable_64.h | 3 --- arch/sparc/mm/init_64.c | 17 +++++++---------- include/linux/pgtable.h | 11 ++++++++--- mm/mm_init.c | 23 ++++++++++++++++++----- 8 files changed, 40 insertions(+), 40 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 769570e43c18..aa4b13da6371 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -106,12 +106,6 @@ static inline void arch_leave_lazy_mmu_mode(void) #define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) \ local_flush_tlb_page_nonotify(vma, address) -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. 
- */ -#define ZERO_PAGE(vaddr) phys_to_page(__pa_symbol(empty_zero_page)) - #define pte_ERROR(e) \ pr_err("%s:%d: bad pte %016llx.\n", __FILE__, __LINE__, pte_val(e)) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 96711b8578fd..417ec7efe569 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -328,6 +328,11 @@ void __init bootmem_init(void) memblock_dump_all(); } +void __init arch_setup_zero_pages(void) +{ + __zero_page = phys_to_page(__pa_symbol(empty_zero_page)); +} + void __init arch_mm_preinit(void) { unsigned int flags = SWIOTLB_VERBOSE; diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 4f6449ad02ca..55b25e85122a 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -56,10 +56,7 @@ unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL_GPL(empty_zero_page); EXPORT_SYMBOL(zero_page_mask); -/* - * Not static inline because used by IP27 special magic initialization code - */ -static void __init setup_zero_pages(void) +void __init arch_setup_zero_pages(void) { unsigned int order; @@ -450,7 +447,6 @@ void __init arch_mm_preinit(void) BUILD_BUG_ON(IS_ENABLED(CONFIG_32BIT) && (PFN_PTE_SHIFT > PAGE_SHIFT)); maar_init(); - setup_zero_pages(); /* Setup zeroed pages. 
*/ highmem_init(); #ifdef CONFIG_64BIT @@ -461,11 +457,6 @@ void __init arch_mm_preinit(void) 0x80000000 - 4, KCORE_TEXT); #endif } -#else /* CONFIG_NUMA */ -void __init arch_mm_preinit(void) -{ - setup_zero_pages(); /* This comes from node 0 */ -} #endif /* !CONFIG_NUMA */ void free_init_pages(const char *what, unsigned long begin, unsigned long end) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 3c20475cbee2..1f72efc2a579 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -69,7 +69,7 @@ unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(zero_page_mask); -static void __init setup_zero_pages(void) +void __init arch_setup_zero_pages(void) { unsigned long total_pages = memblock_estimated_nr_free_pages(); unsigned int order; @@ -159,8 +159,6 @@ void __init arch_mm_preinit(void) cpumask_set_cpu(0, mm_cpumask(&init_mm)); pv_init(); - - setup_zero_pages(); /* Setup zeroed pages. */ } unsigned long memory_block_size_bytes(void) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 615f460c50af..74ede706fb32 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -210,9 +210,6 @@ extern unsigned long _PAGE_CACHE; extern unsigned long pg_iobits; extern unsigned long _PAGE_ALL_SZ_BITS; -extern struct page *mem_map_zero; -#define ZERO_PAGE(vaddr) (mem_map_zero) - /* PFNs are real physical page numbers. However, mem_map only begins to record * per-page information starting at pfn_base. 
This is to handle systems where * the first physical page in the machine is at some huge physical address, diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 748790998ff5..3aa47f2b6c6e 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -177,9 +177,6 @@ extern unsigned long sparc_ramdisk_image64; extern unsigned int sparc_ramdisk_image; extern unsigned int sparc_ramdisk_size; -struct page *mem_map_zero __read_mostly; -EXPORT_SYMBOL(mem_map_zero); - unsigned int sparc64_highest_unlocked_tlb_ent __read_mostly; unsigned long sparc64_kern_pri_context __read_mostly; @@ -2490,11 +2487,17 @@ static void __init register_page_bootmem_info(void) register_page_bootmem_info_node(NODE_DATA(i)); #endif } -void __init mem_init(void) + +void __init arch_setup_zero_pages(void) { phys_addr_t zero_page_pa = kern_base + ((unsigned long)&empty_zero_page[0] - KERNBASE); + __zero_page = phys_to_page(zero_page_pa); +} + +void __init mem_init(void) +{ /* * Must be done after boot memory is put on freelist, because here we * might set fields in deferred struct pages that have not yet been @@ -2503,12 +2506,6 @@ void __init mem_init(void) */ register_page_bootmem_info(); - /* - * Set up the zero page, mark it reserved, so that page count - * is not manipulated when freeing the page from user ptes. - */ - mem_map_zero = pfn_to_page(PHYS_PFN(zero_page_pa)); - if (tlb_type == cheetah || tlb_type == cheetah_plus) cheetah_ecache_flush_init(); } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2a05c3885f85..776993d4567b 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1929,6 +1929,8 @@ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) * For architectures that don't __HAVE_COLOR_ZERO_PAGE the zero page lives in * empty_zero_page in BSS. 
*/ +void arch_setup_zero_pages(void); + #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { @@ -1956,10 +1958,13 @@ static inline unsigned long zero_pfn(unsigned long addr) } extern uint8_t empty_zero_page[PAGE_SIZE]; +extern struct page *__zero_page; -#ifndef ZERO_PAGE -#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page)) -#endif +static inline struct page *_zero_page(unsigned long addr) +{ + return __zero_page; +} +#define ZERO_PAGE(vaddr) _zero_page(vaddr) #endif /* __HAVE_COLOR_ZERO_PAGE */ diff --git a/mm/mm_init.c b/mm/mm_init.c index a0472d496c91..f903747ca854 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -59,7 +59,10 @@ EXPORT_SYMBOL(zero_page_pfn); #ifndef __HAVE_COLOR_ZERO_PAGE uint8_t empty_zero_page[PAGE_SIZE] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); -#endif + +struct page *__zero_page __ro_after_init; +EXPORT_SYMBOL(__zero_page); +#endif /* __HAVE_COLOR_ZERO_PAGE */ #ifdef CONFIG_DEBUG_MEMORY_INIT int __meminitdata mminit_loglevel; @@ -2680,12 +2683,21 @@ static void __init mem_init_print_info(void) ); } -static int __init init_zero_page_pfn(void) +#ifndef __HAVE_COLOR_ZERO_PAGE +/* + * architectures that __HAVE_COLOR_ZERO_PAGE must define this function + */ +void __init __weak arch_setup_zero_pages(void) { - zero_page_pfn = page_to_pfn(ZERO_PAGE(0)); - return 0; + __zero_page = virt_to_page(empty_zero_page); +} +#endif + +static void __init init_zero_page_pfn(void) +{ + arch_setup_zero_pages(); + zero_page_pfn = page_to_pfn(ZERO_PAGE(0)); } -early_initcall(init_zero_page_pfn); void __init __weak arch_mm_preinit(void) { @@ -2709,6 +2721,7 @@ void __init mm_core_init_early(void) void __init mm_core_init(void) { arch_mm_preinit(); + init_zero_page_pfn(); /* Initializations relying on SMP setup */ BUILD_BUG_ON(MAX_ZONELISTS > 2); From 15c578d0dc9952c634f59fcdf9f75be0e42ca834 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Wed, 11 Feb 2026 14:43:11 +0800 Subject: [PATCH 066/369] selftests/mm: remove 
duplicate include of unistd.h Remove duplicate inclusion of unistd.h in memory-failure.c to clean up redundant code. Link: https://lkml.kernel.org/r/20260211064311.2981726-1-nichen@iscas.ac.cn Signed-off-by: Chen Ni Acked-by: Miaohe Lin Reviewed-by: Liam R. Howlett Reviewed-by: SeongJae Park Reviewed-by: Dev Jain Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/memory-failure.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/mm/memory-failure.c b/tools/testing/selftests/mm/memory-failure.c index 3d9e0b9ffb41..032ed952057c 100644 --- a/tools/testing/selftests/mm/memory-failure.c +++ b/tools/testing/selftests/mm/memory-failure.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include From 80a4bcac69348e32ccb5ab46401ac2a416fcb576 Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Sat, 7 Feb 2026 16:16:13 +0800 Subject: [PATCH 067/369] mm: khugepaged: set to next mm direct when mm has MMF_DISABLE_THP_COMPLETELY When an mm with the MMF_DISABLE_THP_COMPLETELY flag is detected during scanning, directly set khugepaged_scan.mm_slot to the next mm_slot, reduce redundant operation. Without this patch, entering khugepaged_scan_mm_slot() next time, we will set khugepaged_scan.mm_slot to the next mm_slot. With this patch, we will directly set khugepaged_scan.mm_slot to the next mm_slot. Link: https://lkml.kernel.org/r/20260207081613.588598-6-vernon2gm@gmail.com Signed-off-by: Vernon Yang Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Lance Yang Reviewed-by: Dev Jain Reviewed-by: Barry Song Cc: Baolin Wang Cc: "Liam R. 
Howlett" Cc: Lorenzo Stoakes Cc: Nico Pache Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1dd3cfca610d..17ab58681032 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2508,9 +2508,9 @@ breakouterloop_mmap_lock: VM_BUG_ON(khugepaged_scan.mm_slot != slot); /* * Release the current mm_slot if this mm is about to die, or - * if we scanned all vmas of this mm. + * if we scanned all vmas of this mm, or THP got disabled. */ - if (hpage_collapse_test_exit(mm) || !vma) { + if (hpage_collapse_test_exit_or_disable(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find From b0fbe8c3414d26d43f76cc9c4c1ae8eb51a04428 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 4 Feb 2026 03:09:37 -0500 Subject: [PATCH 068/369] mm/mmu_notifiers: use hlist_for_each_entry_srcu() for SRCU list traversal The mmu_notifier_subscriptions list is protected by SRCU. While the current code uses hlist_for_each_entry_rcu() with an explicit SRCU lockdep check, it is more appropriate to use the dedicated hlist_for_each_entry_srcu() macro. This change aligns the code with the preferred kernel API for SRCU-protected lists, improving code clarity and ensuring that the synchronization method is explicitly documented by the iterator name itself. 
Link: https://lkml.kernel.org/r/20260204080937.2472-1-lirongqing@baidu.com Signed-off-by: Li RongQing Acked-by: SeongJae Park Cc: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mmu_notifier.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index a6cdf3674bdc..2502474b83b6 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -309,7 +309,7 @@ static void mn_hlist_release(struct mmu_notifier_subscriptions *subscriptions, * ->release returns. */ id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist, + hlist_for_each_entry_srcu(subscription, &subscriptions->list, hlist, srcu_read_lock_held(&srcu)) /* * If ->release runs before mmu_notifier_unregister it must be @@ -372,7 +372,7 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, int young = 0, id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, + hlist_for_each_entry_srcu(subscription, &mm->notifier_subscriptions->list, hlist, srcu_read_lock_held(&srcu)) { if (subscription->ops->clear_flush_young) @@ -392,7 +392,7 @@ int __mmu_notifier_clear_young(struct mm_struct *mm, int young = 0, id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, + hlist_for_each_entry_srcu(subscription, &mm->notifier_subscriptions->list, hlist, srcu_read_lock_held(&srcu)) { if (subscription->ops->clear_young) @@ -411,7 +411,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm, int young = 0, id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, + hlist_for_each_entry_srcu(subscription, &mm->notifier_subscriptions->list, hlist, srcu_read_lock_held(&srcu)) { if (subscription->ops->test_young) { @@ -466,7 +466,7 @@ static int mn_hlist_invalidate_range_start( int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, 
&subscriptions->list, hlist, + hlist_for_each_entry_srcu(subscription, &subscriptions->list, hlist, srcu_read_lock_held(&srcu)) { const struct mmu_notifier_ops *ops = subscription->ops; @@ -504,7 +504,7 @@ static int mn_hlist_invalidate_range_start( * notifiers and one or more failed start, any that succeeded * start are expecting their end to be called. Do so now. */ - hlist_for_each_entry_rcu(subscription, &subscriptions->list, + hlist_for_each_entry_srcu(subscription, &subscriptions->list, hlist, srcu_read_lock_held(&srcu)) { if (!subscription->ops->invalidate_range_end) continue; @@ -542,7 +542,7 @@ mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions, int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist, + hlist_for_each_entry_srcu(subscription, &subscriptions->list, hlist, srcu_read_lock_held(&srcu)) { if (subscription->ops->invalidate_range_end) { if (!mmu_notifier_range_blockable(range)) @@ -577,7 +577,7 @@ void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, + hlist_for_each_entry_srcu(subscription, &mm->notifier_subscriptions->list, hlist, srcu_read_lock_held(&srcu)) { if (subscription->ops->arch_invalidate_secondary_tlbs) @@ -713,7 +713,7 @@ find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops) struct mmu_notifier *subscription; spin_lock(&mm->notifier_subscriptions->lock); - hlist_for_each_entry_rcu(subscription, + hlist_for_each_entry_srcu(subscription, &mm->notifier_subscriptions->list, hlist, lockdep_is_held(&mm->notifier_subscriptions->lock)) { if (subscription->ops != ops) From 1c7b8d8a51cc1022bcf6604adf3f1963f8162f3f Mon Sep 17 00:00:00 2001 From: qinyu Date: Tue, 3 Feb 2026 18:26:49 +0800 Subject: [PATCH 069/369] mm/page_idle.c: remove redundant mmu notifier in aging code Now we have mmu_notifier_clear_young immediately follows pmdp_clear_young_notify 
which internally calls mmu_notifier_clear_young, this is redundant. change it with non-notify variant and keep consistent with ptep aging code. Link: https://lkml.kernel.org/r/20260203102649.2486836-1-qin.yuA@h3c.com Signed-off-by: qinyu Reviewed-by: Baolin Wang Reviewed-by: SeongJae Park Acked-by: David Hildenbrand (arm) Signed-off-by: Andrew Morton --- mm/page_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_idle.c b/mm/page_idle.c index 96bb94c7b6c3..9c67cbac2965 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -74,7 +74,7 @@ static bool page_idle_clear_pte_refs_one(struct folio *folio, pmd_t pmdval = pmdp_get(pvmw.pmd); if (likely(pmd_present(pmdval))) - referenced |= pmdp_clear_young_notify(vma, addr, pvmw.pmd); + referenced |= pmdp_test_and_clear_young(vma, addr, pvmw.pmd); referenced |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PMD_SIZE); } else { /* unexpected pmd-mapped page? */ From ae1a645def136e23b81330763edb76c554ce6e23 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 2 Feb 2026 01:47:32 +0800 Subject: [PATCH 070/369] mm/zswap: remove SWP_SYNCHRONOUS_IO swapcache bypass workaround Since commit f1879e8a0c60 ("mm, swap: never bypass the swap cache even for SWP_SYNCHRONOUS_IO"), all swap-in operations go through the swap cache, including those from SWP_SYNCHRONOUS_IO devices like zram. Which means the workaround for swap cache bypassing introduced by commit 25cd241408a2 ("mm: zswap: fix data loss on SWP_SYNCHRONOUS_IO devices") is no longer needed. Remove it, but keep the comments that are still helpful. 
Link: https://lkml.kernel.org/r/20260202-zswap-syncio-cleanup-v1-1-86bb24a64521@tencent.com Signed-off-by: Kairui Song Suggested-by: Yosry Ahmed Reviewed-by: Barry Song Acked-by: Chris Li Acked-by: Yosry Ahmed Acked-by: Nhat Pham Reviewed-by: Chengming Zhou Cc: Baoquan He Cc: Johannes Weiner Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/zswap.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 16b2ef7223e1..0823cadd02b6 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1595,11 +1595,11 @@ int zswap_load(struct folio *folio) { swp_entry_t swp = folio->swap; pgoff_t offset = swp_offset(swp); - bool swapcache = folio_test_swapcache(folio); struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry; VM_WARN_ON_ONCE(!folio_test_locked(folio)); + VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); if (zswap_never_enabled()) return -ENOENT; @@ -1630,22 +1630,15 @@ int zswap_load(struct folio *folio) count_objcg_events(entry->objcg, ZSWPIN, 1); /* - * When reading into the swapcache, invalidate our entry. The - * swapcache can be the authoritative owner of the page and + * We are reading into the swapcache, invalidate zswap entry. + * The swapcache is the authoritative owner of the page and * its mappings, and the pressure that results from having two * in-memory copies outweighs any benefits of caching the * compression work. - * - * (Most swapins go through the swapcache. The notable - * exception is the singleton fault on SWP_SYNCHRONOUS_IO - * files, which reads into a private page and may free it if - * the fault fails. We remain the primary owner of the entry.) 
*/ - if (swapcache) { - folio_mark_dirty(folio); - xa_erase(tree, offset); - zswap_entry_free(entry); - } + folio_mark_dirty(folio); + xa_erase(tree, offset); + zswap_entry_free(entry); folio_unlock(folio); return 0; From fc9ef2978d440162f507f70cbc351006af9a77d4 Mon Sep 17 00:00:00 2001 From: Zhongqiu Han Date: Fri, 30 Jan 2026 17:37:28 +0800 Subject: [PATCH 071/369] mm/kmemleak: remove unreachable return statement in scan_should_stop() Patch series "mm/kmemleak: Improve scan_should_stop() implementation". This series improves the scan_should_stop() function by addressing code quality issues and enhancing kernel thread detection robustness. This patch (of 2): Remove unreachable "return 0;" statement as all execution paths return before reaching it. No functional change. Link: https://lkml.kernel.org/r/20260130093729.2045858-2-zhongqiu.han@oss.qualcomm.com Signed-off-by: Zhongqiu Han Acked-by: Catalin Marinas Signed-off-by: Andrew Morton --- mm/kmemleak.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index d79acf5c5100..2a24d9a4a835 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1507,10 +1507,8 @@ static int scan_should_stop(void) */ if (current->mm) return signal_pending(current); - else - return kthread_should_stop(); - return 0; + return kthread_should_stop(); } /* From c2d9196541fa58de16e219092f56ae507c925cac Mon Sep 17 00:00:00 2001 From: Zhongqiu Han Date: Fri, 30 Jan 2026 17:37:29 +0800 Subject: [PATCH 072/369] mm/kmemleak: use PF_KTHREAD flag to detect kernel threads Replace the current->mm check with PF_KTHREAD flag for more reliable kernel thread detection in scan_should_stop(). The PF_KTHREAD flag is the standard way to identify kernel threads and is not affected by temporary mm borrowing via kthread_use_mm() (although kmemleak does not currently encounter such cases, this makes the code more robust). No functional change. 
Link: https://lkml.kernel.org/r/20260130093729.2045858-3-zhongqiu.han@oss.qualcomm.com Signed-off-by: Zhongqiu Han Acked-by: Catalin Marinas Signed-off-by: Andrew Morton --- mm/kmemleak.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 2a24d9a4a835..fa8201e23222 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1505,10 +1505,10 @@ static int scan_should_stop(void) * This function may be called from either process or kthread context, * hence the need to check for both stop conditions. */ - if (current->mm) - return signal_pending(current); + if (current->flags & PF_KTHREAD) + return kthread_should_stop(); - return kthread_should_stop(); + return signal_pending(current); } /* From 36cec70e4acbae21e39527c1d41083bca148c7c8 Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Sat, 21 Feb 2026 17:39:15 +0800 Subject: [PATCH 073/369] mm: khugepaged: add trace_mm_khugepaged_scan event Patch series "Improve khugepaged scan logic", v8. This series improves the khugepaged scan logic and reduces CPU consumption by prioritizing scanning tasks that access memory frequently. The following data is traced by bpftrace[1] on a desktop system. After the system has been left idle for 10 minutes upon booting, a lot of SCAN_PMD_MAPPED or SCAN_NO_PTE_TABLE are observed during a full scan by khugepaged. @scan_pmd_status[1]: 1 ## SCAN_SUCCEED @scan_pmd_status[6]: 2 ## SCAN_EXCEED_SHARED_PTE @scan_pmd_status[3]: 142 ## SCAN_PMD_MAPPED @scan_pmd_status[2]: 178 ## SCAN_NO_PTE_TABLE total progress size: 674 MB Total time : 419 seconds ## include khugepaged_scan_sleep_millisecs The khugepaged has below phenomenon: the khugepaged list is scanned in a FIFO manner, as long as the task is not destroyed, 1. the task no longer has memory that can be collapsed into hugepage, continues scan it always. 2. the task at the front of the khugepaged scan list is cold, they are still scanned first. 3. 
everyone scan at intervals of khugepaged_scan_sleep_millisecs (default 10s). If we always scan the above two cases first, the valid scan will have to wait for a long time. For the first case, when the memory is either SCAN_PMD_MAPPED or SCAN_NO_PTE_TABLE or SCAN_PTE_MAPPED_HUGEPAGE [5], just skip it. For the second case, if the user has explicitly informed us via MADV_FREE that these folios will be freed, just skip it only. The below is some performance test results. kernbench results (testing on x86_64 machine): baseline w/o patches test w/ patches Amean user-32 18522.51 ( 0.00%) 18333.64 * 1.02%* Amean syst-32 1137.96 ( 0.00%) 1113.79 * 2.12%* Amean elsp-32 666.04 ( 0.00%) 659.44 * 0.99%* BAmean-95 user-32 18520.01 ( 0.00%) 18323.57 ( 1.06%) BAmean-95 syst-32 1137.68 ( 0.00%) 1110.50 ( 2.39%) BAmean-95 elsp-32 665.92 ( 0.00%) 659.06 ( 1.03%) BAmean-99 user-32 18520.01 ( 0.00%) 18323.57 ( 1.06%) BAmean-99 syst-32 1137.68 ( 0.00%) 1110.50 ( 2.39%) BAmean-99 elsp-32 665.92 ( 0.00%) 659.06 ( 1.03%) Create three task[2]: hot1 -> cold -> hot2. After all three task are created, each allocate memory 128MB. the hot1/hot2 task continuously access 128 MB memory, while the cold task only accesses its memory briefly andthen call madvise(MADV_FREE). 
Here are the performance test results: (Throughput bigger is better, other smaller is better) Testing on x86_64 machine: | task hot2 | without patch | with patch | delta | |---------------------|---------------|---------------|---------| | total accesses time | 3.14 sec | 2.93 sec | -6.69% | | cycles per access | 4.96 | 2.21 | -55.44% | | Throughput | 104.38 M/sec | 111.89 M/sec | +7.19% | | dTLB-load-misses | 284814532 | 69597236 | -75.56% | Testing on qemu-system-x86_64 -enable-kvm: | task hot2 | without patch | with patch | delta | |---------------------|---------------|---------------|---------| | total accesses time | 3.35 sec | 2.96 sec | -11.64% | | cycles per access | 7.29 | 2.07 | -71.60% | | Throughput | 97.67 M/sec | 110.77 M/sec | +13.41% | | dTLB-load-misses | 241600871 | 3216108 | -98.67% | This patch (of 4): Add mm_khugepaged_scan event to track the total time for full scan and the total number of pages scanned of khugepaged. Link: https://lkml.kernel.org/r/20260221093918.1456187-2-vernon2gm@gmail.com Signed-off-by: Vernon Yang Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Barry Song Reviewed-by: Lance Yang Reviewed-by: Dev Jain Cc: Baolin Wang Cc: Dev Jain Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Zi Yan Signed-off-by: Andrew Morton --- include/trace/events/huge_memory.h | 25 +++++++++++++++++++++++++ mm/khugepaged.c | 2 ++ 2 files changed, 27 insertions(+) diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 4e41bff31888..384e29f6bef0 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -237,5 +237,30 @@ TRACE_EVENT(mm_khugepaged_collapse_file, __print_symbolic(__entry->result, SCAN_STATUS)) ); +TRACE_EVENT(mm_khugepaged_scan, + + TP_PROTO(struct mm_struct *mm, unsigned int progress, + bool full_scan_finished), + + TP_ARGS(mm, progress, full_scan_finished), + + 
TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __field(unsigned int, progress) + __field(bool, full_scan_finished) + ), + + TP_fast_assign( + __entry->mm = mm; + __entry->progress = progress; + __entry->full_scan_finished = full_scan_finished; + ), + + TP_printk("mm=%p, progress=%u, full_scan_finished=%d", + __entry->mm, + __entry->progress, + __entry->full_scan_finished) +); + #endif /* __HUGE_MEMORY_H */ #include diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 17ab58681032..4d7baf220ad9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2527,6 +2527,8 @@ breakouterloop_mmap_lock: collect_mm_slot(slot); } + trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL); + return progress; } From eeeb79d5ed2b89051e8bfb9a4d68721c4a49cc1d Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Sat, 21 Feb 2026 17:39:16 +0800 Subject: [PATCH 074/369] mm: khugepaged: refine scan progress number Currently, each scan always increases "progress" by HPAGE_PMD_NR, even if only scanning a single PTE/PMD entry. - When only scanning a sigle PTE entry, let me provide a detailed example: static int hpage_collapse_scan_pmd() { for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); ... if (pte_uffd_wp(pteval)) { <-- first scan hit result = SCAN_PTE_UFFD_WP; goto out_unmap; } } } During the first scan, if pte_uffd_wp(pteval) is true, the loop exits directly. In practice, only one PTE is scanned before termination. Here, "progress += 1" reflects the actual number of PTEs scanned, but previously "progress += HPAGE_PMD_NR" always. - When the memory has been collapsed to PMD, let me provide a detailed example: The following data is traced by bpftrace on a desktop system. After the system has been left idle for 10 minutes upon booting, a lot of SCAN_PMD_MAPPED or SCAN_NO_PTE_TABLE are observed during a full scan by khugepaged. 
From trace_mm_khugepaged_scan_pmd and trace_mm_khugepaged_scan_file, the following statuses were observed, with frequency mentioned next to them: SCAN_SUCCEED : 1 SCAN_EXCEED_SHARED_PTE: 2 SCAN_PMD_MAPPED : 142 SCAN_NO_PTE_TABLE : 178 total progress size : 674 MB Total time : 419 seconds, include khugepaged_scan_sleep_millisecs The khugepaged_scan list save all task that support collapse into hugepage, as long as the task is not destroyed, khugepaged will not remove it from the khugepaged_scan list. This exist a phenomenon where task has already collapsed all memory regions into hugepage, but khugepaged continues to scan it, which wastes CPU time and invalid, and due to khugepaged_scan_sleep_millisecs (default 10s) causes a long wait for scanning a large number of invalid task, so scanning really valid task is later. After applying this patch, when the memory is either SCAN_PMD_MAPPED or SCAN_NO_PTE_TABLE, just skip it, as follow: SCAN_EXCEED_SHARED_PTE: 2 SCAN_PMD_MAPPED : 147 SCAN_NO_PTE_TABLE : 173 total progress size : 45 MB Total time : 20 seconds SCAN_PTE_MAPPED_HUGEPAGE is the same, for detailed data, refer to https://lore.kernel.org/linux-mm/4qdu7owpmxfh3ugsue775fxarw5g2gcggbxdf5psj75nnu7z2u@cv2uu2yocaxq Link: https://lkml.kernel.org/r/20260221093918.1456187-3-vernon2gm@gmail.com Signed-off-by: Vernon Yang Reviewed-by: Dev Jain Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand (arm) Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 4d7baf220ad9..fcdd7b341786 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -68,7 +68,10 @@ enum scan_result { static struct task_struct *khugepaged_thread __read_mostly; static DEFINE_MUTEX(khugepaged_mutex); -/* 
default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */ +/* + * default scan 8*HPAGE_PMD_NR ptes, pmd_mapped, no_pte_table or vmas + * every 10 second. + */ static unsigned int khugepaged_pages_to_scan __read_mostly; static unsigned int khugepaged_pages_collapsed; static unsigned int khugepaged_full_scans; @@ -1231,7 +1234,8 @@ out_nolock: } static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked, + struct vm_area_struct *vma, unsigned long start_addr, + bool *mmap_locked, unsigned int *cur_progress, struct collapse_control *cc) { pmd_t *pmd; @@ -1247,19 +1251,27 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm, VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK); result = find_pmd_or_thp_or_none(mm, start_addr, &pmd); - if (result != SCAN_SUCCEED) + if (result != SCAN_SUCCEED) { + if (cur_progress) + *cur_progress = 1; goto out; + } memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl); if (!pte) { + if (cur_progress) + *cur_progress = 1; result = SCAN_NO_PTE_TABLE; goto out; } for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { + if (cur_progress) + *cur_progress += 1; + pte_t pteval = ptep_get(_pte); if (pte_none_or_zero(pteval)) { ++none_or_zero; @@ -2279,8 +2291,9 @@ out: return result; } -static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, - struct file *file, pgoff_t start, struct collapse_control *cc) +static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, + unsigned long addr, struct file *file, pgoff_t start, + unsigned int *cur_progress, struct collapse_control *cc) { struct folio *folio = NULL; struct address_space *mapping = file->f_mapping; @@ -2370,6 +2383,12 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned } } rcu_read_unlock(); + if 
(cur_progress) { + if (result == SCAN_PTE_MAPPED_HUGEPAGE) + *cur_progress = 1; + else + *cur_progress = HPAGE_PMD_NR; + } if (result == SCAN_SUCCEED) { if (cc->is_khugepaged && @@ -2448,6 +2467,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result while (khugepaged_scan.address < hend) { bool mmap_locked = true; + unsigned int cur_progress = 0; cond_resched(); if (unlikely(hpage_collapse_test_exit_or_disable(mm))) @@ -2464,7 +2484,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result mmap_read_unlock(mm); mmap_locked = false; *result = hpage_collapse_scan_file(mm, - khugepaged_scan.address, file, pgoff, cc); + khugepaged_scan.address, file, pgoff, + &cur_progress, cc); fput(file); if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { mmap_read_lock(mm); @@ -2478,7 +2499,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result } } else { *result = hpage_collapse_scan_pmd(mm, vma, - khugepaged_scan.address, &mmap_locked, cc); + khugepaged_scan.address, &mmap_locked, + &cur_progress, cc); } if (*result == SCAN_SUCCEED) @@ -2486,7 +2508,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; - progress += HPAGE_PMD_NR; + progress += cur_progress; if (!mmap_locked) /* * We released mmap_lock so break loop. 
Note @@ -2809,7 +2831,7 @@ retry: mmap_locked = false; *lock_dropped = true; result = hpage_collapse_scan_file(mm, addr, file, pgoff, - cc); + NULL, cc); if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb && mapping_can_writeback(file->f_mapping)) { @@ -2824,7 +2846,7 @@ retry: fput(file); } else { result = hpage_collapse_scan_pmd(mm, vma, addr, - &mmap_locked, cc); + &mmap_locked, NULL, cc); } if (!mmap_locked) *lock_dropped = true; From 34c1f77e4a639fd915cf262cba1ac7a29063a68d Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Thu, 26 Feb 2026 22:31:34 +0800 Subject: [PATCH 075/369] mm-khugepaged-refine-scan-progress-number-fix Based on previous discussions [1], v2 as follow, and testing shows the same performance benefits. Just make code cleaner, no function changes. Link: https://lkml.kernel.org/r/hbftflvdmnranprul4zkq3d2iymqm7ta2a7fwiphggsmt36gt7@bihvv5jg2ko5 Link: https://lore.kernel.org/linux-mm/zdvzmoop5xswqcyiwmvvrdfianm4ccs3gryfecwbm4bhuh7ebo@7an4huwgbuwo [1] Signed-off-by: Vernon Yang Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand (arm) Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 78 ++++++++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 43 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index fcdd7b341786..8ee3c44bc851 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -69,8 +69,8 @@ static struct task_struct *khugepaged_thread __read_mostly; static DEFINE_MUTEX(khugepaged_mutex); /* - * default scan 8*HPAGE_PMD_NR ptes, pmd_mapped, no_pte_table or vmas - * every 10 second. + * default scan 8*HPAGE_PMD_NR ptes, pte_mapped_hugepage, pmd_mapped, + * no_pte_table or vmas every 10 second. 
*/ static unsigned int khugepaged_pages_to_scan __read_mostly; static unsigned int khugepaged_pages_collapsed; @@ -103,6 +103,9 @@ struct collapse_control { /* Num pages scanned per node */ u32 node_load[MAX_NUMNODES]; + /* Num pages scanned (see khugepaged_pages_to_scan) */ + unsigned int progress; + /* nodemask for allocation fallback */ nodemask_t alloc_nmask; }; @@ -1235,8 +1238,7 @@ out_nolock: static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start_addr, - bool *mmap_locked, unsigned int *cur_progress, - struct collapse_control *cc) + bool *mmap_locked, struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; @@ -1252,8 +1254,7 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm, result = find_pmd_or_thp_or_none(mm, start_addr, &pmd); if (result != SCAN_SUCCEED) { - if (cur_progress) - *cur_progress = 1; + cc->progress++; goto out; } @@ -1261,16 +1262,14 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm, nodes_clear(cc->alloc_nmask); pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl); if (!pte) { - if (cur_progress) - *cur_progress = 1; + cc->progress++; result = SCAN_NO_PTE_TABLE; goto out; } for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { - if (cur_progress) - *cur_progress += 1; + cc->progress++; pte_t pteval = ptep_get(_pte); if (pte_none_or_zero(pteval)) { @@ -2293,7 +2292,7 @@ out: static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, struct file *file, pgoff_t start, - unsigned int *cur_progress, struct collapse_control *cc) + struct collapse_control *cc) { struct folio *folio = NULL; struct address_space *mapping = file->f_mapping; @@ -2383,12 +2382,10 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, } } rcu_read_unlock(); - if (cur_progress) { - if (result == SCAN_PTE_MAPPED_HUGEPAGE) - *cur_progress = 1; - else - *cur_progress = 
HPAGE_PMD_NR; - } + if (result == SCAN_PTE_MAPPED_HUGEPAGE) + cc->progress++; + else + cc->progress += HPAGE_PMD_NR; if (result == SCAN_SUCCEED) { if (cc->is_khugepaged && @@ -2404,8 +2401,8 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, return result; } -static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result *result, - struct collapse_control *cc) +static void khugepaged_scan_mm_slot(unsigned int progress_max, + enum scan_result *result, struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) { @@ -2413,9 +2410,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; - int progress = 0; + unsigned int progress_prev = cc->progress; - VM_BUG_ON(!pages); lockdep_assert_held(&khugepaged_mm_lock); *result = SCAN_FAIL; @@ -2438,7 +2434,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result if (unlikely(!mmap_read_trylock(mm))) goto breakouterloop_mmap_lock; - progress++; + cc->progress++; if (unlikely(hpage_collapse_test_exit_or_disable(mm))) goto breakouterloop; @@ -2448,17 +2444,17 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result cond_resched(); if (unlikely(hpage_collapse_test_exit_or_disable(mm))) { - progress++; + cc->progress++; break; } if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { - progress++; + cc->progress++; continue; } hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); if (khugepaged_scan.address > hend) { - progress++; + cc->progress++; continue; } if (khugepaged_scan.address < hstart) @@ -2467,7 +2463,6 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result while (khugepaged_scan.address < hend) { bool mmap_locked = true; - unsigned int cur_progress = 0; cond_resched(); if 
(unlikely(hpage_collapse_test_exit_or_disable(mm))) @@ -2484,8 +2479,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result mmap_read_unlock(mm); mmap_locked = false; *result = hpage_collapse_scan_file(mm, - khugepaged_scan.address, file, pgoff, - &cur_progress, cc); + khugepaged_scan.address, file, pgoff, cc); fput(file); if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { mmap_read_lock(mm); @@ -2499,8 +2493,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result } } else { *result = hpage_collapse_scan_pmd(mm, vma, - khugepaged_scan.address, &mmap_locked, - &cur_progress, cc); + khugepaged_scan.address, &mmap_locked, cc); } if (*result == SCAN_SUCCEED) @@ -2508,7 +2501,6 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; - progress += cur_progress; if (!mmap_locked) /* * We released mmap_lock so break loop. Note @@ -2518,7 +2510,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result * correct result back to caller. 
*/ goto breakouterloop_mmap_lock; - if (progress >= pages) + if (cc->progress >= progress_max) goto breakouterloop; } } @@ -2549,9 +2541,8 @@ breakouterloop_mmap_lock: collect_mm_slot(slot); } - trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL); - - return progress; + trace_mm_khugepaged_scan(mm, cc->progress - progress_prev, + khugepaged_scan.mm_slot == NULL); } static int khugepaged_has_work(void) @@ -2567,13 +2558,14 @@ static int khugepaged_wait_event(void) static void khugepaged_do_scan(struct collapse_control *cc) { - unsigned int progress = 0, pass_through_head = 0; - unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); + const unsigned int progress_max = READ_ONCE(khugepaged_pages_to_scan); + unsigned int pass_through_head = 0; bool wait = true; enum scan_result result = SCAN_SUCCEED; lru_add_drain_all(); + cc->progress = 0; while (true) { cond_resched(); @@ -2585,13 +2577,12 @@ static void khugepaged_do_scan(struct collapse_control *cc) pass_through_head++; if (khugepaged_has_work() && pass_through_head < 2) - progress += khugepaged_scan_mm_slot(pages - progress, - &result, cc); + khugepaged_scan_mm_slot(progress_max, &result, cc); else - progress = pages; + cc->progress = progress_max; spin_unlock(&khugepaged_mm_lock); - if (progress >= pages) + if (cc->progress >= progress_max) break; if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { @@ -2797,6 +2788,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, if (!cc) return -ENOMEM; cc->is_khugepaged = false; + cc->progress = 0; mmgrab(mm); lru_add_drain_all(); @@ -2831,7 +2823,7 @@ retry: mmap_locked = false; *lock_dropped = true; result = hpage_collapse_scan_file(mm, addr, file, pgoff, - NULL, cc); + cc); if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb && mapping_can_writeback(file->f_mapping)) { @@ -2846,7 +2838,7 @@ retry: fput(file); } else { result = hpage_collapse_scan_pmd(mm, vma, addr, - &mmap_locked, NULL, cc); + &mmap_locked, cc); } if 
(!mmap_locked) *lock_dropped = true; From 6cc153f90b7cf07db2b49469dfd79141b145036a Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Sat, 21 Feb 2026 17:39:17 +0800 Subject: [PATCH 076/369] mm: add folio_test_lazyfree helper Add folio_test_lazyfree() function to identify lazy-free folios to improve code readability. Link: https://lkml.kernel.org/r/20260221093918.1456187-4-vernon2gm@gmail.com Signed-off-by: Vernon Yang Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Lance Yang Reviewed-by: Dev Jain Reviewed-by: Barry Song Cc: Baolin Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 +++++ mm/rmap.c | 2 +- mm/vmscan.c | 5 ++--- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f7a0e4af0c73..415e9f2ef616 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -724,6 +724,11 @@ static __always_inline bool folio_test_anon(const struct folio *folio) return ((unsigned long)folio->mapping & FOLIO_MAPPING_ANON) != 0; } +static __always_inline bool folio_test_lazyfree(const struct folio *folio) +{ + return folio_test_anon(folio) && !folio_test_swapbacked(folio); +} + static __always_inline bool PageAnonNotKsm(const struct page *page) { unsigned long flags = (unsigned long)page_folio(page)->mapping; diff --git a/mm/rmap.c b/mm/rmap.c index 8f08090d7eb9..5fd22ba59d35 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2060,7 +2060,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, } if (!pvmw.pte) { - if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { + if (folio_test_lazyfree(folio)) { if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio)) goto walk_done; /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 031c5c035a82..d531040a3593 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -963,8 
+963,7 @@ static void folio_check_dirty_writeback(struct folio *folio, * They could be mistakenly treated as file lru. So further anon * test is needed. */ - if (!folio_is_file_lru(folio) || - (folio_test_anon(folio) && !folio_test_swapbacked(folio))) { + if (!folio_is_file_lru(folio) || folio_test_lazyfree(folio)) { *dirty = false; *writeback = false; return; @@ -1508,7 +1507,7 @@ retry: } } - if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { + if (folio_test_lazyfree(folio)) { /* follow __remove_mapping for reference */ if (!folio_ref_freeze(folio, 1)) goto keep_locked; From 05620419776ab07f1d057bdca5be846f263df1fd Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Sat, 21 Feb 2026 17:39:18 +0800 Subject: [PATCH 077/369] mm: khugepaged: skip lazy-free folios For example, create three task: hot1 -> cold -> hot2. After all three task are created, each allocate memory 128MB. the hot1/hot2 task continuously access 128 MB memory, while the cold task only accesses its memory briefly and then call madvise(MADV_FREE). However, khugepaged still prioritizes scanning the cold task and only scans the hot2 task after completing the scan of the cold task. All folios in VM_DROPPABLE are lazyfree, Collapsing maintains that property, so we can just collapse and memory pressure in the future will free it up. In contrast, collapsing in !VM_DROPPABLE does not maintain that property, the collapsed folio will not be lazyfree and memory pressure in the future will not be able to free it up. So if the user has explicitly informed us via MADV_FREE that this memory will be freed, and this vma does not have VM_DROPPABLE flags, it is appropriate for khugepaged to skip it only, thereby avoiding unnecessary scan and collapse operations to reducing CPU wastage. 
Here are the performance test results: (Throughput bigger is better, other smaller is better) Testing on x86_64 machine: | task hot2 | without patch | with patch | delta | |---------------------|---------------|---------------|---------| | total accesses time | 3.14 sec | 2.93 sec | -6.69% | | cycles per access | 4.96 | 2.21 | -55.44% | | Throughput | 104.38 M/sec | 111.89 M/sec | +7.19% | | dTLB-load-misses | 284814532 | 69597236 | -75.56% | Testing on qemu-system-x86_64 -enable-kvm: | task hot2 | without patch | with patch | delta | |---------------------|---------------|---------------|---------| | total accesses time | 3.35 sec | 2.96 sec | -11.64% | | cycles per access | 7.29 | 2.07 | -71.60% | | Throughput | 97.67 M/sec | 110.77 M/sec | +13.41% | | dTLB-load-misses | 241600871 | 3216108 | -98.67% | [vernon2gm@gmail.com: add comment about VM_DROPPABLE in code, make it clearer] Link: https://lkml.kernel.org/r/i4uowkt4h2ev47obm5h2vtd4zbk6fyw5g364up7kkjn2vmcikq@auepvqethj5r Link: https://lkml.kernel.org/r/20260221093918.1456187-5-vernon2gm@gmail.com Signed-off-by: Vernon Yang Acked-by: David Hildenbrand (arm) Reviewed-by: Lance Yang Reviewed-by: Barry Song Cc: Baolin Wang Cc: Dev Jain Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Zi Yan Signed-off-by: Andrew Morton --- include/trace/events/huge_memory.h | 1 + mm/khugepaged.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 384e29f6bef0..bcdc57eea270 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -25,6 +25,7 @@ EM( SCAN_PAGE_LRU, "page_not_in_lru") \ EM( SCAN_PAGE_LOCK, "page_locked") \ EM( SCAN_PAGE_ANON, "page_not_anon") \ + EM( SCAN_PAGE_LAZYFREE, "page_lazyfree") \ EM( SCAN_PAGE_COMPOUND, "page_compound") \ EM( SCAN_ANY_PROCESS, "no_process_for_page") \ EM( SCAN_VMA_NULL, 
"vma_null") \ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 8ee3c44bc851..13b0fe50dfc5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -46,6 +46,7 @@ enum scan_result { SCAN_PAGE_LRU, SCAN_PAGE_LOCK, SCAN_PAGE_ANON, + SCAN_PAGE_LAZYFREE, SCAN_PAGE_COMPOUND, SCAN_ANY_PROCESS, SCAN_VMA_NULL, @@ -577,6 +578,16 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma, folio = page_folio(page); VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio); + /* + * If the vma has the VM_DROPPABLE flag, the collapse will + * preserve the lazyfree property without needing to skip. + */ + if (cc->is_khugepaged && !(vma->vm_flags & VM_DROPPABLE) && + folio_test_lazyfree(folio) && !pte_dirty(pteval)) { + result = SCAN_PAGE_LAZYFREE; + goto out; + } + /* See hpage_collapse_scan_pmd(). */ if (folio_maybe_mapped_shared(folio)) { ++shared; @@ -1325,6 +1336,16 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm, } folio = page_folio(page); + /* + * If the vma has the VM_DROPPABLE flag, the collapse will + * preserve the lazyfree property without needing to skip. + */ + if (cc->is_khugepaged && !(vma->vm_flags & VM_DROPPABLE) && + folio_test_lazyfree(folio) && !pte_dirty(pteval)) { + result = SCAN_PAGE_LAZYFREE; + goto out_unmap; + } + if (!folio_test_anon(folio)) { result = SCAN_PAGE_ANON; goto out_unmap; From 514c2fe9927e91f44eb3f53f2e175d232bd2a989 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 23 Feb 2026 17:39:20 +0100 Subject: [PATCH 078/369] mm: centralize+fix comments about compound_mapcount() in new sync_with_folio_pmd_zap() We still mention compound_mapcount() in two comments. Instead of simply referring to the folio mapcount in both places, let's factor out the odd-looking PTL sync into sync_with_folio_pmd_zap(), and add centralized documentation why this is required. 
[akpm@linux-foundation.org: update comment per Matthew and David] Link: https://lkml.kernel.org/r/20260223163920.287720-1-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Cc: Lorenzo Stoakes Cc: Liam Howlett Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: Rik van Riel Cc: Harry Yoo Cc: Jann Horn Signed-off-by: Andrew Morton --- mm/internal.h | 19 +++++++++++++++++++ mm/memory.c | 8 +------- mm/page_vma_mapped.c | 11 ++--------- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index cb0af847d7d9..39ab37bb0e1d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -516,6 +516,25 @@ void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *desc); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); +/** + * sync_with_folio_pmd_zap - sync with concurrent zapping of a folio PMD + * @mm: The mm_struct. + * @pmdp: Pointer to the pmd that was found to be pmd_none(). + * + * When we find a pmd_none() while unmapping a folio without holding the PTL, + * zap_huge_pmd() may have cleared the PMD but not yet modified the folio to + * indicate that it's unmapped. Skipping the PMD without synchronization could + * make folio unmapping code assume that unmapping failed. + * + * Wait for concurrent zapping to complete by grabbing the PTL. 
+ */ +static inline void sync_with_folio_pmd_zap(struct mm_struct *mm, pmd_t *pmdp) +{ + spinlock_t *ptl = pmd_lock(mm, pmdp); + + spin_unlock(ptl); +} + struct zap_details; void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, diff --git a/mm/memory.c b/mm/memory.c index af26a697562b..f78ab3869f8d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1993,13 +1993,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, } else if (details && details->single_folio && folio_test_pmd_mappable(details->single_folio) && next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { - spinlock_t *ptl = pmd_lock(tlb->mm, pmd); - /* - * Take and drop THP pmd lock so that we cannot return - * prematurely, while zap_huge_pmd() has cleared *pmd, - * but not yet decremented compound_mapcount(). - */ - spin_unlock(ptl); + sync_with_folio_pmd_zap(tlb->mm, pmd); } if (pmd_none(*pmd)) { addr = next; diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index b38a1d00c971..a4d52fdb3056 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -269,11 +269,6 @@ restart: spin_unlock(pvmw->ptl); pvmw->ptl = NULL; } else if (!pmd_present(pmde)) { - /* - * If PVMW_SYNC, take and drop THP pmd lock so that we - * cannot return prematurely, while zap_huge_pmd() has - * cleared *pmd but not decremented compound_mapcount(). 
- */ const softleaf_t entry = softleaf_from_pmd(pmde); if (softleaf_is_device_private(entry)) { @@ -284,11 +279,9 @@ restart: if ((pvmw->flags & PVMW_SYNC) && thp_vma_suitable_order(vma, pvmw->address, PMD_ORDER) && - (pvmw->nr_pages >= HPAGE_PMD_NR)) { - spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); + (pvmw->nr_pages >= HPAGE_PMD_NR)) + sync_with_folio_pmd_zap(mm, pvmw->pmd); - spin_unlock(ptl); - } step_forward(pvmw, PMD_SIZE); continue; } From 63de231ef02afbe5b2a6277c3bdf8d0d7f7e1d21 Mon Sep 17 00:00:00 2001 From: "Pratyush Yadav (Google)" Date: Fri, 13 Feb 2026 09:59:12 +0100 Subject: [PATCH 079/369] kho: move alloc tag init to kho_init_{folio,pages}() Commit 8f1081892d62 ("kho: simplify page initialization in kho_restore_page()") cleaned up the page initialization logic by moving the folio and 0-order-page paths into separate functions. It missed moving the alloc tag initialization. Do it now to keep the two paths cleanly separated. While at it, touch up the comments to be a tiny bit shorter (mainly so it doesn't end up splitting into a multiline comment). This is purely a cosmetic change and there should be no change in behaviour. Link: https://lkml.kernel.org/r/20260213085914.2778107-1-pratyush@kernel.org Signed-off-by: Pratyush Yadav (Google) Cc: Alexander Graf Cc: Mike Rapoport Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index cc68a3692905..4356f277b462 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -219,8 +219,11 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, /* For physically contiguous 0-order pages. 
*/ static void kho_init_pages(struct page *page, unsigned long nr_pages) { - for (unsigned long i = 0; i < nr_pages; i++) + for (unsigned long i = 0; i < nr_pages; i++) { set_page_count(page + i, 1); + /* Clear each page's codetag to avoid accounting mismatch. */ + clear_page_tag_ref(page + i); + } } static void kho_init_folio(struct page *page, unsigned int order) @@ -229,6 +232,8 @@ static void kho_init_folio(struct page *page, unsigned int order) /* Head page gets refcount of 1. */ set_page_count(page, 1); + /* Clear head page's codetag to avoid accounting mismatch. */ + clear_page_tag_ref(page); /* For higher order folios, tail pages get a page count of zero. */ for (unsigned long i = 1; i < nr_pages; i++) @@ -265,14 +270,6 @@ static struct page *kho_restore_page(phys_addr_t phys, bool is_folio) else kho_init_pages(page, nr_pages); - /* Always mark headpage's codetag as empty to avoid accounting mismatch */ - clear_page_tag_ref(page); - if (!is_folio) { - /* Also do that for the non-compound tail pages */ - for (unsigned int i = 1; i < nr_pages; i++) - clear_page_tag_ref(page + i); - } - adjust_managed_page_count(page, nr_pages); return page; } From 3f2ad90060f65d6f66414b8a67c569154bafec7b Mon Sep 17 00:00:00 2001 From: Jason Miu Date: Thu, 5 Feb 2026 18:14:27 -0800 Subject: [PATCH 080/369] kho: adopt radix tree for preserved memory tracking Patch series "Make KHO Stateless", v9. This series transitions KHO from an xarray-based metadata tracking system with serialization to a radix tree data structure that can be passed directly to the next kernel. The key motivations for this change are to: - Eliminate the need for data serialization before kexec. - Remove the KHO finalize state. - Pass preservation metadata more directly to the next kernel via the FDT. The new approach uses a radix tree to mark preserved pages. A page's physical address and its order are encoded into a single value. 
The tree is composed of multiple levels of page-sized tables, with leaf nodes being bitmaps where each set bit represents a preserved page. The physical address of the radix tree's root is passed in the FDT, allowing the next kernel to reconstruct the preserved memory map. This series is broken down into the following patches: 1. kho: Adopt radix tree for preserved memory tracking: Replaces the xarray-based tracker with the new radix tree implementation and increments the ABI version. 2. kho: Remove finalize state and clients: Removes the now-obsolete kho_finalize() function and its usage from client code and debugfs. This patch (of 2): Introduce a radix tree implementation for tracking preserved memory pages and switch the KHO memory tracking mechanism to use it. This lays the groundwork for a stateless KHO implementation that eliminates the need for serialization and the associated "finalize" state. This patch introduces the core radix tree data structures and constants to the KHO ABI. It adds the radix tree node and leaf structures, along with documentation for the radix tree key encoding scheme that combines a page's physical address and order. To support broader use by other kernel subsystems, such as hugetlb preservation, the core radix tree manipulation functions are exported as a public API. The xarray-based memory tracking is replaced with this new radix tree implementation. The core KHO preservation and unpreservation functions are wired up to use the radix tree helpers. On boot, the second kernel restores the preserved memory map by walking the radix tree whose root physical address is passed via the FDT. The ABI `compatible` version is bumped to "kho-v2" to reflect the structural changes in the preserved memory map and sub-FDT property names. This includes renaming "fdt" to "preserved-data" to better reflect that preserved state may use formats other than FDT. 
[ran.xiaokai@zte.com.cn: fix child node parsing for debugfs in/sub_fdts] Link: https://lkml.kernel.org/r/20260309033530.244508-1-ranxiaokai627@163.com Link: https://lkml.kernel.org/r/20260206021428.3386442-1-jasonmiu@google.com Link: https://lkml.kernel.org/r/20260206021428.3386442-2-jasonmiu@google.com Signed-off-by: Jason Miu Signed-off-by: Ran Xiaokai Reviewed-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Cc: David Matlack Cc: David Rientjes Cc: Jason Gunthorpe Cc: Pratyush Yadav Cc: Ran Xiaokai Signed-off-by: Andrew Morton --- Documentation/core-api/kho/abi.rst | 6 + Documentation/core-api/kho/index.rst | 12 + include/linux/kho/abi/kexec_handover.h | 144 ++++- include/linux/kho_radix_tree.h | 70 +++ kernel/liveupdate/kexec_handover.c | 651 +++++++++++---------- kernel/liveupdate/kexec_handover_debugfs.c | 3 +- 6 files changed, 563 insertions(+), 323 deletions(-) create mode 100644 include/linux/kho_radix_tree.h diff --git a/Documentation/core-api/kho/abi.rst b/Documentation/core-api/kho/abi.rst index 2e63be3486cf..799d743105a6 100644 --- a/Documentation/core-api/kho/abi.rst +++ b/Documentation/core-api/kho/abi.rst @@ -22,6 +22,12 @@ memblock preservation ABI .. kernel-doc:: include/linux/kho/abi/memblock.h :doc: memblock kexec handover ABI +KHO persistent memory tracker ABI +================================= + +.. kernel-doc:: include/linux/kho/abi/kexec_handover.h + :doc: KHO persistent memory tracker + See Also ======== diff --git a/Documentation/core-api/kho/index.rst b/Documentation/core-api/kho/index.rst index dcc6a36cc134..002bdf0beb2e 100644 --- a/Documentation/core-api/kho/index.rst +++ b/Documentation/core-api/kho/index.rst @@ -83,6 +83,18 @@ called serialization. When the FDT is generated, some properties of the system may become immutable because they are already written down in the FDT. That state is called the KHO finalization phase. 
+Kexec Handover Radix Tree +========================= + +.. kernel-doc:: include/linux/kho_radix_tree.h + :doc: Kexec Handover Radix Tree + +Public API +========== + +.. kernel-doc:: kernel/liveupdate/kexec_handover.c + :export: + See Also ======== diff --git a/include/linux/kho/abi/kexec_handover.h b/include/linux/kho/abi/kexec_handover.h index 2201a0d2c159..6b7d8ef550f9 100644 --- a/include/linux/kho/abi/kexec_handover.h +++ b/include/linux/kho/abi/kexec_handover.h @@ -10,8 +10,13 @@ #ifndef _LINUX_KHO_ABI_KEXEC_HANDOVER_H #define _LINUX_KHO_ABI_KEXEC_HANDOVER_H +#include +#include +#include #include +#include + /** * DOC: Kexec Handover ABI * @@ -29,32 +34,32 @@ * compatibility is only guaranteed for kernels supporting the same ABI version. * * FDT Structure Overview: - * The FDT serves as a central registry for physical - * addresses of preserved data structures and sub-FDTs. The first kernel - * populates this FDT with references to memory regions and other FDTs that - * need to persist across the kexec transition. The subsequent kernel then - * parses this FDT to locate and restore the preserved data.:: + * The FDT serves as a central registry for physical addresses of preserved + * data structures. The first kernel populates this FDT with references to + * memory regions and other metadata that need to persist across the kexec + * transition. The subsequent kernel then parses this FDT to locate and + * restore the preserved data.:: * * / { - * compatible = "kho-v1"; + * compatible = "kho-v2"; * * preserved-memory-map = <0x...>; * * { - * fdt = <0x...>; + * preserved-data = <0x...>; * }; * * { - * fdt = <0x...>; + * preserved-data = <0x...>; * }; * ... ... * { - * fdt = <0x...>; + * preserved-data = <0x...>; * }; * }; * * Root KHO Node (/): - * - compatible: "kho-v1" + * - compatible: "kho-v2" * * Indentifies the overall KHO ABI version. * @@ -69,20 +74,20 @@ * is provided by the subsystem that uses KHO for preserving its * data. 
* - * - fdt: u64 + * - preserved-data: u64 * - * Physical address pointing to a subnode FDT blob that is also + * Physical address pointing to a subnode data blob that is also * being preserved. */ /* The compatible string for the KHO FDT root node. */ -#define KHO_FDT_COMPATIBLE "kho-v1" +#define KHO_FDT_COMPATIBLE "kho-v2" /* The FDT property for the preserved memory map. */ #define KHO_FDT_MEMORY_MAP_PROP_NAME "preserved-memory-map" -/* The FDT property for sub-FDTs. */ -#define KHO_FDT_SUB_TREE_PROP_NAME "fdt" +/* The FDT property for preserved data blobs. */ +#define KHO_FDT_SUB_TREE_PROP_NAME "preserved-data" /** * DOC: Kexec Handover ABI for vmalloc Preservation @@ -160,4 +165,113 @@ struct kho_vmalloc { unsigned short order; }; +/** + * DOC: KHO persistent memory tracker + * + * KHO tracks preserved memory using a radix tree data structure. Each node of + * the tree is exactly a single page. The leaf nodes are bitmaps where each set + * bit is a preserved page of any order. The intermediate nodes are tables of + * physical addresses that point to a lower level node. + * + * The tree hierarchy is shown below:: + * + * root + * +-------------------+ + * | Level 5 | (struct kho_radix_node) + * +-------------------+ + * | + * v + * +-------------------+ + * | Level 4 | (struct kho_radix_node) + * +-------------------+ + * | + * | ... (intermediate levels) + * | + * v + * +-------------------+ + * | Level 0 | (struct kho_radix_leaf) + * +-------------------+ + * + * The tree is traversed using a key that encodes the page's physical address + * (pa) and its order into a single unsigned long value. 
The encoded key value + * is composed of two parts: the 'order bit' in the upper part and the + * 'shifted physical address' in the lower part.:: + * + * +------------+-----------------------------+--------------------------+ + * | Page Order | Order Bit | Shifted Physical Address | + * +------------+-----------------------------+--------------------------+ + * | 0 | ...000100 ... (at bit 52) | pa >> (PAGE_SHIFT + 0) | + * | 1 | ...000010 ... (at bit 51) | pa >> (PAGE_SHIFT + 1) | + * | 2 | ...000001 ... (at bit 50) | pa >> (PAGE_SHIFT + 2) | + * | ... | ... | ... | + * +------------+-----------------------------+--------------------------+ + * + * Shifted Physical Address: + * The 'shifted physical address' is the physical address normalized for its + * order. It effectively represents the PFN shifted right by the order. + * + * Order Bit: + * The 'order bit' encodes the page order by setting a single bit at a + * specific position. The position of this bit itself represents the order. + * + * For instance, on a 64-bit system with 4KB pages (PAGE_SHIFT = 12), the + * maximum range for the shifted physical address (for order 0) is 52 bits + * (64 - 12). This address occupies bits [0-51]. For order 0, the order bit is + * set at position 52. + * + * The following diagram illustrates how the encoded key value is split into + * indices for the tree levels, with PAGE_SIZE of 4KB:: + * + * 63:60 59:51 50:42 41:33 32:24 23:15 14:0 + * +---------+--------+--------+--------+--------+--------+-----------------+ + * | 0 | Lv 5 | Lv 4 | Lv 3 | Lv 2 | Lv 1 | Lv 0 (bitmap) | + * +---------+--------+--------+--------+--------+--------+-----------------+ + * + * The radix tree stores pages of all orders in a single 6-level hierarchy. It + * efficiently shares higher tree levels, especially due to common zero top + * address bits, allowing a single, efficient algorithm to manage all + * pages. 
This bitmap approach also offers memory efficiency; for example, a + * 512KB bitmap can cover a 16GB memory range for 0-order pages with PAGE_SIZE = + * 4KB. + * + * The data structures defined here are part of the KHO ABI. Any modification + * to these structures that breaks backward compatibility must be accompanied by + * an update to the "compatible" string. This ensures that a newer kernel can + * correctly interpret the data passed by an older kernel. + */ + +/* + * Defines constants for the KHO radix tree structure, used to track preserved + * memory. These constants govern the indexing, sizing, and depth of the tree. + */ +enum kho_radix_consts { + /* + * The bit position of the order bit (and also the length of the + * shifted physical address) for an order-0 page. + */ + KHO_ORDER_0_LOG2 = 64 - PAGE_SHIFT, + + /* Size of the table in kho_radix_node, in log2 */ + KHO_TABLE_SIZE_LOG2 = const_ilog2(PAGE_SIZE / sizeof(phys_addr_t)), + + /* Number of bits in the kho_radix_leaf bitmap, in log2 */ + KHO_BITMAP_SIZE_LOG2 = PAGE_SHIFT + const_ilog2(BITS_PER_BYTE), + + /* + * The total tree depth is the number of intermediate levels + * and 1 bitmap level. + */ + KHO_TREE_MAX_DEPTH = + DIV_ROUND_UP(KHO_ORDER_0_LOG2 - KHO_BITMAP_SIZE_LOG2, + KHO_TABLE_SIZE_LOG2) + 1, +}; + +struct kho_radix_node { + u64 table[1 << KHO_TABLE_SIZE_LOG2]; +}; + +struct kho_radix_leaf { + DECLARE_BITMAP(bitmap, 1 << KHO_BITMAP_SIZE_LOG2); +}; + #endif /* _LINUX_KHO_ABI_KEXEC_HANDOVER_H */ diff --git a/include/linux/kho_radix_tree.h b/include/linux/kho_radix_tree.h new file mode 100644 index 000000000000..84e918b96e53 --- /dev/null +++ b/include/linux/kho_radix_tree.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_KHO_RADIX_TREE_H +#define _LINUX_KHO_RADIX_TREE_H + +#include +#include +#include +#include + +/** + * DOC: Kexec Handover Radix Tree + * + * This is a radix tree implementation for tracking physical memory pages + * across kexec transitions. 
It was developed for the KHO mechanism but is
+ * designed for broader use by any subsystem that needs to preserve pages.
+ *
+ * The radix tree is a multi-level tree where leaf nodes are bitmaps
+ * representing individual pages. To allow pages of different sizes (orders)
+ * to be stored efficiently in a single tree, it uses a unique key encoding
+ * scheme. Each key is an unsigned long that combines a page's physical
+ * address and its order.
+ *
+ * Client code is responsible for allocating the root node of the tree,
+ * initializing the mutex lock, and managing its lifecycle. It must use the
+ * tree data structures defined in the KHO ABI,
+ * `include/linux/kho/abi/kexec_handover.h`.
+ */
+
+struct kho_radix_node;
+
+struct kho_radix_tree {
+ struct kho_radix_node *root;
+ struct mutex lock; /* protects the tree's structure and root pointer */
+};
+
+typedef int (*kho_radix_tree_walk_callback_t)(phys_addr_t phys,
+ unsigned int order);
+
+#ifdef CONFIG_KEXEC_HANDOVER
+
+int kho_radix_add_page(struct kho_radix_tree *tree, unsigned long pfn,
+ unsigned int order);
+
+void kho_radix_del_page(struct kho_radix_tree *tree, unsigned long pfn,
+ unsigned int order);
+
+int kho_radix_walk_tree(struct kho_radix_tree *tree,
+ kho_radix_tree_walk_callback_t cb);
+
+#else /* #ifdef CONFIG_KEXEC_HANDOVER */
+
+static inline int kho_radix_add_page(struct kho_radix_tree *tree, unsigned long pfn,
+ unsigned int order)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void kho_radix_del_page(struct kho_radix_tree *tree,
+ unsigned long pfn, unsigned int order) { }
+
+static inline int kho_radix_walk_tree(struct kho_radix_tree *tree,
+ kho_radix_tree_walk_callback_t cb)
+{
+ return -EOPNOTSUPP;
+}
+
+#endif /* #ifdef CONFIG_KEXEC_HANDOVER */
+
+#endif /* #ifdef _LINUX_KHO_RADIX_TREE_H */ diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 4356f277b462..ad877926f3f6 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ 
-5,6 +5,7 @@ * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport * Copyright (C) 2025 Google LLC, Changyuan Lyu * Copyright (C) 2025 Pasha Tatashin + * Copyright (C) 2026 Google LLC, Jason Miu */ #define pr_fmt(fmt) "KHO: " fmt @@ -15,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -64,158 +66,310 @@ static int __init kho_parse_enable(char *p) } early_param("kho", kho_parse_enable); -/* - * Keep track of memory that is to be preserved across KHO. - * - * The serializing side uses two levels of xarrays to manage chunks of per-order - * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order - * of a 8TB system would fit inside a single 4096 byte bitmap. For order 0 - * allocations each bitmap will cover 128M of address space. Thus, for 16G of - * memory at most 512K of bitmap memory will be needed for order 0. - * - * This approach is fully incremental, as the serialization progresses folios - * can continue be aggregated to the tracker. The final step, immediately prior - * to kexec would serialize the xarray information into a linked list for the - * successor kernel to parse. - */ - -#define PRESERVE_BITS (PAGE_SIZE * 8) - -struct kho_mem_phys_bits { - DECLARE_BITMAP(preserve, PRESERVE_BITS); -}; - -static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE); - -struct kho_mem_phys { - /* - * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized - * to order. 
- */ - struct xarray phys_bits; -}; - -struct kho_mem_track { - /* Points to kho_mem_phys, each order gets its own bitmap tree */ - struct xarray orders; -}; - -struct khoser_mem_chunk; - struct kho_out { void *fdt; bool finalized; struct mutex lock; /* protects KHO FDT finalization */ - struct kho_mem_track track; + struct kho_radix_tree radix_tree; struct kho_debugfs dbg; }; static struct kho_out kho_out = { .lock = __MUTEX_INITIALIZER(kho_out.lock), - .track = { - .orders = XARRAY_INIT(kho_out.track.orders, 0), + .radix_tree = { + .lock = __MUTEX_INITIALIZER(kho_out.radix_tree.lock), }, .finalized = false, }; -static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) +/** + * kho_radix_encode_key - Encodes a physical address and order into a radix key. + * @phys: The physical address of the page. + * @order: The order of the page. + * + * This function combines a page's physical address and its order into a + * single unsigned long, which is used as a key for all radix tree + * operations. + * + * Return: The encoded unsigned long radix key. + */ +static unsigned long kho_radix_encode_key(phys_addr_t phys, unsigned int order) { - void *res = xa_load(xa, index); + /* Order bits part */ + unsigned long h = 1UL << (KHO_ORDER_0_LOG2 - order); + /* Shifted physical address part */ + unsigned long l = phys >> (PAGE_SHIFT + order); - if (res) - return res; - - void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL); - - if (!elm) - return ERR_PTR(-ENOMEM); - - if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE))) - return ERR_PTR(-EINVAL); - - res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); - if (xa_is_err(res)) - return ERR_PTR(xa_err(res)); - else if (res) - return res; - - return no_free_ptr(elm); + return h | l; } -static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn, - unsigned int order) +/** + * kho_radix_decode_key - Decodes a radix key back into a physical address and order. 
+ * @key: The unsigned long key to decode. + * @order: An output parameter, a pointer to an unsigned int where the decoded + * page order will be stored. + * + * This function reverses the encoding performed by kho_radix_encode_key(), + * extracting the original physical address and page order from a given key. + * + * Return: The decoded physical address. + */ +static phys_addr_t kho_radix_decode_key(unsigned long key, unsigned int *order) { - struct kho_mem_phys_bits *bits; - struct kho_mem_phys *physxa; - const unsigned long pfn_high = pfn >> order; + unsigned int order_bit = fls64(key); + phys_addr_t phys; - physxa = xa_load(&track->orders, order); - if (WARN_ON_ONCE(!physxa)) - return; + /* order_bit is numbered starting at 1 from fls64 */ + *order = KHO_ORDER_0_LOG2 - order_bit + 1; + /* The order is discarded by the shift */ + phys = key << (PAGE_SHIFT + *order); - bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); - if (WARN_ON_ONCE(!bits)) - return; - - clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); + return phys; } -static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, - unsigned long end_pfn) +static unsigned long kho_radix_get_bitmap_index(unsigned long key) +{ + return key % (1 << KHO_BITMAP_SIZE_LOG2); +} + +static unsigned long kho_radix_get_table_index(unsigned long key, + unsigned int level) +{ + int s; + + s = ((level - 1) * KHO_TABLE_SIZE_LOG2) + KHO_BITMAP_SIZE_LOG2; + return (key >> s) % (1 << KHO_TABLE_SIZE_LOG2); +} + +/** + * kho_radix_add_page - Marks a page as preserved in the radix tree. + * @tree: The KHO radix tree. + * @pfn: The page frame number of the page to preserve. + * @order: The order of the page. + * + * This function traverses the radix tree based on the key derived from @pfn + * and @order. It sets the corresponding bit in the leaf bitmap to mark the + * page for preservation. If intermediate nodes do not exist along the path, + * they are allocated and added to the tree. 
+ * + * Return: 0 on success, or a negative error code on failure. + */ +int kho_radix_add_page(struct kho_radix_tree *tree, + unsigned long pfn, unsigned int order) +{ + /* Newly allocated nodes for error cleanup */ + struct kho_radix_node *intermediate_nodes[KHO_TREE_MAX_DEPTH] = { 0 }; + unsigned long key = kho_radix_encode_key(PFN_PHYS(pfn), order); + struct kho_radix_node *anchor_node = NULL; + struct kho_radix_node *node = tree->root; + struct kho_radix_node *new_node; + unsigned int i, idx, anchor_idx; + struct kho_radix_leaf *leaf; + int err = 0; + + if (WARN_ON_ONCE(!tree->root)) + return -EINVAL; + + might_sleep(); + + guard(mutex)(&tree->lock); + + /* Go from high levels to low levels */ + for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) { + idx = kho_radix_get_table_index(key, i); + + if (node->table[idx]) { + node = phys_to_virt(node->table[idx]); + continue; + } + + /* Next node is empty, create a new node for it */ + new_node = (struct kho_radix_node *)get_zeroed_page(GFP_KERNEL); + if (!new_node) { + err = -ENOMEM; + goto err_free_nodes; + } + + node->table[idx] = virt_to_phys(new_node); + + /* + * Capture the node where the new branch starts for cleanup + * if allocation fails. + */ + if (!anchor_node) { + anchor_node = node; + anchor_idx = idx; + } + intermediate_nodes[i] = new_node; + + node = new_node; + } + + /* Handle the leaf level bitmap (level 0) */ + idx = kho_radix_get_bitmap_index(key); + leaf = (struct kho_radix_leaf *)node; + __set_bit(idx, leaf->bitmap); + + return 0; + +err_free_nodes: + for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) { + if (intermediate_nodes[i]) + free_page((unsigned long)intermediate_nodes[i]); + } + if (anchor_node) + anchor_node->table[anchor_idx] = 0; + + return err; +} +EXPORT_SYMBOL_GPL(kho_radix_add_page); + +/** + * kho_radix_del_page - Removes a page's preservation status from the radix tree. + * @tree: The KHO radix tree. + * @pfn: The page frame number of the page to unpreserve. 
+ * @order: The order of the page. + * + * This function traverses the radix tree and clears the bit corresponding to + * the page, effectively removing its "preserved" status. It does not free + * the tree's intermediate nodes, even if they become empty. + */ +void kho_radix_del_page(struct kho_radix_tree *tree, unsigned long pfn, + unsigned int order) +{ + unsigned long key = kho_radix_encode_key(PFN_PHYS(pfn), order); + struct kho_radix_node *node = tree->root; + struct kho_radix_leaf *leaf; + unsigned int i, idx; + + if (WARN_ON_ONCE(!tree->root)) + return; + + might_sleep(); + + guard(mutex)(&tree->lock); + + /* Go from high levels to low levels */ + for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) { + idx = kho_radix_get_table_index(key, i); + + /* + * Attempting to delete a page that has not been preserved, + * return with a warning. + */ + if (WARN_ON(!node->table[idx])) + return; + + node = phys_to_virt(node->table[idx]); + } + + /* Handle the leaf level bitmap (level 0) */ + leaf = (struct kho_radix_leaf *)node; + idx = kho_radix_get_bitmap_index(key); + __clear_bit(idx, leaf->bitmap); +} +EXPORT_SYMBOL_GPL(kho_radix_del_page); + +static int kho_radix_walk_leaf(struct kho_radix_leaf *leaf, + unsigned long key, + kho_radix_tree_walk_callback_t cb) +{ + unsigned long *bitmap = (unsigned long *)leaf; + unsigned int order; + phys_addr_t phys; + unsigned int i; + int err; + + for_each_set_bit(i, bitmap, PAGE_SIZE * BITS_PER_BYTE) { + phys = kho_radix_decode_key(key | i, &order); + err = cb(phys, order); + if (err) + return err; + } + + return 0; +} + +static int __kho_radix_walk_tree(struct kho_radix_node *root, + unsigned int level, unsigned long start, + kho_radix_tree_walk_callback_t cb) +{ + struct kho_radix_node *node; + struct kho_radix_leaf *leaf; + unsigned long key, i; + unsigned int shift; + int err; + + for (i = 0; i < PAGE_SIZE / sizeof(phys_addr_t); i++) { + if (!root->table[i]) + continue; + + shift = ((level - 1) * KHO_TABLE_SIZE_LOG2) + + 
KHO_BITMAP_SIZE_LOG2; + key = start | (i << shift); + + node = phys_to_virt(root->table[i]); + + if (level == 1) { + /* + * we are at level 1, + * node is pointing to the level 0 bitmap. + */ + leaf = (struct kho_radix_leaf *)node; + err = kho_radix_walk_leaf(leaf, key, cb); + } else { + err = __kho_radix_walk_tree(node, level - 1, + key, cb); + } + + if (err) + return err; + } + + return 0; +} + +/** + * kho_radix_walk_tree - Traverses the radix tree and calls a callback for each preserved page. + * @tree: A pointer to the KHO radix tree to walk. + * @cb: A callback function of type kho_radix_tree_walk_callback_t that will be + * invoked for each preserved page found in the tree. The callback receives + * the physical address and order of the preserved page. + * + * This function walks the radix tree, searching from the specified top level + * down to the lowest level (level 0). For each preserved page found, it invokes + * the provided callback, passing the page's physical address and order. + * + * Return: 0 if the walk completed the specified tree, or the non-zero return + * value from the callback that stopped the walk. 
+ */ +int kho_radix_walk_tree(struct kho_radix_tree *tree, + kho_radix_tree_walk_callback_t cb) +{ + if (WARN_ON_ONCE(!tree->root)) + return -EINVAL; + + guard(mutex)(&tree->lock); + + return __kho_radix_walk_tree(tree->root, KHO_TREE_MAX_DEPTH - 1, 0, cb); +} +EXPORT_SYMBOL_GPL(kho_radix_walk_tree); + +static void __kho_unpreserve(struct kho_radix_tree *tree, + unsigned long pfn, unsigned long end_pfn) { unsigned int order; while (pfn < end_pfn) { order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); - __kho_unpreserve_order(track, pfn, order); + kho_radix_del_page(tree, pfn, order); pfn += 1 << order; } } -static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, - unsigned int order) -{ - struct kho_mem_phys_bits *bits; - struct kho_mem_phys *physxa, *new_physxa; - const unsigned long pfn_high = pfn >> order; - - might_sleep(); - physxa = xa_load(&track->orders, order); - if (!physxa) { - int err; - - new_physxa = kzalloc_obj(*physxa); - if (!new_physxa) - return -ENOMEM; - - xa_init(&new_physxa->phys_bits); - physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa, - GFP_KERNEL); - - err = xa_err(physxa); - if (err || physxa) { - xa_destroy(&new_physxa->phys_bits); - kfree(new_physxa); - - if (err) - return err; - } else { - physxa = new_physxa; - } - } - - bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS); - if (IS_ERR(bits)) - return PTR_ERR(bits); - - set_bit(pfn_high % PRESERVE_BITS, bits->preserve); - - return 0; -} - /* For physically contiguous 0-order pages. */ static void kho_init_pages(struct page *page, unsigned long nr_pages) { @@ -318,161 +472,24 @@ struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages) } EXPORT_SYMBOL_GPL(kho_restore_pages); -/* Serialize and deserialize struct kho_mem_phys across kexec - * - * Record all the bitmaps in a linked list of pages for the next kernel to - * process. 
Each chunk holds bitmaps of the same order and each block of bitmaps - * starts at a given physical address. This allows the bitmaps to be sparse. The - * xarray is used to store them in a tree while building up the data structure, - * but the KHO successor kernel only needs to process them once in order. - * - * All of this memory is normal kmalloc() memory and is not marked for - * preservation. The successor kernel will remain isolated to the scratch space - * until it completes processing this list. Once processed all the memory - * storing these ranges will be marked as free. - */ - -struct khoser_mem_bitmap_ptr { - phys_addr_t phys_start; - DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *); -}; - -struct khoser_mem_chunk_hdr { - DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *); - unsigned int order; - unsigned int num_elms; -}; - -#define KHOSER_BITMAP_SIZE \ - ((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \ - sizeof(struct khoser_mem_bitmap_ptr)) - -struct khoser_mem_chunk { - struct khoser_mem_chunk_hdr hdr; - struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE]; -}; - -static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE); - -static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, - unsigned long order) +static int __init kho_preserved_memory_reserve(phys_addr_t phys, + unsigned int order) { - struct khoser_mem_chunk *chunk __free(free_page) = NULL; + union kho_page_info info; + struct page *page; + u64 sz; - chunk = (void *)get_zeroed_page(GFP_KERNEL); - if (!chunk) - return ERR_PTR(-ENOMEM); + sz = 1 << (order + PAGE_SHIFT); + page = phys_to_page(phys); - if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE))) - return ERR_PTR(-EINVAL); - - chunk->hdr.order = order; - if (cur_chunk) - KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk); - return no_free_ptr(chunk); -} - -static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) -{ - struct khoser_mem_chunk *chunk = first_chunk; - - while (chunk) 
{ - struct khoser_mem_chunk *tmp = chunk; - - chunk = KHOSER_LOAD_PTR(chunk->hdr.next); - free_page((unsigned long)tmp); - } -} - -/* - * Update memory map property, if old one is found discard it via - * kho_mem_ser_free(). - */ -static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk) -{ - void *ptr; - u64 phys; - - ptr = fdt_getprop_w(kho_out.fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, NULL); - - /* Check and discard previous memory map */ - phys = get_unaligned((u64 *)ptr); - if (phys) - kho_mem_ser_free((struct khoser_mem_chunk *)phys_to_virt(phys)); - - /* Update with the new value */ - phys = first_chunk ? (u64)virt_to_phys(first_chunk) : 0; - put_unaligned(phys, (u64 *)ptr); -} - -static int kho_mem_serialize(struct kho_out *kho_out) -{ - struct khoser_mem_chunk *first_chunk = NULL; - struct khoser_mem_chunk *chunk = NULL; - struct kho_mem_phys *physxa; - unsigned long order; - int err = -ENOMEM; - - xa_for_each(&kho_out->track.orders, order, physxa) { - struct kho_mem_phys_bits *bits; - unsigned long phys; - - chunk = new_chunk(chunk, order); - if (IS_ERR(chunk)) { - err = PTR_ERR(chunk); - goto err_free; - } - - if (!first_chunk) - first_chunk = chunk; - - xa_for_each(&physxa->phys_bits, phys, bits) { - struct khoser_mem_bitmap_ptr *elm; - - if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) { - chunk = new_chunk(chunk, order); - if (IS_ERR(chunk)) { - err = PTR_ERR(chunk); - goto err_free; - } - } - - elm = &chunk->bitmaps[chunk->hdr.num_elms]; - chunk->hdr.num_elms++; - elm->phys_start = (phys * PRESERVE_BITS) - << (order + PAGE_SHIFT); - KHOSER_STORE_PTR(elm->bitmap, bits); - } - } - - kho_update_memory_map(first_chunk); + /* Reserve the memory preserved in KHO in memblock */ + memblock_reserve(phys, sz); + memblock_reserved_mark_noinit(phys, sz); + info.magic = KHO_PAGE_MAGIC; + info.order = order; + page->private = info.page_private; return 0; - -err_free: - kho_mem_ser_free(first_chunk); - return err; -} - -static void __init 
deserialize_bitmap(unsigned int order, - struct khoser_mem_bitmap_ptr *elm) -{ - struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap); - unsigned long bit; - - for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) { - int sz = 1 << (order + PAGE_SHIFT); - phys_addr_t phys = - elm->phys_start + (bit << (order + PAGE_SHIFT)); - struct page *page = phys_to_page(phys); - union kho_page_info info; - - memblock_reserve(phys, sz); - memblock_reserved_mark_noinit(phys, sz); - info.magic = KHO_PAGE_MAGIC; - info.order = order; - page->private = info.page_private; - } } /* Returns physical address of the preserved memory map from FDT */ @@ -483,25 +500,13 @@ static phys_addr_t __init kho_get_mem_map_phys(const void *fdt) mem_ptr = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len); if (!mem_ptr || len != sizeof(u64)) { - pr_err("failed to get preserved memory bitmaps\n"); + pr_err("failed to get preserved memory map\n"); return 0; } return get_unaligned((const u64 *)mem_ptr); } -static void __init kho_mem_deserialize(struct khoser_mem_chunk *chunk) -{ - while (chunk) { - unsigned int i; - - for (i = 0; i != chunk->hdr.num_elms; i++) - deserialize_bitmap(chunk->hdr.order, - &chunk->bitmaps[i]); - chunk = KHOSER_LOAD_PTR(chunk->hdr.next); - } -} - /* * With KHO enabled, memory can become fragmented because KHO regions may * be anywhere in physical address space. 
The scratch regions give us a @@ -812,14 +817,14 @@ EXPORT_SYMBOL_GPL(kho_remove_subtree); */ int kho_preserve_folio(struct folio *folio) { + struct kho_radix_tree *tree = &kho_out.radix_tree; const unsigned long pfn = folio_pfn(folio); const unsigned int order = folio_order(folio); - struct kho_mem_track *track = &kho_out.track; if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order))) return -EINVAL; - return __kho_preserve_order(track, pfn, order); + return kho_radix_add_page(tree, pfn, order); } EXPORT_SYMBOL_GPL(kho_preserve_folio); @@ -833,11 +838,11 @@ EXPORT_SYMBOL_GPL(kho_preserve_folio); */ void kho_unpreserve_folio(struct folio *folio) { + struct kho_radix_tree *tree = &kho_out.radix_tree; const unsigned long pfn = folio_pfn(folio); const unsigned int order = folio_order(folio); - struct kho_mem_track *track = &kho_out.track; - __kho_unpreserve_order(track, pfn, order); + kho_radix_del_page(tree, pfn, order); } EXPORT_SYMBOL_GPL(kho_unpreserve_folio); @@ -853,7 +858,7 @@ EXPORT_SYMBOL_GPL(kho_unpreserve_folio); */ int kho_preserve_pages(struct page *page, unsigned long nr_pages) { - struct kho_mem_track *track = &kho_out.track; + struct kho_radix_tree *tree = &kho_out.radix_tree; const unsigned long start_pfn = page_to_pfn(page); const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn = start_pfn; @@ -869,7 +874,7 @@ int kho_preserve_pages(struct page *page, unsigned long nr_pages) const unsigned int order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); - err = __kho_preserve_order(track, pfn, order); + err = kho_radix_add_page(tree, pfn, order); if (err) { failed_pfn = pfn; break; @@ -879,7 +884,7 @@ int kho_preserve_pages(struct page *page, unsigned long nr_pages) } if (err) - __kho_unpreserve(track, start_pfn, failed_pfn); + __kho_unpreserve(tree, start_pfn, failed_pfn); return err; } @@ -897,11 +902,11 @@ EXPORT_SYMBOL_GPL(kho_preserve_pages); */ void kho_unpreserve_pages(struct page *page, unsigned long nr_pages) 
{ - struct kho_mem_track *track = &kho_out.track; + struct kho_radix_tree *tree = &kho_out.radix_tree; const unsigned long start_pfn = page_to_pfn(page); const unsigned long end_pfn = start_pfn + nr_pages; - __kho_unpreserve(track, start_pfn, end_pfn); + __kho_unpreserve(tree, start_pfn, end_pfn); } EXPORT_SYMBOL_GPL(kho_unpreserve_pages); @@ -960,14 +965,14 @@ err_free: static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk, unsigned short order) { - struct kho_mem_track *track = &kho_out.track; + struct kho_radix_tree *tree = &kho_out.radix_tree; unsigned long pfn = PHYS_PFN(virt_to_phys(chunk)); - __kho_unpreserve(track, pfn, pfn + 1); + __kho_unpreserve(tree, pfn, pfn + 1); for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { pfn = PHYS_PFN(chunk->phys[i]); - __kho_unpreserve(track, pfn, pfn + (1 << order)); + __kho_unpreserve(tree, pfn, pfn + (1 << order)); } } @@ -1238,16 +1243,10 @@ EXPORT_SYMBOL_GPL(kho_restore_free); int kho_finalize(void) { - int ret; - if (!kho_enable) return -EOPNOTSUPP; guard(mutex)(&kho_out.lock); - ret = kho_mem_serialize(&kho_out); - if (ret) - return ret; - kho_out.finalized = true; return 0; @@ -1262,7 +1261,6 @@ bool kho_finalized(void) struct kho_in { phys_addr_t fdt_phys; phys_addr_t scratch_phys; - phys_addr_t mem_map_phys; struct kho_debugfs dbg; }; @@ -1330,18 +1328,46 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys) } EXPORT_SYMBOL_GPL(kho_retrieve_subtree); +static int __init kho_mem_retrieve(const void *fdt) +{ + struct kho_radix_tree tree; + const phys_addr_t *mem; + int len; + + /* Retrieve the KHO radix tree from passed-in FDT. 
*/ + mem = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len); + + if (!mem || len != sizeof(*mem)) { + pr_err("failed to get preserved KHO memory tree\n"); + return -ENOENT; + } + + if (!*mem) + return -EINVAL; + + tree.root = phys_to_virt(*mem); + mutex_init(&tree.lock); + return kho_radix_walk_tree(&tree, kho_preserved_memory_reserve); +} + static __init int kho_out_fdt_setup(void) { + struct kho_radix_tree *tree = &kho_out.radix_tree; void *root = kho_out.fdt; - u64 empty_mem_map = 0; + u64 preserved_mem_tree_pa; int err; err = fdt_create(root, PAGE_SIZE); err |= fdt_finish_reservemap(root); err |= fdt_begin_node(root, ""); err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); - err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME, &empty_mem_map, - sizeof(empty_mem_map)); + + preserved_mem_tree_pa = virt_to_phys(tree->root); + + err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME, + &preserved_mem_tree_pa, + sizeof(preserved_mem_tree_pa)); + err |= fdt_end_node(root); err |= fdt_finish(root); @@ -1350,16 +1376,23 @@ static __init int kho_out_fdt_setup(void) static __init int kho_init(void) { + struct kho_radix_tree *tree = &kho_out.radix_tree; const void *fdt = kho_get_fdt(); int err = 0; if (!kho_enable) return 0; + tree->root = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!tree->root) { + err = -ENOMEM; + goto err_free_scratch; + } + kho_out.fdt = kho_alloc_preserve(PAGE_SIZE); if (IS_ERR(kho_out.fdt)) { err = PTR_ERR(kho_out.fdt); - goto err_free_scratch; + goto err_free_kho_radix_tree_root; } err = kho_debugfs_init(); @@ -1405,6 +1438,9 @@ static __init int kho_init(void) err_free_fdt: kho_unpreserve_free(kho_out.fdt); +err_free_kho_radix_tree_root: + kfree(tree->root); + tree->root = NULL; err_free_scratch: kho_out.fdt = NULL; for (int i = 0; i < kho_scratch_cnt; i++) { @@ -1444,10 +1480,12 @@ static void __init kho_release_scratch(void) void __init kho_memory_init(void) { - if (kho_in.mem_map_phys) { + if (kho_in.scratch_phys) { 
kho_scratch = phys_to_virt(kho_in.scratch_phys); kho_release_scratch(); - kho_mem_deserialize(phys_to_virt(kho_in.mem_map_phys)); + + if (kho_mem_retrieve(kho_get_fdt())) + kho_in.fdt_phys = 0; } else { kho_reserve_scratch(); } @@ -1525,7 +1563,6 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, kho_in.fdt_phys = fdt_phys; kho_in.scratch_phys = scratch_phys; - kho_in.mem_map_phys = mem_map_phys; kho_scratch_cnt = scratch_cnt; populated = true; diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c index 2f93939168ab..548033fd8a62 100644 --- a/kernel/liveupdate/kexec_handover_debugfs.c +++ b/kernel/liveupdate/kexec_handover_debugfs.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "kexec_handover_internal.h" static struct dentry *debugfs_root; @@ -139,7 +140,7 @@ __init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) const char *name = fdt_get_name(fdt, child, NULL); const u64 *fdt_phys; - fdt_phys = fdt_getprop(fdt, child, "fdt", &len); + fdt_phys = fdt_getprop(fdt, child, KHO_FDT_SUB_TREE_PROP_NAME, &len); if (!fdt_phys) continue; if (len != sizeof(*fdt_phys)) { From 6b0dd42d7681af148e13df3806f251bc3dc7c36e Mon Sep 17 00:00:00 2001 From: Jason Miu Date: Thu, 5 Feb 2026 18:14:28 -0800 Subject: [PATCH 081/369] kho: remove finalize state and clients Eliminate the `kho_finalize()` function and its associated state from the KHO subsystem. The transition to a radix tree for memory tracking makes the explicit "finalize" state and its serialization step obsolete. Remove the `kho_finalize()` and `kho_finalized()` APIs and their stub implementations. Update KHO client code and the debugfs interface to no longer call or depend on the `kho_finalize()` mechanism. Complete the move towards a stateless KHO, simplifying the overall design by removing unnecessary state management. 
Link: https://lkml.kernel.org/r/20260206021428.3386442-3-jasonmiu@google.com Signed-off-by: Jason Miu Reviewed-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Cc: David Matlack Cc: David Rientjes Cc: Jason Gunthorpe Cc: Pratyush Yadav Cc: Ran Xiaokai Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/kho.rst | 53 +++++---------------- Documentation/core-api/kho/index.rst | 12 ----- kernel/liveupdate/kexec_handover.c | 21 +------- kernel/liveupdate/kexec_handover_debugfs.c | 23 --------- kernel/liveupdate/kexec_handover_internal.h | 3 -- kernel/liveupdate/luo_core.c | 12 +---- tools/testing/selftests/kho/init.c | 20 -------- 7 files changed, 13 insertions(+), 131 deletions(-) diff --git a/Documentation/admin-guide/mm/kho.rst b/Documentation/admin-guide/mm/kho.rst index 6dc18ed4b886..cb9a20f64920 100644 --- a/Documentation/admin-guide/mm/kho.rst +++ b/Documentation/admin-guide/mm/kho.rst @@ -28,20 +28,10 @@ per NUMA node scratch regions on boot. Perform a KHO kexec =================== -First, before you perform a KHO kexec, you need to move the system into -the :ref:`KHO finalization phase ` :: - - $ echo 1 > /sys/kernel/debug/kho/out/finalize - -After this command, the KHO FDT is available in -``/sys/kernel/debug/kho/out/fdt``. Other subsystems may also register -their own preserved sub FDTs under -``/sys/kernel/debug/kho/out/sub_fdts/``. - -Next, load the target payload and kexec into it. It is important that you -use the ``-s`` parameter to use the in-kernel kexec file loader, as user -space kexec tooling currently has no support for KHO with the user space -based file loader :: +To perform a KHO kexec, load the target payload and kexec into it. 
It +is important that you use the ``-s`` parameter to use the in-kernel +kexec file loader, as user space kexec tooling currently has no +support for KHO with the user space based file loader :: # kexec -l /path/to/bzImage --initrd /path/to/initrd -s # kexec -e @@ -52,40 +42,19 @@ For example, if you used ``reserve_mem`` command line parameter to create an early memory reservation, the new kernel will have that memory at the same physical address as the old kernel. -Abort a KHO exec -================ - -You can move the system out of KHO finalization phase again by calling :: - - $ echo 0 > /sys/kernel/debug/kho/out/active - -After this command, the KHO FDT is no longer available in -``/sys/kernel/debug/kho/out/fdt``. - debugfs Interfaces ================== +These debugfs interfaces are available when the kernel is compiled with +``CONFIG_KEXEC_HANDOVER_DEBUGFS`` enabled. + Currently KHO creates the following debugfs interfaces. Notice that these interfaces may change in the future. They will be moved to sysfs once KHO is stabilized. -``/sys/kernel/debug/kho/out/finalize`` - Kexec HandOver (KHO) allows Linux to transition the state of - compatible drivers into the next kexec'ed kernel. To do so, - device drivers will instruct KHO to preserve memory regions, - which could contain serialized kernel state. - While the state is serialized, they are unable to perform - any modifications to state that was serialized, such as - handed over memory allocations. - - When this file contains "1", the system is in the transition - state. When contains "0", it is not. To switch between the - two states, echo the respective number into this file. - ``/sys/kernel/debug/kho/out/fdt`` - When KHO state tree is finalized, the kernel exposes the - flattened device tree blob that carries its current KHO - state in this file. Kexec user space tooling can use this + The kernel exposes the flattened device tree blob that carries its + current KHO state in this file. 
Kexec user space tooling can use this as input file for the KHO payload image. ``/sys/kernel/debug/kho/out/scratch_len`` @@ -100,8 +69,8 @@ stabilized. it should place its payload images. ``/sys/kernel/debug/kho/out/sub_fdts/`` - In the KHO finalization phase, KHO producers register their own - FDT blob under this directory. + KHO producers can register their own FDT or another binary blob under + this directory. ``/sys/kernel/debug/kho/in/fdt`` When the kernel was booted with Kexec HandOver (KHO), diff --git a/Documentation/core-api/kho/index.rst b/Documentation/core-api/kho/index.rst index 002bdf0beb2e..0a2dee4f8e7d 100644 --- a/Documentation/core-api/kho/index.rst +++ b/Documentation/core-api/kho/index.rst @@ -71,18 +71,6 @@ for boot memory allocations and as target memory for kexec blobs, some parts of that memory region may be reserved. These reservations are irrelevant for the next KHO, because kexec can overwrite even the original kernel. -.. _kho-finalization-phase: - -KHO finalization phase -====================== - -To enable user space based kexec file loader, the kernel needs to be able to -provide the FDT that describes the current kernel's state before -performing the actual kexec. The process of generating that FDT is -called serialization. When the FDT is generated, some properties -of the system may become immutable because they are already written down -in the FDT. That state is called the KHO finalization phase. 
- Kexec Handover Radix Tree ========================= diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index ad877926f3f6..410098bae0bf 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -68,8 +68,7 @@ early_param("kho", kho_parse_enable); struct kho_out { void *fdt; - bool finalized; - struct mutex lock; /* protects KHO FDT finalization */ + struct mutex lock; /* protects KHO FDT */ struct kho_radix_tree radix_tree; struct kho_debugfs dbg; @@ -80,7 +79,6 @@ static struct kho_out kho_out = { .radix_tree = { .lock = __MUTEX_INITIALIZER(kho_out.radix_tree.lock), }, - .finalized = false, }; /** @@ -1241,23 +1239,6 @@ void kho_restore_free(void *mem) } EXPORT_SYMBOL_GPL(kho_restore_free); -int kho_finalize(void) -{ - if (!kho_enable) - return -EOPNOTSUPP; - - guard(mutex)(&kho_out.lock); - kho_out.finalized = true; - - return 0; -} - -bool kho_finalized(void) -{ - guard(mutex)(&kho_out.lock); - return kho_out.finalized; -} - struct kho_in { phys_addr_t fdt_phys; phys_addr_t scratch_phys; diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c index 548033fd8a62..acf368222682 100644 --- a/kernel/liveupdate/kexec_handover_debugfs.c +++ b/kernel/liveupdate/kexec_handover_debugfs.c @@ -76,24 +76,6 @@ void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt) } } -static int kho_out_finalize_get(void *data, u64 *val) -{ - *val = kho_finalized(); - - return 0; -} - -static int kho_out_finalize_set(void *data, u64 val) -{ - if (val) - return kho_finalize(); - else - return -EINVAL; -} - -DEFINE_DEBUGFS_ATTRIBUTE(kho_out_finalize_fops, kho_out_finalize_get, - kho_out_finalize_set, "%llu\n"); - static int scratch_phys_show(struct seq_file *m, void *v) { for (int i = 0; i < kho_scratch_cnt; i++) @@ -199,11 +181,6 @@ __init int kho_out_debugfs_init(struct kho_debugfs *dbg) if (IS_ERR(f)) goto err_rmdir; - f = debugfs_create_file("finalize", 0600, dir, 
NULL, - &kho_out_finalize_fops); - if (IS_ERR(f)) - goto err_rmdir; - dbg->dir = dir; dbg->sub_fdt_dir = sub_fdt_dir; return 0; diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h index 0202c85ad14f..9a832a35254c 100644 --- a/kernel/liveupdate/kexec_handover_internal.h +++ b/kernel/liveupdate/kexec_handover_internal.h @@ -22,9 +22,6 @@ struct kho_debugfs {}; extern struct kho_scratch *kho_scratch; extern unsigned int kho_scratch_cnt; -bool kho_finalized(void); -int kho_finalize(void); - #ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS int kho_debugfs_init(void); void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt); diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c index dda7bb57d421..84ac728d63ba 100644 --- a/kernel/liveupdate/luo_core.c +++ b/kernel/liveupdate/luo_core.c @@ -230,17 +230,7 @@ int liveupdate_reboot(void) luo_flb_serialize(); - err = kho_finalize(); - if (err) { - pr_err("kho_finalize failed %d\n", err); - /* - * kho_finalize() may return libfdt errors, to aboid passing to - * userspace unknown errors, change this to EAGAIN. 
- */ - err = -EAGAIN; - } - - return err; + return 0; } /** diff --git a/tools/testing/selftests/kho/init.c b/tools/testing/selftests/kho/init.c index 6d9e91d55d68..88a41b6eba95 100644 --- a/tools/testing/selftests/kho/init.c +++ b/tools/testing/selftests/kho/init.c @@ -11,7 +11,6 @@ /* from arch/x86/include/asm/setup.h */ #define COMMAND_LINE_SIZE 2048 -#define KHO_FINALIZE "/debugfs/kho/out/finalize" #define KERNEL_IMAGE "/kernel" static int mount_filesystems(void) @@ -22,22 +21,6 @@ static int mount_filesystems(void) return mount("proc", "/proc", "proc", 0, NULL); } -static int kho_enable(void) -{ - const char enable[] = "1"; - int fd; - - fd = open(KHO_FINALIZE, O_RDWR); - if (fd < 0) - return -1; - - if (write(fd, enable, sizeof(enable)) != sizeof(enable)) - return 1; - - close(fd); - return 0; -} - static long kexec_file_load(int kernel_fd, int initrd_fd, unsigned long cmdline_len, const char *cmdline, unsigned long flags) @@ -78,9 +61,6 @@ int main(int argc, char *argv[]) if (mount_filesystems()) goto err_reboot; - if (kho_enable()) - goto err_reboot; - if (kexec_load()) goto err_reboot; From b9ec0ed907062a67a7cca2d04e7652aec06a0c35 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 23 Feb 2026 11:01:06 -0500 Subject: [PATCH 082/369] mm: vmalloc: streamline vmalloc memory accounting Use a vmstat counter instead of a custom, open-coded atomic. This has the added benefit of making the data available per-node, and prepares for cleaning up the memcg accounting as well. 
Link: https://lkml.kernel.org/r/20260223160147.3792777-1-hannes@cmpxchg.org Acked-by: Shakeel Butt Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Uladzislau Rezki (Sony) Cc: Joshua Hahn Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/proc/meminfo.c | 3 ++- include/linux/mmzone.h | 1 + include/linux/vmalloc.h | 3 --- mm/vmalloc.c | 19 ++++++++++--------- mm/vmstat.c | 1 + 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index a458f1e112fd..549793f44726 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -126,7 +126,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "Committed_AS: ", committed); seq_printf(m, "VmallocTotal: %8lu kB\n", (unsigned long)VMALLOC_TOTAL >> 10); - show_val_kb(m, "VmallocUsed: ", vmalloc_nr_pages()); + show_val_kb(m, "VmallocUsed: ", + global_node_page_state(NR_VMALLOC)); show_val_kb(m, "VmallocChunk: ", 0ul); show_val_kb(m, "Percpu: ", pcpu_nr_pages()); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 546bca95ca40..db41b18a919d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -220,6 +220,7 @@ enum node_stat_item { NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */ NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */ + NR_VMALLOC, NR_KERNEL_STACK_KB, /* measured in KiB */ #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) NR_KERNEL_SCS_KB, /* measured in KiB */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index e8e94f90d686..3b02c0c6b371 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -286,8 +286,6 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb); #ifdef CONFIG_MMU #define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) -unsigned long vmalloc_nr_pages(void); - int vm_area_map_pages(struct vm_struct 
*area, unsigned long start, unsigned long end, struct page **pages); void vm_area_unmap_pages(struct vm_struct *area, unsigned long start, @@ -304,7 +302,6 @@ static inline void set_vm_flush_reset_perms(void *addr) #else /* !CONFIG_MMU */ #define VMALLOC_TOTAL 0UL -static inline unsigned long vmalloc_nr_pages(void) { return 0; } static inline void set_vm_flush_reset_perms(void *addr) {} #endif /* CONFIG_MMU */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 61caa55a4402..e9d7c2a8c753 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1068,14 +1068,8 @@ static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); static void drain_vmap_area_work(struct work_struct *work); static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work); -static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages; static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr; -unsigned long vmalloc_nr_pages(void) -{ - return atomic_long_read(&nr_vmalloc_pages); -} - static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) { struct rb_node *n = root->rb_node; @@ -3476,11 +3470,11 @@ void vfree(const void *addr) * High-order allocs for huge vmallocs are split, so * can be freed as an array of order-0 allocations */ + if (!(vm->flags & VM_MAP_PUT_PAGES)) + dec_node_page_state(page, NR_VMALLOC); __free_page(page); cond_resched(); } - if (!(vm->flags & VM_MAP_PUT_PAGES)) - atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); kvfree(vm->pages); kfree(vm); } @@ -3668,6 +3662,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid, continue; } + mod_node_page_state(page_pgdat(page), NR_VMALLOC, 1 << large_order); + split_page(page, large_order); for (i = 0; i < (1U << large_order); i++) pages[nr_allocated + i] = page + i; @@ -3688,6 +3684,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, if (!order) { while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; + int i; /* * A maximum allowed request is hard-coded and is 100 @@ -3711,6 +3708,9 @@ vm_area_alloc_pages(gfp_t gfp, int nid, 
nr_pages_request, pages + nr_allocated); + for (i = nr_allocated; i < nr_allocated + nr; i++) + inc_node_page_state(pages[i], NR_VMALLOC); + nr_allocated += nr; /* @@ -3735,6 +3735,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid, if (unlikely(!page)) break; + mod_node_page_state(page_pgdat(page), NR_VMALLOC, 1 << order); + /* * High-order allocations must be able to be treated as * independent small pages by callers (as they can with @@ -3877,7 +3879,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, vmalloc_gfp_adjust(gfp_mask, page_order), node, page_order, nr_small_pages, area->pages); - atomic_long_add(area->nr_pages, &nr_vmalloc_pages); /* All pages of vm should be charged to same memcg, so use first one. */ if (gfp_mask & __GFP_ACCOUNT && area->nr_pages) mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC, diff --git a/mm/vmstat.c b/mm/vmstat.c index 667474773dbc..2370c6fb1fcd 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1255,6 +1255,7 @@ const char * const vmstat_text[] = { [I(NR_KERNEL_MISC_RECLAIMABLE)] = "nr_kernel_misc_reclaimable", [I(NR_FOLL_PIN_ACQUIRED)] = "nr_foll_pin_acquired", [I(NR_FOLL_PIN_RELEASED)] = "nr_foll_pin_released", + [I(NR_VMALLOC)] = "nr_vmalloc", [I(NR_KERNEL_STACK_KB)] = "nr_kernel_stack", #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) [I(NR_KERNEL_SCS_KB)] = "nr_shadow_call_stack", From c466412c73c339e33e83b68770e5b556457c03de Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 23 Feb 2026 11:01:07 -0500 Subject: [PATCH 083/369] mm: memcontrol: switch to native NR_VMALLOC vmstat counter Eliminates the custom memcg counter and results in a single, consolidated accounting call in vmalloc code. 
Link: https://lkml.kernel.org/r/20260223160147.3792777-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Roman Gushchin Reviewed-by: Vishal Moola (Oracle) Cc: Joshua Hahn Cc: Michal Hocko Cc: Muchun Song Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 1 - mm/memcontrol.c | 4 ++-- mm/vmalloc.c | 16 ++++------------ 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5695776f32c8..5173a9f16721 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -35,7 +35,6 @@ enum memcg_stat_item { MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, MEMCG_PERCPU_B, - MEMCG_VMALLOC, MEMCG_KMEM, MEMCG_ZSWAP_B, MEMCG_ZSWAPPED, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 75df24ffdf25..eb54cdf99624 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -317,6 +317,7 @@ static const unsigned int memcg_node_stat_items[] = { NR_SHMEM_THPS, NR_FILE_THPS, NR_ANON_THPS, + NR_VMALLOC, NR_KERNEL_STACK_KB, NR_PAGETABLE, NR_SECONDARY_PAGETABLE, @@ -352,7 +353,6 @@ static const unsigned int memcg_stat_items[] = { MEMCG_SWAP, MEMCG_SOCK, MEMCG_PERCPU_B, - MEMCG_VMALLOC, MEMCG_KMEM, MEMCG_ZSWAP_B, MEMCG_ZSWAPPED, @@ -1364,7 +1364,7 @@ static const struct memory_stat memory_stats[] = { { "sec_pagetables", NR_SECONDARY_PAGETABLE }, { "percpu", MEMCG_PERCPU_B }, { "sock", MEMCG_SOCK }, - { "vmalloc", MEMCG_VMALLOC }, + { "vmalloc", NR_VMALLOC }, { "shmem", NR_SHMEM }, #ifdef CONFIG_ZSWAP { "zswap", MEMCG_ZSWAP_B }, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e9d7c2a8c753..6dda97c3799e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3459,9 +3459,6 @@ void vfree(const void *addr) if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) vm_reset_perms(vm); - /* All pages of vm should be charged to same memcg, so use first one. 
*/ - if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES)) - mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages); for (i = 0; i < vm->nr_pages; i++) { struct page *page = vm->pages[i]; @@ -3471,7 +3468,7 @@ void vfree(const void *addr) * can be freed as an array of order-0 allocations */ if (!(vm->flags & VM_MAP_PUT_PAGES)) - dec_node_page_state(page, NR_VMALLOC); + mod_lruvec_page_state(page, NR_VMALLOC, -1); __free_page(page); cond_resched(); } @@ -3662,7 +3659,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, continue; } - mod_node_page_state(page_pgdat(page), NR_VMALLOC, 1 << large_order); + mod_lruvec_page_state(page, NR_VMALLOC, 1 << large_order); split_page(page, large_order); for (i = 0; i < (1U << large_order); i++) @@ -3709,7 +3706,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, pages + nr_allocated); for (i = nr_allocated; i < nr_allocated + nr; i++) - inc_node_page_state(pages[i], NR_VMALLOC); + mod_lruvec_page_state(pages[i], NR_VMALLOC, 1); nr_allocated += nr; @@ -3735,7 +3732,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, if (unlikely(!page)) break; - mod_node_page_state(page_pgdat(page), NR_VMALLOC, 1 << order); + mod_lruvec_page_state(page, NR_VMALLOC, 1 << order); /* * High-order allocations must be able to be treated as @@ -3879,11 +3876,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, vmalloc_gfp_adjust(gfp_mask, page_order), node, page_order, nr_small_pages, area->pages); - /* All pages of vm should be charged to same memcg, so use first one. */ - if (gfp_mask & __GFP_ACCOUNT && area->nr_pages) - mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC, - area->nr_pages); - /* * If not enough pages were obtained to accomplish an * allocation request, free them via vfree() if any. 
From d8d68d8111d894cf2406c2eee814ce1f4cf9e939 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Mar 2026 09:09:06 -0700 Subject: [PATCH 084/369] tracing: add __event_in_*irq() helpers Patch series "mm: vmscan: add PID and cgroup ID to vmscan tracepoints", v8. This patch (of 3): Some trace events want to expose in their output if they were triggered in an interrupt or softirq context. Instead of recording this in the event structure itself, as this information is stored in the flags portion of the event header, add helper macros that can be used in the print format: TP_printk("val=%d %s", __entry->val, __event_in_irq() ? "(in-irq)" : "") This will output "(in-irq)" for the event in the trace data if the event was triggered in hard or soft interrupt context. Link: https://lkml.kernel.org/r/20260316160908.42727-1-tballasi@linux.microsoft.com Link: https://lore.kernel.org/all/20251229132942.31a2b583@gandalf.local.home/ Link: https://lkml.kernel.org/r/20260316160908.42727-2-tballasi@linux.microsoft.com Signed-off-by: Steven Rostedt (Google) Signed-off-by: Thomas Ballasi Reviewed-by: Shakeel Butt Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Qi Zheng Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/trace/stages/stage3_trace_output.h | 8 ++++++++ include/trace/stages/stage7_class_define.h | 19 +++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/include/trace/stages/stage3_trace_output.h b/include/trace/stages/stage3_trace_output.h index fce85ea2df1c..56ec0c0595b1 100644 --- a/include/trace/stages/stage3_trace_output.h +++ b/include/trace/stages/stage3_trace_output.h @@ -150,3 +150,11 @@ #undef __get_buf #define __get_buf(len) trace_seq_acquire(p, (len)) + +#undef __event_in_hardirq +#undef __event_in_softirq +#undef __event_in_irq + +#define __event_in_hardirq() (__entry->ent.flags & TRACE_FLAG_HARDIRQ) +#define 
__event_in_softirq() (__entry->ent.flags & TRACE_FLAG_SOFTIRQ) +#define __event_in_irq() (__entry->ent.flags & (TRACE_FLAG_HARDIRQ | TRACE_FLAG_SOFTIRQ)) diff --git a/include/trace/stages/stage7_class_define.h b/include/trace/stages/stage7_class_define.h index fcd564a590f4..47008897a795 100644 --- a/include/trace/stages/stage7_class_define.h +++ b/include/trace/stages/stage7_class_define.h @@ -26,6 +26,25 @@ #undef __print_hex_dump #undef __get_buf +#undef __event_in_hardirq +#undef __event_in_softirq +#undef __event_in_irq + +/* + * The TRACE_FLAG_* are enums. Instead of using TRACE_DEFINE_ENUM(), + * use their hardcoded values. These values are parsed by user space + * tooling elsewhere so they will never change. + * + * See "enum trace_flag_type" in linux/trace_events.h: + * TRACE_FLAG_HARDIRQ + * TRACE_FLAG_SOFTIRQ + */ + +/* This is what is displayed in the format files */ +#define __event_in_hardirq() (REC->common_flags & 0x8) +#define __event_in_softirq() (REC->common_flags & 0x10) +#define __event_in_irq() (REC->common_flags & 0x18) + /* * The below is not executed in the kernel. It is only what is * displayed in the print format for userspace to parse. From 874a0a566ede40f3d6062cae8fe1022e616edd1a Mon Sep 17 00:00:00 2001 From: Thomas Ballasi Date: Mon, 16 Mar 2026 09:09:07 -0700 Subject: [PATCH 085/369] mm: vmscan: add cgroup IDs to vmscan tracepoints Memory reclaim events are currently difficult to attribute to specific cgroups, making debugging memory pressure issues challenging. This patch adds memory cgroup ID (memcg_id) to key vmscan tracepoints to enable better correlation and analysis. For operations not associated with a specific cgroup, the field is defaulted to 0. 
Link: https://lkml.kernel.org/r/20260316160908.42727-3-tballasi@linux.microsoft.com Signed-off-by: Thomas Ballasi Acked-by: Shakeel Butt Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Qi Zheng Cc: Steven Rostedt (Google) Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/trace/events/vmscan.h | 83 ++++++++++++++++++++--------------- mm/shrinker.c | 6 ++- mm/vmscan.c | 17 +++---- 3 files changed, 61 insertions(+), 45 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index ea58e4656abf..c9e637c10f96 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -124,85 +124,92 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd, DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, - TP_PROTO(int order, gfp_t gfp_flags), + TP_PROTO(gfp_t gfp_flags, int order, struct mem_cgroup *memcg), - TP_ARGS(order, gfp_flags), + TP_ARGS(gfp_flags, order, memcg), TP_STRUCT__entry( - __field( int, order ) __field( unsigned long, gfp_flags ) + __field( u64, memcg_id ) + __field( int, order ) ), TP_fast_assign( - __entry->order = order; __entry->gfp_flags = (__force unsigned long)gfp_flags; + __entry->order = order; + __entry->memcg_id = mem_cgroup_id(memcg); ), - TP_printk("order=%d gfp_flags=%s", + TP_printk("order=%d gfp_flags=%s memcg_id=%llu", __entry->order, - show_gfp_flags(__entry->gfp_flags)) + show_gfp_flags(__entry->gfp_flags), + __entry->memcg_id) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin, - TP_PROTO(int order, gfp_t gfp_flags), + TP_PROTO(gfp_t gfp_flags, int order, struct mem_cgroup *memcg), - TP_ARGS(order, gfp_flags) + TP_ARGS(gfp_flags, order, memcg) ); #ifdef CONFIG_MEMCG DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin, - TP_PROTO(int order, gfp_t gfp_flags), + TP_PROTO(gfp_t gfp_flags, int order, struct mem_cgroup 
*memcg), - TP_ARGS(order, gfp_flags) + TP_ARGS(gfp_flags, order, memcg) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin, - TP_PROTO(int order, gfp_t gfp_flags), + TP_PROTO(gfp_t gfp_flags, int order, struct mem_cgroup *memcg), - TP_ARGS(order, gfp_flags) + TP_ARGS(gfp_flags, order, memcg) ); #endif /* CONFIG_MEMCG */ DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template, - TP_PROTO(unsigned long nr_reclaimed), + TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg), - TP_ARGS(nr_reclaimed), + TP_ARGS(nr_reclaimed, memcg), TP_STRUCT__entry( __field( unsigned long, nr_reclaimed ) + __field( u64, memcg_id ) ), TP_fast_assign( __entry->nr_reclaimed = nr_reclaimed; + __entry->memcg_id = mem_cgroup_id(memcg); ), - TP_printk("nr_reclaimed=%lu", __entry->nr_reclaimed) + TP_printk("nr_reclaimed=%lu memcg_id=%llu", + __entry->nr_reclaimed, + __entry->memcg_id) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_direct_reclaim_end, - TP_PROTO(unsigned long nr_reclaimed), + TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg), - TP_ARGS(nr_reclaimed) + TP_ARGS(nr_reclaimed, memcg) ); #ifdef CONFIG_MEMCG DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_reclaim_end, - TP_PROTO(unsigned long nr_reclaimed), + TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg), - TP_ARGS(nr_reclaimed) + TP_ARGS(nr_reclaimed, memcg) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_reclaim_end, - TP_PROTO(unsigned long nr_reclaimed), + TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg), - TP_ARGS(nr_reclaimed) + TP_ARGS(nr_reclaimed, memcg) ); #endif /* CONFIG_MEMCG */ @@ -210,39 +217,42 @@ TRACE_EVENT(mm_shrink_slab_start, TP_PROTO(struct shrinker *shr, struct shrink_control *sc, long nr_objects_to_shrink, unsigned long cache_items, unsigned long long delta, unsigned long total_scan, - int priority), + int priority, struct mem_cgroup *memcg), 
TP_ARGS(shr, sc, nr_objects_to_shrink, cache_items, delta, total_scan, - priority), + priority, memcg), TP_STRUCT__entry( __field(struct shrinker *, shr) __field(void *, shrink) - __field(int, nid) __field(long, nr_objects_to_shrink) __field(unsigned long, gfp_flags) __field(unsigned long, cache_items) __field(unsigned long long, delta) __field(unsigned long, total_scan) __field(int, priority) + __field(int, nid) + __field(u64, memcg_id) ), TP_fast_assign( __entry->shr = shr; __entry->shrink = shr->scan_objects; - __entry->nid = sc->nid; __entry->nr_objects_to_shrink = nr_objects_to_shrink; __entry->gfp_flags = (__force unsigned long)sc->gfp_mask; __entry->cache_items = cache_items; __entry->delta = delta; __entry->total_scan = total_scan; __entry->priority = priority; + __entry->nid = sc->nid; + __entry->memcg_id = mem_cgroup_id(memcg); ), - TP_printk("%pS %p: nid: %d objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", + TP_printk("%pS %p: nid: %d memcg_id: %llu objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", __entry->shrink, __entry->shr, __entry->nid, + __entry->memcg_id, __entry->nr_objects_to_shrink, show_gfp_flags(__entry->gfp_flags), __entry->cache_items, @@ -253,35 +263,38 @@ TRACE_EVENT(mm_shrink_slab_start, TRACE_EVENT(mm_shrink_slab_end, TP_PROTO(struct shrinker *shr, int nid, int shrinker_retval, - long unused_scan_cnt, long new_scan_cnt, long total_scan), + long unused_scan_cnt, long new_scan_cnt, long total_scan, struct mem_cgroup *memcg), TP_ARGS(shr, nid, shrinker_retval, unused_scan_cnt, new_scan_cnt, - total_scan), + total_scan, memcg), TP_STRUCT__entry( __field(struct shrinker *, shr) - __field(int, nid) __field(void *, shrink) __field(long, unused_scan) __field(long, new_scan) - __field(int, retval) __field(long, total_scan) + __field(int, nid) + __field(int, retval) + __field(u64, memcg_id) ), TP_fast_assign( __entry->shr = shr; - __entry->nid = nid; 
__entry->shrink = shr->scan_objects; __entry->unused_scan = unused_scan_cnt; __entry->new_scan = new_scan_cnt; - __entry->retval = shrinker_retval; __entry->total_scan = total_scan; + __entry->nid = nid; + __entry->retval = shrinker_retval; + __entry->memcg_id = mem_cgroup_id(memcg); ), - TP_printk("%pS %p: nid: %d unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", + TP_printk("%pS %p: nid: %d memcg_id: %llu unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", __entry->shrink, __entry->shr, __entry->nid, + __entry->memcg_id, __entry->unused_scan, __entry->new_scan, __entry->total_scan, @@ -514,9 +527,9 @@ TRACE_EVENT(mm_vmscan_node_reclaim_begin, DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_node_reclaim_end, - TP_PROTO(unsigned long nr_reclaimed), + TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg), - TP_ARGS(nr_reclaimed) + TP_ARGS(nr_reclaimed, memcg) ); TRACE_EVENT(mm_vmscan_throttled, diff --git a/mm/shrinker.c b/mm/shrinker.c index 94646ee0af63..0f90d63afdeb 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -410,7 +410,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, total_scan = min(total_scan, (2 * freeable)); trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, - freeable, delta, total_scan, priority); + freeable, delta, total_scan, priority, + shrinkctl->memcg); /* * Normally, we should not scan less than batch_size objects in one @@ -461,7 +462,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, */ new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); - trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); + trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan, + shrinkctl->memcg); return freed; } diff --git a/mm/vmscan.c b/mm/vmscan.c index d531040a3593..2c954d370048 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6582,11 +6582,11 @@ unsigned long 
try_to_free_pages(struct zonelist *zonelist, int order, return 1; set_task_reclaim_state(current, &sc.reclaim_state); - trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); + trace_mm_vmscan_direct_reclaim_begin(sc.gfp_mask, order, 0); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); - trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); + trace_mm_vmscan_direct_reclaim_end(nr_reclaimed, 0); set_task_reclaim_state(current, NULL); return nr_reclaimed; @@ -6615,8 +6615,9 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); - trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, - sc.gfp_mask); + trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.gfp_mask, + sc.order, + memcg); /* * NOTE: Although we can get the priority field, using it @@ -6627,7 +6628,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, */ shrink_lruvec(lruvec, &sc); - trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); + trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed, memcg); *nr_scanned = sc.nr_scanned; @@ -6663,13 +6664,13 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); set_task_reclaim_state(current, &sc.reclaim_state); - trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); + trace_mm_vmscan_memcg_reclaim_begin(sc.gfp_mask, 0, memcg); noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); memalloc_noreclaim_restore(noreclaim_flag); - trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); + trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed, memcg); set_task_reclaim_state(current, NULL); return nr_reclaimed; @@ -7643,7 +7644,7 @@ static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, delayacct_freepages_end(); psi_memstall_leave(&pflags); - trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed); + 
trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed, 0); return sc->nr_reclaimed; } From 77a9c445b668765129f877d3c0d08ec4dc3ce77b Mon Sep 17 00:00:00 2001 From: Thomas Ballasi Date: Mon, 16 Mar 2026 09:09:08 -0700 Subject: [PATCH 086/369] mm: vmscan: add PIDs to vmscan tracepoints The changes aim at adding additional tracepoint variables to help debuggers attribute them to specific processes. Link: https://lkml.kernel.org/r/20260316160908.42727-4-tballasi@linux.microsoft.com Signed-off-by: Thomas Ballasi Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Cc: Steven Rostedt (Google) Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/trace/events/vmscan.h | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index c9e637c10f96..4445a8d9218d 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -140,10 +140,12 @@ DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, __entry->memcg_id = mem_cgroup_id(memcg); ), - TP_printk("order=%d gfp_flags=%s memcg_id=%llu", + TP_printk("order=%d gfp_flags=%s pid=%d memcg_id=%llu %s", __entry->order, show_gfp_flags(__entry->gfp_flags), - __entry->memcg_id) + __entry->ent.pid, + __entry->memcg_id, + __event_in_irq() ? "(in-irq)" : "") ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin, @@ -185,9 +187,11 @@ DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template, __entry->memcg_id = mem_cgroup_id(memcg); ), - TP_printk("nr_reclaimed=%lu memcg_id=%llu", + TP_printk("nr_reclaimed=%lu pid=%d memcg_id=%llu %s", __entry->nr_reclaimed, - __entry->memcg_id) + __entry->ent.pid, + __entry->memcg_id, + __event_in_irq() ? 
"(in-irq)" : "") ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_direct_reclaim_end, @@ -248,17 +252,19 @@ TRACE_EVENT(mm_shrink_slab_start, __entry->memcg_id = mem_cgroup_id(memcg); ), - TP_printk("%pS %p: nid: %d memcg_id: %llu objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", + TP_printk("%pS %p: nid: %d pid: %d memcg_id: %llu objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d %s", __entry->shrink, __entry->shr, __entry->nid, + __entry->ent.pid, __entry->memcg_id, __entry->nr_objects_to_shrink, show_gfp_flags(__entry->gfp_flags), __entry->cache_items, __entry->delta, __entry->total_scan, - __entry->priority) + __entry->priority, + __event_in_irq() ? "(in-irq)" : "") ); TRACE_EVENT(mm_shrink_slab_end, @@ -290,15 +296,17 @@ TRACE_EVENT(mm_shrink_slab_end, __entry->memcg_id = mem_cgroup_id(memcg); ), - TP_printk("%pS %p: nid: %d memcg_id: %llu unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", + TP_printk("%pS %p: nid: %d pid: %d memcg_id: %llu unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d %s", __entry->shrink, __entry->shr, __entry->nid, + __entry->ent.pid, __entry->memcg_id, __entry->unused_scan, __entry->new_scan, __entry->total_scan, - __entry->retval) + __entry->retval, + __event_in_irq() ? "(in-irq)" : "") ); TRACE_EVENT(mm_vmscan_lru_isolate, From 1fb3d8c20bfadbbe2d9e5de18074de9282a52b5f Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Tue, 24 Feb 2026 22:21:01 +0800 Subject: [PATCH 087/369] mm/mmu_gather: replace IPI with synchronize_rcu() when batch allocation fails When freeing page tables, we try to batch them. If batch allocation fails (GFP_NOWAIT), __tlb_remove_table_one() immediately frees the one without batching. On !CONFIG_PT_RECLAIM, the fallback sends an IPI to all CPUs via tlb_remove_table_sync_one(). It disrupts all CPUs even when only a single process is unmapping memory. 
IPI broadcast was reported to hurt RT workloads[1]. tlb_remove_table_sync_one() synchronizes with lockless page-table walkers (e.g. GUP-fast) that rely on IRQ disabling. These walkers use local_irq_disable(), which is also an RCU read-side critical section. This patch introduces tlb_remove_table_sync_rcu() which uses RCU grace period (synchronize_rcu()) instead of IPI broadcast. This provides the same guarantee as IPI but without disrupting all CPUs. Since batch allocation already failed, we are in a slow path where sleeping is acceptable - we are in process context (unmap_region, exit_mmap) with only mmap_lock held. tlb_remove_table_sync_one() is retained for other callers (e.g., khugepaged after pmdp_collapse_flush(), tlb_finish_mmu() when tlb->fully_unshared_tables) that are not slow paths. Converting those may require different approaches such as targeted IPIs. Link: https://lore.kernel.org/linux-mm/1b27a3fa-359a-43d0-bdeb-c31341749367@kernel.org/ [1] Link: https://lore.kernel.org/linux-mm/20260202150957.GD1282955@noisy.programming.kicks-ass.net/ Link: https://lore.kernel.org/linux-mm/dfdfeac9-5cd5-46fc-a5c1-9ccf9bd3502a@intel.com/ Link: https://lore.kernel.org/linux-mm/bc489455-bb18-44dc-8518-ae75abda6bec@kernel.org/ Link: https://lkml.kernel.org/r/20260224142101.20500-1-lance.yang@linux.dev Signed-off-by: Lance Yang Suggested-by: Peter Zijlstra (Intel) Suggested-by: Dave Hansen Suggested-by: David Hildenbrand (Arm) Acked-by: David Hildenbrand (Arm) Acked-by: Peter Zijlstra (Intel) Cc: Arnd Bergmann Cc: Nicholas Piggin Cc: Nick Piggin Cc: Will Deacon Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 4 ++++ mm/mmu_gather.c | 21 ++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 4aeac0c3d3f0..bdcc2778ac64 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -251,6 +251,8 @@ static inline void tlb_remove_table(struct mmu_gather *tlb, void 
*table) void tlb_remove_table_sync_one(void); +void tlb_remove_table_sync_rcu(void); + #else #ifdef tlb_needs_table_invalidate @@ -259,6 +261,8 @@ void tlb_remove_table_sync_one(void); static inline void tlb_remove_table_sync_one(void) { } +static inline void tlb_remove_table_sync_rcu(void) { } + #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index fe5b6a031717..3985d856de7f 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -296,6 +296,25 @@ static void tlb_remove_table_free(struct mmu_table_batch *batch) call_rcu(&batch->rcu, tlb_remove_table_rcu); } +/** + * tlb_remove_table_sync_rcu - synchronize with software page-table walkers + * + * Like tlb_remove_table_sync_one() but uses RCU grace period instead of IPI + * broadcast. Use in slow paths where sleeping is acceptable. + * + * Software/Lockless page-table walkers use local_irq_disable(), which is also + * an RCU read-side critical section. synchronize_rcu() waits for all such + * sections, providing the same guarantee as tlb_remove_table_sync_one() but + * without disrupting all CPUs with IPIs. + * + * Do not use for freeing memory. Use RCU callbacks instead to avoid latency + * spikes. + */ +void tlb_remove_table_sync_rcu(void) +{ + synchronize_rcu(); +} + #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */ static void tlb_remove_table_free(struct mmu_table_batch *batch) @@ -339,7 +358,7 @@ static inline void __tlb_remove_table_one(void *table) #else static inline void __tlb_remove_table_one(void *table) { - tlb_remove_table_sync_one(); + tlb_remove_table_sync_rcu(); __tlb_remove_table(table); } #endif /* CONFIG_PT_RECLAIM */ From db3df34e5bdd5383e011f113d5ea2199e6c875f2 Mon Sep 17 00:00:00 2001 From: Youngjun Park Date: Thu, 26 Feb 2026 10:07:39 +0900 Subject: [PATCH 088/369] MAINTAINERS: add Youngjun Park as reviewer for SWAP Recently, I have been actively contributing to the swap subsystem through works such as swap-tier patches and flash friendly swap proposal. 
During this process, I have consistently reviewed swap table code, some other patches and fixed several bugs. As I am already CC'd on many patches and maintaining active interest in ongoing developments, I would like to officially add myself as a reviewer. I am committed to contributing to the kernel community with greater responsibility. Link: https://lkml.kernel.org/r/20260226010739.3773838-1-youngjun.park@lge.com Signed-off-by: Youngjun Park Reviewed-by: Barry Song Acked-by: Kairui Song Acked-by: Chris Li Acked-by: Baoquan He Cc: Kemeng Shi Cc: Nhat Pham Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 7d10988cbc62..7049d85c586e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16902,6 +16902,7 @@ R: Kemeng Shi R: Nhat Pham R: Baoquan He R: Barry Song +R: Youngjun Park L: linux-mm@kvack.org S: Maintained F: Documentation/mm/swap-table.rst From 03375203e1da8f4782ec5cd7023eb1b4adfba739 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Wed, 25 Feb 2026 19:38:44 +0100 Subject: [PATCH 089/369] mm: do not allocate shrinker info with cgroup.memory=nokmem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There'd be no work for memcg-aware shrinkers when kernel memory is not accounted per cgroup, so we can skip allocating per memcg shrinker data. This saves some memory, avoids holding shrinker_mutex with O(nr_memcgs) and saves work in shrink_slab_memcg(). Then there are SHRINKER_NONSLAB shrinkers which handle non-kernel memory so nokmem should not disable their per-memcg behavior. Such shrinkers (e.g. deferred_split_shrinker) still need access to per-memcg data (see also commit 0a432dcbeb32e ("mm: shrinker: make shrinker not depend on memcg kmem")). 
The savings with this patch come on container hosts that create many superblocks (each with own shrinker) but tracking and processing per-memcg data is pointless with nokmem (shrink_slab_memcg() is partially guarded with !memcg_kmem_online already). The patch uses "boottime" predicate mem_cgroup_kmem_disabled() (not memcg_kmem_online()) to avoid mistakenly un-MEMCG_AWARE-ing shrinkers registered before first non-root memcg is mkdir'd. [mkoutny@suse.com: update comment, per Qi Zheng] Link: https://lkml.kernel.org/r/20260309-cgroup-ml-nokmem-shrinker-v2-1-3e7a7eefb6c9@suse.com Link: https://lkml.kernel.org/r/20260225-cgroup-ml-nokmem-shrinker-v1-1-d703899bdda4@suse.com Signed-off-by: Michal Koutný Reviewed-by: Roman Gushchin Acked-by: Qi Zheng Reviewed-by: Muchun Song Cc: Dave Chinner Cc: Jan Kara Signed-off-by: Andrew Morton --- mm/shrinker.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/shrinker.c b/mm/shrinker.c index 0f90d63afdeb..c23086bccf4d 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -219,6 +219,8 @@ static int shrinker_memcg_alloc(struct shrinker *shrinker) if (mem_cgroup_disabled()) return -ENOSYS; + if (mem_cgroup_kmem_disabled() && !(shrinker->flags & SHRINKER_NONSLAB)) + return -ENOSYS; mutex_lock(&shrinker_mutex); id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); @@ -721,6 +723,7 @@ non_memcg: * - non-memcg-aware shrinkers * - !CONFIG_MEMCG * - memcg is disabled by kernel command line + * - non-slab shrinkers: when memcg kmem is disabled */ size = sizeof(*shrinker->nr_deferred); if (flags & SHRINKER_NUMA_AWARE) From 2b8acf8450f577d3785dacfd616630b76dc8f88d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 25 Feb 2026 16:13:58 +0000 Subject: [PATCH 090/369] mm: introduce vm_mmap_shadow_stack() as a helper for VM_SHADOW_STACK mappings Patch series "mm: arch/shstk: Common shadow stack mapping helper and VM_NOHUGEPAGE", v2. A series to extract the common shadow stack mmap into a separate helper for arm64, riscv and x86. 
This patch (of 5): arm64, riscv and x86 use a similar pattern for mapping the user shadow stack (cloned from x86). Extract this into a helper to facilitate code reuse. The call to do_mmap() from the new helper uses PROT_READ|PROT_WRITE prot bits instead of the PROT_READ with an explicit VM_WRITE vm_flag. The x86 intent was to avoid PROT_WRITE implying normal write since the shadow stack is not writable by normal stores. However, from a kernel perspective, the vma is writeable. Functionally there is no difference. Link: https://lkml.kernel.org/r/20260225161404.3157851-1-catalin.marinas@arm.com Link: https://lkml.kernel.org/r/20260225161404.3157851-2-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Tested-by: Deepak Gupta Reviewed-by: Mark Brown Acked-by: David Hildenbrand (Arm) Reviewed-by: Mike Rapoport (Microsoft) Cc: Albert Ou Cc: Alexandre Ghiti Cc: "Borislav Petkov (AMD)" Cc: "Edgecombe, Rick P" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Palmer Dabbelt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Will Deacon Cc: Dave Hansen Cc: Paul Walmsley Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ mm/util.c | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index abb4963c1f06..bb0cfe38ca19 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3903,6 +3903,8 @@ extern int vm_munmap(unsigned long, size_t); extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +extern unsigned long __must_check vm_mmap_shadow_stack(unsigned long addr, + unsigned long len, unsigned long flags); struct vm_unmapped_area_info { #define VM_UNMAPPED_AREA_TOPDOWN 1 diff --git a/mm/util.c b/mm/util.c index b05ab6f97e11..51f7f417e91f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -618,6 +618,31 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } 
EXPORT_SYMBOL(vm_mmap); +#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK +/* + * Perform a userland memory mapping for a shadow stack into the current + * process address space. This is intended to be used by architectures that + * support user shadow stacks. + */ +unsigned long vm_mmap_shadow_stack(unsigned long addr, unsigned long len, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + unsigned long ret, unused; + + flags |= MAP_ANONYMOUS | MAP_PRIVATE; + if (addr) + flags |= MAP_FIXED_NOREPLACE; + + mmap_write_lock(mm); + ret = do_mmap(NULL, addr, len, PROT_READ | PROT_WRITE, flags, + VM_SHADOW_STACK, 0, &unused, NULL); + mmap_write_unlock(mm); + + return ret; +} +#endif /* CONFIG_ARCH_HAS_USER_SHADOW_STACK */ + /** * __vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. From 845e0af36235d893c34218cb629618168f8f34b4 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 25 Feb 2026 16:13:59 +0000 Subject: [PATCH 091/369] arm64: gcs: use the new common vm_mmap_shadow_stack() helper Replace the arm64 map_shadow_stack() content with a call to vm_mmap_shadow_stack(). There is no functional change. Link: https://lkml.kernel.org/r/20260225161404.3157851-3-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Reviewed-by: David Hildenbrand (Arm) Reviewed-by: Mark Brown Cc: Will Deacon Cc: Albert Ou Cc: Alexandre Ghiti Cc: "Borislav Petkov (AMD)" Cc: Dave Hansen Cc: Deepak Gupta Cc: "Edgecombe, Rick P" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Liam R. 
Howlett" Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/arm64/mm/gcs.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c index 04a23a497f20..680749611a9a 100644 --- a/arch/arm64/mm/gcs.c +++ b/arch/arm64/mm/gcs.c @@ -12,19 +12,7 @@ static unsigned long alloc_gcs(unsigned long addr, unsigned long size) { - int flags = MAP_ANONYMOUS | MAP_PRIVATE; - struct mm_struct *mm = current->mm; - unsigned long mapped_addr, unused; - - if (addr) - flags |= MAP_FIXED_NOREPLACE; - - mmap_write_lock(mm); - mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags, - VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL); - mmap_write_unlock(mm); - - return mapped_addr; + return vm_mmap_shadow_stack(addr, size, 0); } static unsigned long gcs_size(unsigned long size) From fecd446f0ca44e9b2c38efea99cfc54fd4517f75 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 25 Feb 2026 16:14:00 +0000 Subject: [PATCH 092/369] riscv: shstk: use the new common vm_mmap_shadow_stack() helper Replace part of the allocate_shadow_stack() content with a call to vm_mmap_shadow_stack(). There is no functional change. Link: https://lkml.kernel.org/r/20260225161404.3157851-4-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Tested-by: Deepak Gupta Reviewed-by: David Hildenbrand (Arm) Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Alexandre Ghiti Cc: "Borislav Petkov (AMD)" Cc: Dave Hansen Cc: "Edgecombe, Rick P" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Liam R. 
Howlett" Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/riscv/kernel/usercfi.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c index 1adba746f164..7e57f54dc5b2 100644 --- a/arch/riscv/kernel/usercfi.c +++ b/arch/riscv/kernel/usercfi.c @@ -230,17 +230,7 @@ int restore_user_shstk(struct task_struct *tsk, unsigned long shstk_ptr) static unsigned long allocate_shadow_stack(unsigned long addr, unsigned long size, unsigned long token_offset, bool set_tok) { - int flags = MAP_ANONYMOUS | MAP_PRIVATE; - struct mm_struct *mm = current->mm; - unsigned long populate; - - if (addr) - flags |= MAP_FIXED_NOREPLACE; - - mmap_write_lock(mm); - addr = do_mmap(NULL, addr, size, PROT_READ, flags, - VM_SHADOW_STACK | VM_WRITE, 0, &populate, NULL); - mmap_write_unlock(mm); + addr = vm_mmap_shadow_stack(addr, size, 0); if (!set_tok || IS_ERR_VALUE(addr)) goto out; From a515ffc9de96f86318bc44e2ba702c4b5fdbd5bb Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 25 Feb 2026 16:14:01 +0000 Subject: [PATCH 093/369] x86: shstk: use the new common vm_mmap_shadow_stack() helper Replace part of the x86 alloc_shstk() content with a call to vm_mmap_shadow_stack(). There is no functional change. Link: https://lkml.kernel.org/r/20260225161404.3157851-5-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Reviewed-by: Rick Edgecombe Tested-by: Rick Edgecombe Reviewed-by: David Hildenbrand (Arm) Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Albert Ou Cc: Alexandre Ghiti Cc: Deepak Gupta Cc: "Liam R. 
Howlett" Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Michal Hocko Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/kernel/shstk.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index 978232b6d48d..9725e7d89b1e 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -100,17 +100,9 @@ static int create_rstor_token(unsigned long ssp, unsigned long *token_addr) static unsigned long alloc_shstk(unsigned long addr, unsigned long size, unsigned long token_offset, bool set_res_tok) { - int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G; - struct mm_struct *mm = current->mm; - unsigned long mapped_addr, unused; + unsigned long mapped_addr; - if (addr) - flags |= MAP_FIXED_NOREPLACE; - - mmap_write_lock(mm); - mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags, - VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL); - mmap_write_unlock(mm); + mapped_addr = vm_mmap_shadow_stack(addr, size, MAP_ABOVE4G); if (!set_res_tok || IS_ERR_VALUE(mapped_addr)) goto out; From 3efb9800557b855b493dc98d22c5e57974ac1593 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 25 Feb 2026 16:14:02 +0000 Subject: [PATCH 094/369] mm: do not map the shadow stack as THP The default shadow stack size allocated on first prctl() for the main thread or subsequently on clone() is either half of RLIMIT_STACK or half of a thread's stack size (for arm64). Both of these are likely to be suitable for a THP allocation and the kernel is more aggressive in creating such mappings. However, it does not make much sense to use a huge page. It didn't make sense for the normal stacks either, see commit c4608d1bf7c6 ("mm: mmap: map MAP_STACK to VM_NOHUGEPAGE"). Force VM_NOHUGEPAGE when allocating/mapping the shadow stack. 
As per commit 7190b3c8bd2b ("mm: mmap: map MAP_STACK to VM_NOHUGEPAGE only if THP is enabled"), only pass this flag if TRANSPARENT_HUGEPAGE is enabled so as not to confuse CRIU tools. Link: https://lkml.kernel.org/r/20260225161404.3157851-6-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Tested-by: Deepak Gupta Reviewed-by: Mark Brown Acked-by: David Hildenbrand (Arm) Cc: Albert Ou Cc: Alexandre Ghiti Cc: "Borislav Petkov (AMD)" Cc: Dave Hansen Cc: "Edgecombe, Rick P" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/util.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/util.c b/mm/util.c index 51f7f417e91f..419cb81ab353 100644 --- a/mm/util.c +++ b/mm/util.c @@ -629,14 +629,18 @@ unsigned long vm_mmap_shadow_stack(unsigned long addr, unsigned long len, { struct mm_struct *mm = current->mm; unsigned long ret, unused; + vm_flags_t vm_flags = VM_SHADOW_STACK; flags |= MAP_ANONYMOUS | MAP_PRIVATE; if (addr) flags |= MAP_FIXED_NOREPLACE; + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + vm_flags |= VM_NOHUGEPAGE; + mmap_write_lock(mm); ret = do_mmap(NULL, addr, len, PROT_READ | PROT_WRITE, flags, - VM_SHADOW_STACK, 0, &unused, NULL); + vm_flags, 0, &unused, NULL); mmap_write_unlock(mm); return ret; From da735962d05c4e7ffc68e02c0cb2459f837a0f51 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 25 Feb 2026 21:36:05 +0100 Subject: [PATCH 095/369] kfence: add kfence.fault parameter Add kfence.fault parameter to control the behavior when a KFENCE error is detected (similar in spirit to kasan.fault=). The supported modes for kfence.fault= are: - report: print the error report and continue (default). - oops: print the error report and oops. - panic: print the error report and panic. 
In particular, the 'oops' mode offers a trade-off between no mitigation on report and panicking outright (if panic_on_oops is not set). Link: https://lkml.kernel.org/r/20260225203639.3159463-1-elver@google.com Signed-off-by: Marco Elver Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Kees Cook Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../admin-guide/kernel-parameters.txt | 6 +++ Documentation/dev-tools/kfence.rst | 7 +++ mm/kfence/core.c | 23 ++++++--- mm/kfence/kfence.h | 16 +++++- mm/kfence/report.c | 49 +++++++++++++++++-- 5 files changed, 89 insertions(+), 12 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 03a550630644..a4aca9fab160 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2959,6 +2959,12 @@ Kernel parameters Format: Default: CONFIG_KFENCE_DEFERRABLE + kfence.fault= [MM,KFENCE] Controls the behavior when a KFENCE + error is detected. + report - print the error report and continue (default). + oops - print the error report and oops. + panic - print the error report and panic. + kfence.sample_interval= [MM,KFENCE] KFENCE's sample interval in milliseconds. Format: diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst index 541899353865..b03d1201ddae 100644 --- a/Documentation/dev-tools/kfence.rst +++ b/Documentation/dev-tools/kfence.rst @@ -81,6 +81,13 @@ tables being allocated. Error reports ~~~~~~~~~~~~~ +The boot parameter ``kfence.fault`` can be used to control the behavior when a +KFENCE error is detected: + +- ``kfence.fault=report``: Print the error report and continue (default). +- ``kfence.fault=oops``: Print the error report and oops. +- ``kfence.fault=panic``: Print the error report and panic. 
+ A typical out-of-bounds access looks like this:: ================================================================== diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 7393957f9a20..9eba46212edf 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -51,7 +51,7 @@ /* === Data ================================================================= */ -static bool kfence_enabled __read_mostly; +bool kfence_enabled __read_mostly; static bool disabled_by_warn __read_mostly; unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL; @@ -336,6 +336,7 @@ out: static check_canary_attributes bool check_canary_byte(u8 *addr) { struct kfence_metadata *meta; + enum kfence_fault fault; unsigned long flags; if (likely(*addr == KFENCE_CANARY_PATTERN_U8(addr))) @@ -345,8 +346,9 @@ static check_canary_attributes bool check_canary_byte(u8 *addr) meta = addr_to_metadata((unsigned long)addr); raw_spin_lock_irqsave(&meta->lock, flags); - kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_CORRUPTION); + fault = kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_CORRUPTION); raw_spin_unlock_irqrestore(&meta->lock, flags); + kfence_handle_fault(fault); return false; } @@ -525,11 +527,14 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z raw_spin_lock_irqsave(&meta->lock, flags); if (!kfence_obj_allocated(meta) || meta->addr != (unsigned long)addr) { + enum kfence_fault fault; + /* Invalid or double-free, bail out. 
*/ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); - kfence_report_error((unsigned long)addr, false, NULL, meta, - KFENCE_ERROR_INVALID_FREE); + fault = kfence_report_error((unsigned long)addr, false, NULL, meta, + KFENCE_ERROR_INVALID_FREE); raw_spin_unlock_irqrestore(&meta->lock, flags); + kfence_handle_fault(fault); return; } @@ -831,7 +836,8 @@ static void kfence_check_all_canary(void) static int kfence_check_canary_callback(struct notifier_block *nb, unsigned long reason, void *arg) { - kfence_check_all_canary(); + if (READ_ONCE(kfence_enabled)) + kfence_check_all_canary(); return NOTIFY_OK; } @@ -1266,6 +1272,7 @@ bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs struct kfence_metadata *to_report = NULL; unsigned long unprotected_page = 0; enum kfence_error_type error_type; + enum kfence_fault fault; unsigned long flags; if (!is_kfence_address((void *)addr)) @@ -1324,12 +1331,14 @@ out: if (to_report) { raw_spin_lock_irqsave(&to_report->lock, flags); to_report->unprotected_page = unprotected_page; - kfence_report_error(addr, is_write, regs, to_report, error_type); + fault = kfence_report_error(addr, is_write, regs, to_report, error_type); raw_spin_unlock_irqrestore(&to_report->lock, flags); } else { /* This may be a UAF or OOB access, but we can't be sure. */ - kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID); + fault = kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID); } + kfence_handle_fault(fault); + return kfence_unprotect(addr); /* Unprotect and let access proceed. */ } diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index f9caea007246..1f618f9b0d12 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -16,6 +16,8 @@ #include "../slab.h" /* for struct kmem_cache */ +extern bool kfence_enabled; + /* * Get the canary byte pattern for @addr. 
Use a pattern that varies based on the * lower 3 bits of the address, to detect memory corruptions with higher @@ -140,8 +142,18 @@ enum kfence_error_type { KFENCE_ERROR_INVALID_FREE, /* Invalid free. */ }; -void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, - const struct kfence_metadata *meta, enum kfence_error_type type); +enum kfence_fault { + KFENCE_FAULT_NONE, + KFENCE_FAULT_REPORT, + KFENCE_FAULT_OOPS, + KFENCE_FAULT_PANIC, +}; + +enum kfence_fault +kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, + const struct kfence_metadata *meta, enum kfence_error_type type); + +void kfence_handle_fault(enum kfence_fault fault); void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta) __must_hold(&meta->lock); diff --git a/mm/kfence/report.c b/mm/kfence/report.c index 787e87c26926..d548536864b1 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -7,9 +7,12 @@ #include +#include +#include #include #include #include +#include #include #include #include @@ -29,6 +32,26 @@ #define ARCH_FUNC_PREFIX "" #endif +static enum kfence_fault kfence_fault __ro_after_init = KFENCE_FAULT_REPORT; + +static int __init early_kfence_fault(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "report")) + kfence_fault = KFENCE_FAULT_REPORT; + else if (!strcmp(arg, "oops")) + kfence_fault = KFENCE_FAULT_OOPS; + else if (!strcmp(arg, "panic")) + kfence_fault = KFENCE_FAULT_PANIC; + else + return -EINVAL; + + return 0; +} +early_param("kfence.fault", early_kfence_fault); + /* Helper function to either print to a seq_file or to console. */ __printf(2, 3) static void seq_con_printf(struct seq_file *seq, const char *fmt, ...) 
@@ -189,8 +212,9 @@ static const char *get_access_type(bool is_write) return str_write_read(is_write); } -void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, - const struct kfence_metadata *meta, enum kfence_error_type type) +enum kfence_fault +kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, + const struct kfence_metadata *meta, enum kfence_error_type type) { unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 }; const ptrdiff_t object_index = meta ? meta - kfence_metadata : -1; @@ -206,7 +230,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r /* Require non-NULL meta, except if KFENCE_ERROR_INVALID. */ if (WARN_ON(type != KFENCE_ERROR_INVALID && !meta)) - return; + return KFENCE_FAULT_NONE; /* * Because we may generate reports in printk-unfriendly parts of the @@ -282,6 +306,25 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r /* We encountered a memory safety error, taint the kernel! */ add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK); + + return kfence_fault; +} + +void kfence_handle_fault(enum kfence_fault fault) +{ + switch (fault) { + case KFENCE_FAULT_NONE: + case KFENCE_FAULT_REPORT: + break; + case KFENCE_FAULT_OOPS: + BUG(); + break; + case KFENCE_FAULT_PANIC: + /* Disable KFENCE to avoid recursion if check_on_panic is set. */ + WRITE_ONCE(kfence_enabled, false); + panic("kfence.fault=panic set ...\n"); + break; + } } #ifdef CONFIG_PRINTK From ec106365394dc6c4e9ecf00842186d367dcc955a Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Wed, 25 Feb 2026 17:38:56 -0500 Subject: [PATCH 096/369] mm/vmalloc: export clear_vm_uninitialized_flag() Patch series "Fix KASAN support for KHO restored vmalloc regions". When KHO restores a vmalloc area, it maps existing physical pages into a newly allocated virtual memory area. 
However, because these areas were not properly unpoisoned, KASAN would treat any access to the restored region as out-of-bounds, as seen in the following trace: BUG: KASAN: vmalloc-out-of-bounds in kho_test_restore_data.isra.0+0x17b/0x2cd Read of size 8 at addr ffffc90000025000 by task swapper/0/1 [...] Call Trace: [...] kasan_report+0xe8/0x120 kho_test_restore_data.isra.0+0x17b/0x2cd kho_test_init+0x15a/0x1f0 do_one_initcall+0xd5/0x4b0 The fix involves deferring KASAN's default poisoning by using the VM_UNINITIALIZED flag during allocation, manually unpoisoning the memory once it is correctly mapped, and then clearing the uninitialized flag using a newly exported helper. This patch (of 2): Make clear_vm_uninitialized_flag() available to other parts of the kernel that need to manage vmalloc areas manually, such as KHO for restoring vmallocs. Link: https://lkml.kernel.org/r/20260225220223.1695350-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20260225223857.1714801-2-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Acked-by: Pratyush Yadav (Google) Cc: Alexander Graf Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: "Uladzislau Rezki (Sony)" Signed-off-by: Andrew Morton --- mm/internal.h | 2 ++ mm/vmalloc.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/internal.h b/mm/internal.h index 39ab37bb0e1d..2daa6a744172 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1469,6 +1469,8 @@ int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, } #endif +void clear_vm_uninitialized_flag(struct vm_struct *vm); + int __must_check __vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6dda97c3799e..b2c2ed650840 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3183,7 +3183,7 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t 
align) kasan_populate_early_vm_area_shadow(vm->addr, vm->size); } -static void clear_vm_uninitialized_flag(struct vm_struct *vm) +void clear_vm_uninitialized_flag(struct vm_struct *vm) { /* * Before removing VM_UNINITIALIZED, From 019fc36872374db6fd35e118c9e935374404bfbf Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Wed, 25 Feb 2026 17:38:57 -0500 Subject: [PATCH 097/369] kho: fix KASAN support for restored vmalloc regions Restored vmalloc regions are currently not properly marked for KASAN, causing KASAN to treat accesses to these regions as out-of-bounds. Fix this by properly unpoisoning the restored vmalloc area using kasan_unpoison_vmalloc(). This requires setting the VM_UNINITIALIZED flag during the initial area allocation and clearing it after the pages have been mapped and unpoisoned, using the clear_vm_uninitialized_flag() helper. Link: https://lkml.kernel.org/r/20260225223857.1714801-3-pasha.tatashin@soleen.com Fixes: a667300bd53f ("kho: add support for preserving vmalloc allocations") Signed-off-by: Pasha Tatashin Reported-by: Pratyush Yadav Reviewed-by: Pratyush Yadav (Google) Tested-by: Pratyush Yadav (Google) Cc: Alexander Graf Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: "Uladzislau Rezki (Sony)" Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 410098bae0bf..747a35107c84 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -1077,6 +1078,7 @@ EXPORT_SYMBOL_GPL(kho_unpreserve_vmalloc); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) { struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first); + kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_PROT_NORMAL; unsigned int 
align, order, shift, vm_flags; unsigned long total_pages, contig_pages; unsigned long addr, size; @@ -1128,7 +1130,8 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) goto err_free_pages_array; area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift, - vm_flags, VMALLOC_START, VMALLOC_END, + vm_flags | VM_UNINITIALIZED, + VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0)); if (!area) @@ -1143,6 +1146,13 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) area->nr_pages = total_pages; area->pages = pages; + if (vm_flags & VM_ALLOC) + kasan_flags |= KASAN_VMALLOC_VM_ALLOC; + + area->addr = kasan_unpoison_vmalloc(area->addr, total_pages * PAGE_SIZE, + kasan_flags); + clear_vm_uninitialized_flag(area); + return area->addr; err_free_vm_area: From cbf56f9981014ee48ae9b9e2254f31d1642b8f8f Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Wed, 25 Feb 2026 18:44:25 -0500 Subject: [PATCH 098/369] mm: remove stray references to struct pagevec Patch series "mm: Remove stray references to pagevec", v2. struct pagevec was removed in commit 1e0877d58b1e ("mm: remove struct pagevec"). Remove any stray references to it and rename relevant files and macros accordingly. While at it, remove unnecessary #includes of pagevec.h (now folio_batch.h) in .c files. There are probably more of these that could be removed in .h files, but those are more complex to verify. This patch (of 4): struct pagevec was removed in commit 1e0877d58b1e ("mm: remove struct pagevec"). Remove remaining forward declarations and change __folio_batch_release()'s declaration to match its definition. 
Link: https://lkml.kernel.org/r/20260225-pagevec_cleanup-v2-0-716868cc2d11@columbia.edu Link: https://lkml.kernel.org/r/20260225-pagevec_cleanup-v2-1-716868cc2d11@columbia.edu Signed-off-by: Tal Zussman Reviewed-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand (Arm) Acked-by: Chris Li Acked-by: Zi Yan Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Christian Brauner Cc: Jan Kara Signed-off-by: Andrew Morton --- fs/afs/internal.h | 1 - fs/f2fs/f2fs.h | 2 -- include/linux/pagevec.h | 2 +- include/linux/swap.h | 2 -- 4 files changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 009064b8d661..599353c33337 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -31,7 +31,6 @@ #define AFS_CELL_MAX_ADDRS 15 -struct pagevec; struct afs_call; struct afs_vnode; struct afs_server_probe; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bb34e864d0ef..d9e8531a5301 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -28,8 +28,6 @@ #include #include -struct pagevec; - #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) #else diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 63be5a451627..007affabf335 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -93,7 +93,7 @@ static inline struct folio *folio_batch_next(struct folio_batch *fbatch) return fbatch->folios[fbatch->i++]; } -void __folio_batch_release(struct folio_batch *pvec); +void __folio_batch_release(struct folio_batch *fbatch); static inline void folio_batch_release(struct folio_batch *fbatch) { diff --git a/include/linux/swap.h b/include/linux/swap.h index 0effe3cc50f5..4b1f13b5bbad 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -20,8 +20,6 @@ struct notifier_block; struct bio; -struct pagevec; - #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff #define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ From 
ab5193e919bbc2577bf404983b2c0ee3c6d3ef83 Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Wed, 25 Feb 2026 18:44:26 -0500 Subject: [PATCH 099/369] fs: remove unnecessary pagevec.h includes Remove unused pagevec.h includes from .c files. These were found with the following command: grep -rl '#include.*pagevec\.h' --include='*.c' | while read f; do grep -qE 'PAGEVEC_SIZE|folio_batch' "$f" || echo "$f" done There are probably more removal candidates in .h files, but those are more complex to analyze. Link: https://lkml.kernel.org/r/20260225-pagevec_cleanup-v2-2-716868cc2d11@columbia.edu Signed-off-by: Tal Zussman Reviewed-by: Jan Kara Acked-by: Zi Yan Acked-by: Chris Li Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Christian Brauner Cc: David Hildenbrand (Arm) Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/afs/write.c | 1 - fs/dax.c | 1 - fs/ext4/file.c | 1 - fs/ext4/page-io.c | 1 - fs/ext4/readpage.c | 1 - fs/f2fs/file.c | 1 - fs/mpage.c | 1 - fs/netfs/buffered_write.c | 1 - fs/nfs/blocklayout/blocklayout.c | 1 - fs/nfs/dir.c | 1 - fs/ocfs2/refcounttree.c | 1 - fs/smb/client/connect.c | 1 - fs/smb/client/file.c | 1 - 13 files changed, 13 deletions(-) diff --git a/fs/afs/write.c b/fs/afs/write.c index 93ad86ff3345..fcfed9d24e0a 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include "internal.h" diff --git a/fs/dax.c b/fs/dax.c index b78cff9c91b3..a5237169b467 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ext4/file.c b/fs/ext4/file.c index f1dc5ce791a7..5e02f6cf653e 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index a8c95eee91b7..98da200d11c8 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -16,7 +16,6 @@ #include #include #include -#include #include 
#include #include diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 830f3b8a321f..3c7aabde719c 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -43,7 +43,6 @@ #include #include #include -#include #include "ext4.h" #include diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c8a2f17a8f11..c6b6a1465d08 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/mpage.c b/fs/mpage.c index 7dae5afc2b9e..e5285fbfcf09 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -28,7 +28,6 @@ #include #include #include -#include #include "internal.h" /* diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 22a4d61631c9..05ea5b0cc0e8 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -10,7 +10,6 @@ #include #include #include -#include #include "internal.h" static void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index cb0a645aeb50..11f9f69cde61 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -36,7 +36,6 @@ #include #include /* struct bio */ #include -#include #include "../pnfs.h" #include "../nfs4session.h" diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2402f57c8e7d..0d276441206b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index c1cdececdfa4..b4acd081bbc4 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 69b38f0ccf2b..ca1bc67eb23b 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git 
a/fs/smb/client/file.c b/fs/smb/client/file.c index a69e05f86d7e..148508e3a82f 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include From 4e1d77a8f382a0ef4dd7732bb1986c8143600def Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Wed, 25 Feb 2026 18:44:27 -0500 Subject: [PATCH 100/369] folio_batch: rename pagevec.h to folio_batch.h struct pagevec was removed in commit 1e0877d58b1e ("mm: remove struct pagevec"). Rename include/linux/pagevec.h to reflect reality and update includes tree-wide. Add the new filename to MAINTAINERS explicitly, as it no longer matches the "include/linux/page[-_]*" pattern in MEMORY MANAGEMENT - CORE. Link: https://lkml.kernel.org/r/20260225-pagevec_cleanup-v2-3-716868cc2d11@columbia.edu Signed-off-by: Tal Zussman Acked-by: David Hildenbrand (Arm) Reviewed-by: Jan Kara Acked-by: Zi Yan Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Chris Li Cc: Christian Brauner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + drivers/gpu/drm/drm_gem.c | 2 +- drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 2 +- drivers/gpu/drm/i915/gt/intel_gtt.h | 2 +- drivers/gpu/drm/i915/i915_gpu_error.c | 2 +- fs/btrfs/compression.c | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/tests/extent-io-tests.c | 2 +- fs/buffer.c | 2 +- fs/ceph/addr.c | 2 +- fs/ext4/inode.c | 2 +- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/compress.c | 2 +- fs/f2fs/data.c | 2 +- fs/f2fs/node.c | 2 +- fs/gfs2/aops.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/nilfs2/btree.c | 2 +- fs/nilfs2/page.c | 2 +- fs/nilfs2/segment.c | 2 +- fs/ramfs/file-nommu.c | 2 +- include/linux/{pagevec.h => folio_batch.h} | 8 ++++---- include/linux/folio_queue.h | 2 +- include/linux/iomap.h | 2 +- include/linux/sunrpc/svc.h | 2 +- include/linux/writeback.h | 2 +- mm/filemap.c | 2 +- mm/gup.c | 2 +- mm/memcontrol.c | 2 +- mm/mlock.c | 2 +- mm/page-writeback.c | 2 +- mm/page_alloc.c | 2 +- mm/shmem.c | 2 +- mm/swap.c | 2 +- 
mm/swap_state.c | 2 +- mm/truncate.c | 2 +- mm/vmscan.c | 2 +- 37 files changed, 40 insertions(+), 39 deletions(-) rename include/linux/{pagevec.h => folio_batch.h} (95%) diff --git a/MAINTAINERS b/MAINTAINERS index 7049d85c586e..7a1b94a4aea2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16653,6 +16653,7 @@ L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: include/linux/folio_batch.h F: include/linux/gfp.h F: include/linux/gfp_types.h F: include/linux/highmem.h diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index 891c3bff5ae0..dc4534fb175c 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index 720a9ad39aa2..06543ae60706 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -3,7 +3,7 @@ * Copyright © 2014-2016 Intel Corporation */ -#include +#include #include #include #include diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h index 9d3a3ad567a0..b54ee4f25af1 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.h +++ b/drivers/gpu/drm/i915/gt/intel_gtt.h @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index a99b4e45d26c..ffe5f24594c9 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 85199944c1eb..de40b8934725 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include 
#include diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5f97a3d2a8d7..89649ef5107a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include "extent_io.h" diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index a0187d6163df..b2aacf846c8b 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -4,7 +4,7 @@ */ #include -#include +#include #include #include #include diff --git a/fs/buffer.c b/fs/buffer.c index 22b43642ba57..f3122160ee2d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -45,7 +45,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 2090fc78529c..bbeafbc777ee 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 396dc3a5d16b..58f982885187 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6dd39b7de11a..0143365c07dc 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 8c76400ba631..614e00b8ffdc 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include "f2fs.h" #include "node.h" diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 338df7a2aea6..90e8ef625d82 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 74992fd9c9b6..ba0272314528 100644 --- a/fs/f2fs/node.c 
+++ b/fs/f2fs/node.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include "f2fs.h" diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index e79ad087512a..dae3dc4ee6f7 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 22c799000edb..2ec3e4231252 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index dd0c8e560ef6..b400cfcdc803 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include "nilfs.h" #include "page.h" #include "btnode.h" diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 56c4da417b6a..a9d8aa65416f 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include "nilfs.h" #include "page.h" diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 098a3bd103e0..6d62de64a309 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 0f8e838ece07..2f79bcb89d2e 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/pagevec.h b/include/linux/folio_batch.h similarity index 95% rename from include/linux/pagevec.h rename to include/linux/folio_batch.h index 007affabf335..a2f3d3043f7e 100644 --- a/include/linux/pagevec.h +++ b/include/linux/folio_batch.h @@ -1,13 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * include/linux/pagevec.h + * include/linux/folio_batch.h * * In many places it is efficient to batch an 
operation up against multiple * folios. A folio_batch is a container which is used for that. */ -#ifndef _LINUX_PAGEVEC_H -#define _LINUX_PAGEVEC_H +#ifndef _LINUX_FOLIO_BATCH_H +#define _LINUX_FOLIO_BATCH_H #include @@ -102,4 +102,4 @@ static inline void folio_batch_release(struct folio_batch *fbatch) } void folio_batch_remove_exceptionals(struct folio_batch *fbatch); -#endif /* _LINUX_PAGEVEC_H */ +#endif /* _LINUX_FOLIO_BATCH_H */ diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h index adab609c972e..0d3765fa9d1d 100644 --- a/include/linux/folio_queue.h +++ b/include/linux/folio_queue.h @@ -14,7 +14,7 @@ #ifndef _LINUX_FOLIO_QUEUE_H #define _LINUX_FOLIO_QUEUE_H -#include +#include #include /* diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 99b7209dabd7..4551613cea2f 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include struct address_space; struct fiemap_extent_info; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 4dc14c7a711b..a11acf5cd63b 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include /* diff --git a/include/linux/writeback.h b/include/linux/writeback.h index e530112c4b3a..62552a2ce5b9 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -11,7 +11,7 @@ #include #include #include -#include +#include struct bio; diff --git a/mm/filemap.c b/mm/filemap.c index 406cef06b684..7cc6607dc28f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/gup.c b/mm/gup.c index 8e7dc2c6ee73..ad9ded39609c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/mm/memcontrol.c b/mm/memcontrol.c index eb54cdf99624..87614cfc4a3e 100644 --- a/mm/memcontrol.c 
+++ b/mm/memcontrol.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/mlock.c b/mm/mlock.c index 2f699c3497a5..1a92d16f3684 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 601a5e048d12..1009bb042ba4 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d88c8c67ac0b..74b603872f34 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/shmem.c b/mm/shmem.c index cfed6c3ff853..149fdb051170 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -61,7 +61,7 @@ static struct vfsmount *shm_mnt __ro_after_init; #include #include #include -#include +#include #include #include #include diff --git a/mm/swap.c b/mm/swap.c index bb19ccbece46..2e517ede6561 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/swap_state.c b/mm/swap_state.c index 32d9d877bda8..a0c64db2b275 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/truncate.c b/mm/truncate.c index 12467c1bd711..df0b7a7e6aff 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/vmscan.c b/mm/vmscan.c index 2c954d370048..4ab461f8c65a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include #include #include From 511f04aac469a3ae04f7f2588101020aebb19c90 Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Wed, 25 Feb 2026 18:44:28 
-0500 Subject: [PATCH 101/369] folio_batch: rename PAGEVEC_SIZE to FOLIO_BATCH_SIZE struct pagevec no longer exists. Rename the macro appropriately. Link: https://lkml.kernel.org/r/20260225-pagevec_cleanup-v2-4-716868cc2d11@columbia.edu Signed-off-by: Tal Zussman Acked-by: David Hildenbrand (Arm) Reviewed-by: Jan Kara Acked-by: Zi Yan Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Chris Li Cc: Christian Brauner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/btrfs/extent_io.c | 4 ++-- include/linux/folio_batch.h | 6 +++--- include/linux/folio_queue.h | 6 +++--- mm/shmem.c | 4 ++-- mm/swap.c | 2 +- mm/swap_state.c | 2 +- mm/truncate.c | 6 +++--- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 89649ef5107a..070c8759b0b4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2095,13 +2095,13 @@ static void buffer_tree_tag_for_writeback(struct btrfs_fs_info *fs_info, struct eb_batch { unsigned int nr; unsigned int cur; - struct extent_buffer *ebs[PAGEVEC_SIZE]; + struct extent_buffer *ebs[FOLIO_BATCH_SIZE]; }; static inline bool eb_batch_add(struct eb_batch *batch, struct extent_buffer *eb) { batch->ebs[batch->nr++] = eb; - return (batch->nr < PAGEVEC_SIZE); + return (batch->nr < FOLIO_BATCH_SIZE); } static inline void eb_batch_init(struct eb_batch *batch) diff --git a/include/linux/folio_batch.h b/include/linux/folio_batch.h index a2f3d3043f7e..b45946adc50b 100644 --- a/include/linux/folio_batch.h +++ b/include/linux/folio_batch.h @@ -12,7 +12,7 @@ #include /* 31 pointers + header align the folio_batch structure to a power of two */ -#define PAGEVEC_SIZE 31 +#define FOLIO_BATCH_SIZE 31 struct folio; @@ -29,7 +29,7 @@ struct folio_batch { unsigned char nr; unsigned char i; bool percpu_pvec_drained; - struct folio *folios[PAGEVEC_SIZE]; + struct folio *folios[FOLIO_BATCH_SIZE]; }; /** @@ -58,7 +58,7 @@ static inline unsigned int folio_batch_count(const struct folio_batch *fbatch) 
static inline unsigned int folio_batch_space(const struct folio_batch *fbatch) { - return PAGEVEC_SIZE - fbatch->nr; + return FOLIO_BATCH_SIZE - fbatch->nr; } /** diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h index 0d3765fa9d1d..f6d5f1f127c9 100644 --- a/include/linux/folio_queue.h +++ b/include/linux/folio_queue.h @@ -29,12 +29,12 @@ */ struct folio_queue { struct folio_batch vec; /* Folios in the queue segment */ - u8 orders[PAGEVEC_SIZE]; /* Order of each folio */ + u8 orders[FOLIO_BATCH_SIZE]; /* Order of each folio */ struct folio_queue *next; /* Next queue segment or NULL */ struct folio_queue *prev; /* Previous queue segment of NULL */ unsigned long marks; /* 1-bit mark per folio */ unsigned long marks2; /* Second 1-bit mark per folio */ -#if PAGEVEC_SIZE > BITS_PER_LONG +#if FOLIO_BATCH_SIZE > BITS_PER_LONG #error marks is not big enough #endif unsigned int rreq_id; @@ -70,7 +70,7 @@ static inline void folioq_init(struct folio_queue *folioq, unsigned int rreq_id) */ static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq) { - return PAGEVEC_SIZE; + return FOLIO_BATCH_SIZE; } /** diff --git a/mm/shmem.c b/mm/shmem.c index 149fdb051170..5e7dcf5bc5d3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1113,7 +1113,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; pgoff_t end = (lend + 1) >> PAGE_SHIFT; struct folio_batch fbatch; - pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t indices[FOLIO_BATCH_SIZE]; struct folio *folio; bool same_folio; long nr_swaps_freed = 0; @@ -1510,7 +1510,7 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type) struct address_space *mapping = inode->i_mapping; pgoff_t start = 0; struct folio_batch fbatch; - pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t indices[FOLIO_BATCH_SIZE]; int ret = 0; do { diff --git a/mm/swap.c b/mm/swap.c index 2e517ede6561..78b4aa811fc6 100644 --- a/mm/swap.c +++ b/mm/swap.c
@@ -1018,7 +1018,7 @@ EXPORT_SYMBOL(folios_put_refs); void release_pages(release_pages_arg arg, int nr) { struct folio_batch fbatch; - int refs[PAGEVEC_SIZE]; + int refs[FOLIO_BATCH_SIZE]; struct encoded_page **encoded = arg.encoded_pages; int i; diff --git a/mm/swap_state.c b/mm/swap_state.c index a0c64db2b275..6313b59d7eab 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -385,7 +385,7 @@ void free_folio_and_swap_cache(struct folio *folio) void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { struct folio_batch folios; - unsigned int refs[PAGEVEC_SIZE]; + unsigned int refs[FOLIO_BATCH_SIZE]; folio_batch_init(&folios); for (int i = 0; i < nr; i++) { diff --git a/mm/truncate.c b/mm/truncate.c index df0b7a7e6aff..2931d66c16d0 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -369,7 +369,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pgoff_t start; /* inclusive */ pgoff_t end; /* exclusive */ struct folio_batch fbatch; - pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t indices[FOLIO_BATCH_SIZE]; pgoff_t index; int i; struct folio *folio; @@ -534,7 +534,7 @@ EXPORT_SYMBOL(truncate_inode_pages_final); unsigned long mapping_try_invalidate(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_failed) { - pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t indices[FOLIO_BATCH_SIZE]; struct folio_batch fbatch; pgoff_t index = start; unsigned long ret; @@ -672,7 +672,7 @@ failed: int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { - pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t indices[FOLIO_BATCH_SIZE]; struct folio_batch fbatch; pgoff_t index; int i; From c09fb53d293a05adf1b53c800273273e59413f39 Mon Sep 17 00:00:00 2001 From: gao xu Date: Thu, 26 Feb 2026 12:37:22 +0000 Subject: [PATCH 102/369] zram: use statically allocated compression algorithm names Currently, zram dynamically allocates memory for compressor algorithm names when they are set by the user. 
This requires careful memory management, including explicit `kfree` calls and special handling to avoid freeing statically defined default compressor names. This patch refactors the way zram handles compression algorithm names. Instead of storing dynamically allocated copies, `zram->comp_algs` will now store pointers directly to the static name strings defined within the `zcomp_ops` backend structures, thereby removing the need for conditional `kfree` calls. Link: https://lkml.kernel.org/r/5bb2e9318d124dbcb2b743dcdce6a950@honor.com Signed-off-by: gao xu Reviewed-by: Sergey Senozhatsky Cc: Jens Axboe Cc: Minchan Kim Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- drivers/block/zram/zcomp.c | 9 +++++++-- drivers/block/zram/zcomp.h | 2 +- drivers/block/zram/zram_drv.c | 28 +++++----------------------- 3 files changed, 13 insertions(+), 26 deletions(-) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index a771a8ecc540..974c4691887e 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -84,9 +84,14 @@ static const struct zcomp_ops *lookup_backend_ops(const char *comp) return backends[i]; } -bool zcomp_available_algorithm(const char *comp) +const char *zcomp_lookup_backend_name(const char *comp) { - return lookup_backend_ops(comp) != NULL; + const struct zcomp_ops *backend = lookup_backend_ops(comp); + + if (backend) + return backend->name; + + return NULL; } /* show available compressors */ diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index eacfd3f7d61d..81a0f3f6ff48 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -80,7 +80,7 @@ struct zcomp { int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node); int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node); ssize_t zcomp_available_show(const char *comp, char *buf, ssize_t at); -bool zcomp_available_algorithm(const char *comp); +const char *zcomp_lookup_backend_name(const char *comp); struct zcomp 
*zcomp_create(const char *alg, struct zcomp_params *params); void zcomp_destroy(struct zcomp *comp); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index af679375b193..990d391847f4 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1621,43 +1621,29 @@ static void zram_debugfs_unregister(struct zram *zram) {}; static void comp_algorithm_set(struct zram *zram, u32 prio, const char *alg) { - /* Do not free statically defined compression algorithms */ - if (zram->comp_algs[prio] != default_compressor) - kfree(zram->comp_algs[prio]); - zram->comp_algs[prio] = alg; } static int __comp_algorithm_store(struct zram *zram, u32 prio, const char *buf) { - char *compressor; + const char *alg; size_t sz; sz = strlen(buf); if (sz >= ZRAM_MAX_ALGO_NAME_SZ) return -E2BIG; - compressor = kstrdup(buf, GFP_KERNEL); - if (!compressor) - return -ENOMEM; - - /* ignore trailing newline */ - if (sz > 0 && compressor[sz - 1] == '\n') - compressor[sz - 1] = 0x00; - - if (!zcomp_available_algorithm(compressor)) { - kfree(compressor); + alg = zcomp_lookup_backend_name(buf); + if (!alg) return -EINVAL; - } guard(rwsem_write)(&zram->dev_lock); if (init_done(zram)) { - kfree(compressor); pr_info("Can't change algorithm for initialized device\n"); return -EBUSY; } - comp_algorithm_set(zram, prio, compressor); + comp_algorithm_set(zram, prio, alg); return 0; } @@ -2840,12 +2826,8 @@ static void zram_destroy_comps(struct zram *zram) zram->num_active_comps--; } - for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) { - /* Do not free statically defined compression algorithms */ - if (zram->comp_algs[prio] != default_compressor) - kfree(zram->comp_algs[prio]); + for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) zram->comp_algs[prio] = NULL; - } zram_comp_params_reset(zram); } From a2c77ec320a99581e8272868ccfa53a7d7a7b168 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:39 +0000 Subject: [PATCH 
103/369] mm: move MAX_FOLIO_ORDER definition to mmzone.h Patch series "mm: Eliminate fake head pages from vmemmap optimization", v7. This series removes "fake head pages" from the HugeTLB vmemmap optimization (HVO) by changing how tail pages encode their relationship to the head page. It simplifies compound_head() and page_ref_add_unless(). Both are in the hot path. Background ========== HVO reduces memory overhead by freeing vmemmap pages for HugeTLB pages and remapping the freed virtual addresses to a single physical page. Previously, all tail page vmemmap entries were remapped to the first vmemmap page (containing the head struct page), creating "fake heads" - tail pages that appear to have PG_head set when accessed through the deduplicated vmemmap. This required special handling in compound_head() to detect and work around fake heads, adding complexity and overhead to a very hot path. New Approach ============ For architectures/configs where sizeof(struct page) is a power of 2 (the common case), this series changes how position of the head page is encoded in the tail pages. Instead of storing a pointer to the head page, the ->compound_info (renamed from ->compound_head) now stores a mask. The mask can be applied to any tail page's virtual address to compute the head page address. Critically, all tail pages of the same order now have identical compound_info values, regardless of which compound page they belong to. The key insight is that all tail pages of the same order now have identical compound_info values, regardless of which compound page they belong to. In v7, these shared tail pages are allocated per-zone. This ensures that zone information (stored in page->flags) is correct even for shared tail pages, removing the need for the special-casing in page_zonenum() proposed in earlier versions. To support per-zone shared pages for boot-allocated gigantic pages, the vmemmap population is deferred until zones are initialized. 
This simplifies the logic significantly and allows the removal of vmemmap_undo_hvo(). Benefits ======== 1. Simplified compound_head(): No fake head detection needed, can be implemented in a branchless manner. 2. Simplified page_ref_add_unless(): RCU protection removed since there's no race with fake head remapping. 3. Cleaner architecture: The shared tail pages are truly read-only and contain valid tail page metadata. If sizeof(struct page) is not power-of-2, there are no functional changes. HVO is not supported in this configuration. I had hoped to see performance improvement, but my testing thus far has shown either no change or only a slight improvement within the noise. Series Organization =================== Patch 1: Move MAX_FOLIO_ORDER definition to mmzone.h. Patches 2-4: Refactoring of field names and interfaces. Patches 5-6: Architecture alignment for LoongArch and RISC-V. Patch 7: Mask-based compound_head() implementation. Patch 8: Add memmap alignment checks. Patch 9: Branchless compound_head() optimization. Patch 10: Defer vmemmap population for bootmem hugepages. Patch 11: Refactor vmemmap_walk. Patch 12: x86 vDSO build fix. Patch 13: Eliminate fake heads with per-zone shared tail pages. Patches 14-16: Cleanup of fake head infrastructure. Patch 17: Documentation update. Patch 18: Use compound_head() in page_slab(). This patch (of 17): Move MAX_FOLIO_ORDER definition from mm.h to mmzone.h. This is preparation for adding the vmemmap_tails array to struct zone, which requires MAX_FOLIO_ORDER to be available in mmzone.h. 
Link: https://lkml.kernel.org/r/20260227194302.274384-1-kas@kernel.org Link: https://lkml.kernel.org/r/20260227194302.274384-2-kas@kernel.org Signed-off-by: Kiryl Shutsemau Acked-by: David Hildenbrand (Red Hat) Acked-by: Zi Yan Acked-by: Muchun Song Acked-by: Usama Arif Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/mm.h | 31 ------------------------------- include/linux/mmzone.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index bb0cfe38ca19..4e999c21d89a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -2479,36 +2478,6 @@ static inline unsigned long folio_nr_pages(const struct folio *folio) return folio_large_nr_pages(folio); } -#if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS) -/* - * We don't expect any folios that exceed buddy sizes (and consequently - * memory sections). - */ -#define MAX_FOLIO_ORDER MAX_PAGE_ORDER -#elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) -/* - * Only pages within a single memory section are guaranteed to be - * contiguous. By limiting folios to a single memory section, all folio - * pages are guaranteed to be contiguous. - */ -#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT -#elif defined(CONFIG_HUGETLB_PAGE) -/* - * There is no real limit on the folio size. We limit them to the maximum we - * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect - * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit. 
- */ -#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) -#else -/* - * Without hugetlb, gigantic folios that are bigger than a single PUD are - * currently impossible. - */ -#define MAX_FOLIO_ORDER PUD_ORDER -#endif - -#define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) - /* * compound_nr() returns the number of pages in this potentially compound * page. compound_nr() can be called on a tail page, and is defined to diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index db41b18a919d..4c481ec77da9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -23,6 +23,7 @@ #include #include #include +#include #include /* Free memory management - zoned buddy allocator. */ @@ -61,6 +62,36 @@ */ #define PAGE_ALLOC_COSTLY_ORDER 3 +#if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS) +/* + * We don't expect any folios that exceed buddy sizes (and consequently + * memory sections). + */ +#define MAX_FOLIO_ORDER MAX_PAGE_ORDER +#elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +/* + * Only pages within a single memory section are guaranteed to be + * contiguous. By limiting folios to a single memory section, all folio + * pages are guaranteed to be contiguous. + */ +#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT +#elif defined(CONFIG_HUGETLB_PAGE) +/* + * There is no real limit on the folio size. We limit them to the maximum we + * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect + * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit. + */ +#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) +#else +/* + * Without hugetlb, gigantic folios that are bigger than a single PUD are + * currently impossible. 
+ */ +#define MAX_FOLIO_ORDER PUD_ORDER +#endif + +#define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) + enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, From f0369fb13619569ba8564ce8d4fc9d385bbee8a2 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:40 +0000 Subject: [PATCH 104/369] mm: change the interface of prep_compound_tail() Instead of passing down the head page and tail page index, pass the tail and head pages directly, as well as the order of the compound page. This is a preparation for changing how the head position is encoded in the tail page. Link: https://lkml.kernel.org/r/20260227194302.274384-3-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand (arm) Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 +++-- mm/hugetlb.c | 8 +++++--- mm/internal.h | 11 +++++------ mm/mm_init.c | 2 +- mm/page_alloc.c | 2 +- 5 files changed, 15 insertions(+), 13 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 415e9f2ef616..7729a4a28b44 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -870,9 +870,10 @@ static inline bool folio_test_large(const struct folio *folio) return folio_test_head(folio); } -static __always_inline void set_compound_head(struct page *page, struct page *head) +static __always_inline void set_compound_head(struct page *tail, + const struct page *head, unsigned int order) { - WRITE_ONCE(page->compound_head, (unsigned long)head + 1); + WRITE_ONCE(tail->compound_head, (unsigned 
long)head + 1); } static __always_inline void clear_compound_head(struct page *page) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 327eaa4074d3..1d41fa3dd43e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3168,6 +3168,7 @@ found: /* Initialize [start_page:end_page_number] tail struct pages of a hugepage */ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, + struct hstate *h, unsigned long start_page_number, unsigned long end_page_number) { @@ -3176,6 +3177,7 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, struct page *page = folio_page(folio, start_page_number); unsigned long head_pfn = folio_pfn(folio); unsigned long pfn, end_pfn = head_pfn + end_page_number; + unsigned int order = huge_page_order(h); /* * As we marked all tail pages with memblock_reserved_mark_noinit(), @@ -3183,7 +3185,7 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, */ for (pfn = head_pfn + start_page_number; pfn < end_pfn; page++, pfn++) { __init_single_page(page, pfn, zone, nid); - prep_compound_tail((struct page *)folio, pfn - head_pfn); + prep_compound_tail(page, &folio->page, order); set_page_count(page, 0); } } @@ -3203,7 +3205,7 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, __folio_set_head(folio); ret = folio_ref_freeze(folio, 1); VM_BUG_ON(!ret); - hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages); + hugetlb_folio_init_tail_vmemmap(folio, h, 1, nr_pages); prep_compound_head(&folio->page, huge_page_order(h)); } @@ -3260,7 +3262,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, * time as this is early in boot and there should * be no contention. 
*/ - hugetlb_folio_init_tail_vmemmap(folio, + hugetlb_folio_init_tail_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES, pages_per_huge_page(h)); } diff --git a/mm/internal.h b/mm/internal.h index 2daa6a744172..9cfbd8e41914 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -897,13 +897,12 @@ static inline void prep_compound_head(struct page *page, unsigned int order) INIT_LIST_HEAD(&folio->_deferred_list); } -static inline void prep_compound_tail(struct page *head, int tail_idx) +static inline void prep_compound_tail(struct page *tail, + const struct page *head, unsigned int order) { - struct page *p = head + tail_idx; - - p->mapping = TAIL_MAPPING; - set_compound_head(p, head); - set_page_private(p, 0); + tail->mapping = TAIL_MAPPING; + set_compound_head(tail, head, order); + set_page_private(tail, 0); } void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); diff --git a/mm/mm_init.c b/mm/mm_init.c index f903747ca854..5b261f86ba6f 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1110,7 +1110,7 @@ static void __ref memmap_init_compound(struct page *head, struct page *page = pfn_to_page(pfn); __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); - prep_compound_tail(head, pfn - head_pfn); + prep_compound_tail(page, head, order); set_page_count(page, 0); } prep_compound_head(head, order); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 74b603872f34..11f9a0525a3a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -744,7 +744,7 @@ void prep_compound_page(struct page *page, unsigned int order) __SetPageHead(page); for (i = 1; i < nr_pages; i++) - prep_compound_tail(page, i); + prep_compound_tail(page + i, page, order); prep_compound_head(page, order); } From d50569612c29215c5d1c64f47a65604ed265d2e9 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:41 +0000 Subject: [PATCH 105/369] mm: rename the 'compound_head' field in the 'struct page' to 'compound_info' The 'compound_head' field in the 'struct page' encodes 
whether the page is a tail and where to locate the head page. Bit 0 is set if the page is a tail, and the remaining bits in the field point to the head page. As preparation for changing how the field encodes information about the head page, rename the field to 'compound_info'. Link: https://lkml.kernel.org/r/20260227194302.274384-4-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand (arm) Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Signed-off-by: Andrew Morton --- .../admin-guide/kdump/vmcoreinfo.rst | 2 +- Documentation/mm/vmemmap_dedup.rst | 6 +++--- include/linux/mm_types.h | 20 +++++++++---------- include/linux/page-flags.h | 18 ++++++++--------- include/linux/types.h | 2 +- kernel/vmcore_info.c | 2 +- mm/page_alloc.c | 2 +- mm/slab.h | 2 +- mm/util.c | 2 +- 9 files changed, 28 insertions(+), 28 deletions(-) diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index 404a15f6782c..7663c610fe90 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -141,7 +141,7 @@ nodemask_t The size of a nodemask_t type. Used to compute the number of online nodes. 
-(page, flags|_refcount|mapping|lru|_mapcount|private|compound_order|compound_head) +(page, flags|_refcount|mapping|lru|_mapcount|private|compound_order|compound_info) ---------------------------------------------------------------------------------- User-space tools compute their values based on the offset of these diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst index b4a55b6569fa..1863d88d2dcb 100644 --- a/Documentation/mm/vmemmap_dedup.rst +++ b/Documentation/mm/vmemmap_dedup.rst @@ -24,7 +24,7 @@ For each base page, there is a corresponding ``struct page``. Within the HugeTLB subsystem, only the first 4 ``struct page`` are used to contain unique information about a HugeTLB page. ``__NR_USED_SUBPAGE`` provides this upper limit. The only 'useful' information in the remaining ``struct page`` -is the compound_head field, and this field is the same for all tail pages. +is the compound_info field, and this field is the same for all tail pages. By removing redundant ``struct page`` for HugeTLB pages, memory can be returned to the buddy allocator for other uses. @@ -124,10 +124,10 @@ Here is how things look before optimization:: | | +-----------+ -The value of page->compound_head is the same for all tail pages. The first +The value of page->compound_info is the same for all tail pages. The first page of ``struct page`` (page 0) associated with the HugeTLB page contains the 4 ``struct page`` necessary to describe the HugeTLB. The only use of the remaining -pages of ``struct page`` (page 1 to page 7) is to point to page->compound_head. +pages of ``struct page`` (page 1 to page 7) is to point to page->compound_info. Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of ``struct page`` will be used for each HugeTLB page. This will allow us to free the remaining 7 pages to the buddy allocator. 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3cc8ae722886..7bc82a2b889f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -126,14 +126,14 @@ struct page { atomic_long_t pp_ref_count; }; struct { /* Tail pages of compound page */ - unsigned long compound_head; /* Bit zero is set */ + unsigned long compound_info; /* Bit zero is set */ }; struct { /* ZONE_DEVICE pages */ /* - * The first word is used for compound_head or folio + * The first word is used for compound_info or folio * pgmap */ - void *_unused_pgmap_compound_head; + void *_unused_pgmap_compound_info; void *zone_device_data; /* * ZONE_DEVICE private pages are counted as being @@ -409,7 +409,7 @@ struct folio { /* private: avoid cluttering the output */ /* For the Unevictable "LRU list" slot */ struct { - /* Avoid compound_head */ + /* Avoid compound_info */ void *__filler; /* public: */ unsigned int mlock_count; @@ -510,7 +510,7 @@ struct folio { FOLIO_MATCH(flags, flags); FOLIO_MATCH(lru, lru); FOLIO_MATCH(mapping, mapping); -FOLIO_MATCH(compound_head, lru); +FOLIO_MATCH(compound_info, lru); FOLIO_MATCH(__folio_index, index); FOLIO_MATCH(private, private); FOLIO_MATCH(_mapcount, _mapcount); @@ -529,7 +529,7 @@ FOLIO_MATCH(_last_cpupid, _last_cpupid); static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); -FOLIO_MATCH(compound_head, _head_1); +FOLIO_MATCH(compound_info, _head_1); FOLIO_MATCH(_mapcount, _mapcount_1); FOLIO_MATCH(_refcount, _refcount_1); #undef FOLIO_MATCH @@ -537,13 +537,13 @@ FOLIO_MATCH(_refcount, _refcount_1); static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + 2 * sizeof(struct page)) FOLIO_MATCH(flags, _flags_2); -FOLIO_MATCH(compound_head, _head_2); +FOLIO_MATCH(compound_info, _head_2); #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + 3 * sizeof(struct page)) 
FOLIO_MATCH(flags, _flags_3); -FOLIO_MATCH(compound_head, _head_3); +FOLIO_MATCH(compound_info, _head_3); #undef FOLIO_MATCH /** @@ -609,8 +609,8 @@ struct ptdesc { #define TABLE_MATCH(pg, pt) \ static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt)) TABLE_MATCH(flags, pt_flags); -TABLE_MATCH(compound_head, pt_list); -TABLE_MATCH(compound_head, _pt_pad_1); +TABLE_MATCH(compound_info, pt_list); +TABLE_MATCH(compound_info, _pt_pad_1); TABLE_MATCH(mapping, __page_mapping); TABLE_MATCH(__folio_index, pt_index); TABLE_MATCH(rcu_head, pt_rcu_head); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7729a4a28b44..265a798295ff 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -213,7 +213,7 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page /* * Only addresses aligned with PAGE_SIZE of struct page may be fake head * struct page. The alignment check aims to avoid access the fields ( - * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly) + * e.g. compound_info) of the @page[1]. It can avoid touch a (possibly) * cold cacheline in some cases. */ if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && @@ -223,7 +223,7 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page * because the @page is a compound page composed with at least * two contiguous pages. 
*/ - unsigned long head = READ_ONCE(page[1].compound_head); + unsigned long head = READ_ONCE(page[1].compound_info); if (likely(head & 1)) return (const struct page *)(head - 1); @@ -281,7 +281,7 @@ static __always_inline int page_is_fake_head(const struct page *page) static __always_inline unsigned long _compound_head(const struct page *page) { - unsigned long head = READ_ONCE(page->compound_head); + unsigned long head = READ_ONCE(page->compound_info); if (unlikely(head & 1)) return head - 1; @@ -320,13 +320,13 @@ static __always_inline unsigned long _compound_head(const struct page *page) static __always_inline int PageTail(const struct page *page) { - return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); + return READ_ONCE(page->compound_info) & 1 || page_is_fake_head(page); } static __always_inline int PageCompound(const struct page *page) { return test_bit(PG_head, &page->flags.f) || - READ_ONCE(page->compound_head) & 1; + READ_ONCE(page->compound_info) & 1; } #define PAGE_POISON_PATTERN -1l @@ -348,7 +348,7 @@ static const unsigned long *const_folio_flags(const struct folio *folio, { const struct page *page = &folio->page; - VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); + VM_BUG_ON_PGFLAGS(page->compound_info & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); return &page[n].flags.f; } @@ -357,7 +357,7 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) { struct page *page = &folio->page; - VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); + VM_BUG_ON_PGFLAGS(page->compound_info & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); return &page[n].flags.f; } @@ -873,12 +873,12 @@ static inline bool folio_test_large(const struct folio *folio) static __always_inline void set_compound_head(struct page *tail, const struct page *head, unsigned int order) { - WRITE_ONCE(tail->compound_head, (unsigned long)head + 1); + WRITE_ONCE(tail->compound_info, (unsigned long)head 
+ 1); } static __always_inline void clear_compound_head(struct page *page) { - WRITE_ONCE(page->compound_head, 0); + WRITE_ONCE(page->compound_info, 0); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/include/linux/types.h b/include/linux/types.h index 7e71d260763c..608050dbca6a 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -239,7 +239,7 @@ struct ustat { * * This guarantee is important for few reasons: * - future call_rcu_lazy() will make use of lower bits in the pointer; - * - the structure shares storage space in struct page with @compound_head, + * - the structure shares storage space in struct page with @compound_info, * which encode PageTail() in bit 0. The guarantee is needed to avoid * false-positive PageTail(). */ diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index 8d82913223a1..94e4ef75b1b2 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -198,7 +198,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(page, compound_head); + VMCOREINFO_OFFSET(page, compound_info); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); #ifdef CONFIG_FLATMEM diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 11f9a0525a3a..f4f9a98bb425 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -731,7 +731,7 @@ static inline bool pcp_allowed_order(unsigned int order) * The first PAGE_SIZE page is called the "head page" and have PG_head set. * * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded - * in bit 0 of page->compound_head. The rest of bits is pointer to head page. + * in bit 0 of page->compound_info. The rest of bits is pointer to head page. * * The first tail page's ->compound_order holds the order of allocation. * This usage means that zero-order pages may not be compound. 
diff --git a/mm/slab.h b/mm/slab.h index e9ab292acd22..0653cf5fd93a 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -94,7 +94,7 @@ struct slab { #define SLAB_MATCH(pg, sl) \ static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl)) SLAB_MATCH(flags, flags); -SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */ +SLAB_MATCH(compound_info, slab_cache); /* Ensure bit 0 is clear */ SLAB_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG SLAB_MATCH(memcg_data, obj_exts); diff --git a/mm/util.c b/mm/util.c index 419cb81ab353..52400a3c5eb4 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1276,7 +1276,7 @@ void snapshot_page(struct page_snapshot *ps, const struct page *page) again: memset(&ps->folio_snapshot, 0, sizeof(struct folio)); memcpy(&ps->page_snapshot, page, sizeof(*page)); - head = ps->page_snapshot.compound_head; + head = ps->page_snapshot.compound_info; if ((head & 1) == 0) { ps->idx = 0; foliop = (struct folio *)&ps->page_snapshot; From 67c79a5af051f57339ecf383d3f67e200741ce20 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:42 +0000 Subject: [PATCH 106/369] mm: move set/clear_compound_head() next to compound_head() Move set_compound_head() and clear_compound_head() to be adjacent to the compound_head() function in page-flags.h. These functions encode and decode the same compound_info field, so keeping them together makes it easier to verify their logic is consistent, especially when the encoding changes. 
Link: https://lkml.kernel.org/r/20260227194302.274384-5-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand (arm) Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 265a798295ff..5c469d38dd69 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -290,6 +290,17 @@ static __always_inline unsigned long _compound_head(const struct page *page) #define compound_head(page) ((typeof(page))_compound_head(page)) +static __always_inline void set_compound_head(struct page *tail, + const struct page *head, unsigned int order) +{ + WRITE_ONCE(tail->compound_info, (unsigned long)head + 1); +} + +static __always_inline void clear_compound_head(struct page *page) +{ + WRITE_ONCE(page->compound_info, 0); +} + /** * page_folio - Converts from page to folio. * @p: The page. 
@@ -870,17 +881,6 @@ static inline bool folio_test_large(const struct folio *folio) return folio_test_head(folio); } -static __always_inline void set_compound_head(struct page *tail, - const struct page *head, unsigned int order) -{ - WRITE_ONCE(tail->compound_info, (unsigned long)head + 1); -} - -static __always_inline void clear_compound_head(struct page *page) -{ - WRITE_ONCE(page->compound_info, 0); -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void ClearPageCompound(struct page *page) { From 476849b0fba4450f5adf22196bcff9c24c673bc4 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:43 +0000 Subject: [PATCH 107/369] riscv/mm: align vmemmap to maximal folio size The upcoming change to the HugeTLB vmemmap optimization (HVO) requires struct pages of the head page to be naturally aligned with regard to the folio size. Align vmemmap to the newly introduced MAX_FOLIO_VMEMMAP_ALIGN. Link: https://lkml.kernel.org/r/20260227194302.274384-6-kas@kernel.org Signed-off-by: Kiryl Shutsemau Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Hildenbrand (arm) Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/riscv/mm/init.c | 3 ++- include/linux/mmzone.h | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 017bad735d47..b5c50956bb8a 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -63,7 +63,8 @@ phys_addr_t phys_ram_base __ro_after_init; EXPORT_SYMBOL(phys_ram_base); #ifdef CONFIG_SPARSEMEM_VMEMMAP -#define VMEMMAP_ADDR_ALIGN (1ULL << SECTION_SIZE_BITS) +#define VMEMMAP_ADDR_ALIGN max(1ULL << 
SECTION_SIZE_BITS, \ + MAX_FOLIO_VMEMMAP_ALIGN) unsigned long vmemmap_start_pfn __ro_after_init; EXPORT_SYMBOL(vmemmap_start_pfn); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4c481ec77da9..0bef68e41f19 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -92,6 +92,17 @@ #define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) +/* + * HugeTLB Vmemmap Optimization (HVO) requires struct pages of the head page to + * be naturally aligned with regard to the folio size. + * + * HVO which is only active if the size of struct page is a power of 2. + */ +#define MAX_FOLIO_VMEMMAP_ALIGN \ + (IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) && \ + is_power_of_2(sizeof(struct page)) ? \ + MAX_FOLIO_NR_PAGES * sizeof(struct page) : 0) + enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, From 2969b42c8f994330fc020ec0d235aeb43bce317c Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:44 +0000 Subject: [PATCH 108/369] LoongArch/mm: align vmemmap to maximal folio size The upcoming change to the HugeTLB vmemmap optimization (HVO) requires struct pages of the head page to be naturally aligned with regard to the folio size. Align vmemmap to MAX_FOLIO_VMEMMAP_ALIGN. 
Link: https://lkml.kernel.org/r/20260227194302.274384-7-kas@kernel.org Signed-off-by: Kiryl Shutsemau Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Hildenbrand (arm) Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/pgtable.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index a244de27a03e..155f70e93460 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -104,7 +104,8 @@ struct vm_area_struct; min(PTRS_PER_PGD * PTRS_PER_PUD * PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE, (1UL << cpu_vabits) / 2) - PMD_SIZE - VMEMMAP_SIZE - KFENCE_AREA_SIZE) #endif -#define vmemmap ((struct page *)((VMALLOC_END + PMD_SIZE) & PMD_MASK)) +#define VMEMMAP_ALIGN max(PMD_SIZE, MAX_FOLIO_VMEMMAP_ALIGN) +#define vmemmap ((struct page *)(ALIGN(VMALLOC_END, VMEMMAP_ALIGN))) #define VMEMMAP_END ((unsigned long)vmemmap + VMEMMAP_SIZE - 1) #define KFENCE_AREA_START (VMEMMAP_END + 1) From 8c846c879e226c312c2c7a7bc1e323779903530f Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:45 +0000 Subject: [PATCH 109/369] mm: rework compound_head() for power-of-2 sizeof(struct page) For tail pages, the kernel uses the 'compound_info' field to get to the head page. The bit 0 of the field indicates whether the page is a tail page, and if set, the remaining bits represent a pointer to the head page. 
For cases when size of struct page is power-of-2, change the encoding of compound_info to store a mask that can be applied to the virtual address of the tail page in order to access the head page. It is possible because struct page of the head page is naturally aligned with regards to order of the page. The significant impact of this modification is that all tail pages of the same order will now have identical 'compound_info', regardless of the compound page they are associated with. This paves the way for eliminating fake heads. The HugeTLB Vmemmap Optimization (HVO) creates fake heads and it is only applied when the sizeof(struct page) is power-of-2. Having identical tail pages allows the same page to be mapped into the vmemmap of all pages, maintaining memory savings without fake heads. If sizeof(struct page) is not power-of-2, there is no functional changes. Limit mask usage to HugeTLB vmemmap optimization (HVO) where it makes a difference. The approach with mask would work in the wider set of conditions, but it requires validating that struct pages are naturally aligned for all orders up to the MAX_FOLIO_ORDER, which can be tricky. 
Link: https://lkml.kernel.org/r/20260227194302.274384-8-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand (Arm) Acked-by: Usama Arif Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 81 ++++++++++++++++++++++++++++++++++---- mm/slab.h | 16 ++++++-- mm/util.c | 16 ++++++-- 3 files changed, 97 insertions(+), 16 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5c469d38dd69..43876b108f0a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -198,6 +198,29 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H +/* + * For tail pages, if the size of struct page is power-of-2 ->compound_info + * encodes the mask that converts the address of the tail page address to + * the head page address. + * + * Otherwise, ->compound_info has direct pointer to head pages. + */ +static __always_inline bool compound_info_has_mask(void) +{ + /* + * Limit mask usage to HugeTLB vmemmap optimization (HVO) where it + * makes a difference. + * + * The approach with mask would work in the wider set of conditions, + * but it requires validating that struct pages are naturally aligned + * for all orders up to the MAX_FOLIO_ORDER, which can be tricky. 
+ */ + if (!IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP)) + return false; + + return is_power_of_2(sizeof(struct page)); +} + #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); @@ -207,6 +230,10 @@ DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { + /* Fake heads only exists if compound_info_has_mask() is true */ + if (!compound_info_has_mask()) + return page; + if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) return page; @@ -223,10 +250,14 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page * because the @page is a compound page composed with at least * two contiguous pages. */ - unsigned long head = READ_ONCE(page[1].compound_info); + unsigned long info = READ_ONCE(page[1].compound_info); - if (likely(head & 1)) - return (const struct page *)(head - 1); + /* See set_compound_head() */ + if (likely(info & 1)) { + unsigned long p = (unsigned long)page; + + return (const struct page *)(p & info); + } } return page; } @@ -281,11 +312,26 @@ static __always_inline int page_is_fake_head(const struct page *page) static __always_inline unsigned long _compound_head(const struct page *page) { - unsigned long head = READ_ONCE(page->compound_info); + unsigned long info = READ_ONCE(page->compound_info); - if (unlikely(head & 1)) - return head - 1; - return (unsigned long)page_fixed_fake_head(page); + /* Bit 0 encodes PageTail() */ + if (!(info & 1)) + return (unsigned long)page_fixed_fake_head(page); + + /* + * If compound_info_has_mask() is false, the rest of compound_info is + * the pointer to the head page. + */ + if (!compound_info_has_mask()) + return info - 1; + + /* + * If compound_info_has_mask() is true the rest of the info encodes + * the mask that converts the address of the tail page to the head page. 
+ * + * No need to clear bit 0 in the mask as 'page' always has it clear. + */ + return (unsigned long)page & info; } #define compound_head(page) ((typeof(page))_compound_head(page)) @@ -293,7 +339,26 @@ static __always_inline unsigned long _compound_head(const struct page *page) static __always_inline void set_compound_head(struct page *tail, const struct page *head, unsigned int order) { - WRITE_ONCE(tail->compound_info, (unsigned long)head + 1); + unsigned int shift; + unsigned long mask; + + if (!compound_info_has_mask()) { + WRITE_ONCE(tail->compound_info, (unsigned long)head | 1); + return; + } + + /* + * If the size of struct page is power-of-2, bits [shift:0] of the + * virtual address of compound head are zero. + * + * Calculate mask that can be applied to the virtual address of + * the tail page to get address of the head page. + */ + shift = order + order_base_2(sizeof(struct page)); + mask = GENMASK(BITS_PER_LONG - 1, shift); + + /* Bit 0 encodes PageTail() */ + WRITE_ONCE(tail->compound_info, mask | 1); } static __always_inline void clear_compound_head(struct page *page) diff --git a/mm/slab.h b/mm/slab.h index 0653cf5fd93a..ccbdbed18c05 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -131,11 +131,19 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(struct freelist */ static inline struct slab *page_slab(const struct page *page) { - unsigned long head; + unsigned long info; + + info = READ_ONCE(page->compound_info); + if (info & 1) { + /* See compound_head() */ + if (compound_info_has_mask()) { + unsigned long p = (unsigned long)page; + page = (struct page *)(p & info); + } else { + page = (struct page *)(info - 1); + } + } - head = READ_ONCE(page->compound_head); - if (head & 1) - page = (struct page *)(head - 1); if (data_race(page->page_type >> 24) != PGTY_slab) page = NULL; diff --git a/mm/util.c b/mm/util.c index 52400a3c5eb4..ce7ae80047cf 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1266,7 +1266,7 @@ static void set_ps_flags(struct 
page_snapshot *ps, const struct folio *folio, */ void snapshot_page(struct page_snapshot *ps, const struct page *page) { - unsigned long head, nr_pages = 1; + unsigned long info, nr_pages = 1; struct folio *foliop; int loops = 5; @@ -1276,8 +1276,8 @@ void snapshot_page(struct page_snapshot *ps, const struct page *page) again: memset(&ps->folio_snapshot, 0, sizeof(struct folio)); memcpy(&ps->page_snapshot, page, sizeof(*page)); - head = ps->page_snapshot.compound_info; - if ((head & 1) == 0) { + info = ps->page_snapshot.compound_info; + if (!(info & 1)) { ps->idx = 0; foliop = (struct folio *)&ps->page_snapshot; if (!folio_test_large(foliop)) { @@ -1288,7 +1288,15 @@ again: } foliop = (struct folio *)page; } else { - foliop = (struct folio *)(head - 1); + /* See compound_head() */ + if (compound_info_has_mask()) { + unsigned long p = (unsigned long)page; + + foliop = (struct folio *)(p & info); + } else { + foliop = (struct folio *)(info - 1); + } + ps->idx = folio_page_idx(foliop, page); } From 9f94db4c7eaa1737171ec80df20c33eab04c3703 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:46 +0000 Subject: [PATCH 110/369] mm/sparse: check memmap alignment for compound_info_has_mask() If page->compound_info encodes a mask, it is expected that vmemmap to be naturally aligned to the maximum folio size. Add a VM_WARN_ON_ONCE() to check the alignment. 
Link: https://lkml.kernel.org/r/20260227194302.274384-9-kas@kernel.org Signed-off-by: Kiryl Shutsemau Acked-by: Zi Yan Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Hildenbrand (arm) Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: Vlastimil Babka Cc: WANG Xuerui Signed-off-by: Andrew Morton --- mm/sparse.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/sparse.c b/mm/sparse.c index b5b2b6f7041b..dfabe554adf8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -600,6 +600,11 @@ void __init sparse_init(void) BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section))); memblocks_present(); + if (compound_info_has_mask()) { + VM_WARN_ON_ONCE(!IS_ALIGNED((unsigned long) pfn_to_page(0), + MAX_FOLIO_VMEMMAP_ALIGN)); + } + pnum_begin = first_present_section_nr(); nid_begin = sparse_early_nid(__nr_to_section(pnum_begin)); From 209e6d9eb13aaf1b6e0fc6f76afc00d055e5ba12 Mon Sep 17 00:00:00 2001 From: "Kiryl Shutsemau (Meta)" Date: Fri, 27 Feb 2026 19:42:47 +0000 Subject: [PATCH 111/369] mm/hugetlb: defer vmemmap population for bootmem hugepages Currently, the vmemmap for bootmem-allocated gigantic pages is populated early in hugetlb_vmemmap_init_early(). However, the zone information is only available after zones are initialized. If it is later discovered that a page spans multiple zones, the HVO mapping must be undone and replaced with a normal mapping using vmemmap_undo_hvo(). Defer the actual vmemmap population to hugetlb_vmemmap_init_late(). At this stage, zones are already initialized, so it can be checked if the page is valid for HVO before deciding how to populate the vmemmap. This allows us to remove vmemmap_undo_hvo() and the complex logic required to rollback HVO mappings. 
In hugetlb_vmemmap_init_late(), if HVO population fails or if the zones are invalid, fall back to a normal vmemmap population. Postponing population until hugetlb_vmemmap_init_late() also makes zone information available from within vmemmap_populate_hvo(). Link: https://lkml.kernel.org/r/20260227194302.274384-10-kas@kernel.org Signed-off-by: Kiryl Shutsemau (Meta) Acked-by: David Hildenbrand (Arm) Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- mm/hugetlb_vmemmap.c | 37 +++++++++++++++---------------- mm/sparse-vmemmap.c | 53 -------------------------------------------- 3 files changed, 18 insertions(+), 74 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 4e999c21d89a..d7e53532a109 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4481,8 +4481,6 @@ int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node, unsigned long headsize); -int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node, - unsigned long headsize); void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node, unsigned long headsize); void vmemmap_populate_print_last(void); diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index a9280259e12a..935ec5829be9 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -790,7 +790,6 @@ void __init hugetlb_vmemmap_init_early(int nid) { unsigned long psize, paddr, section_size; unsigned long ns, i, pnum, pfn, nr_pages; - unsigned long start, end; 
struct huge_bootmem_page *m = NULL; void *map; @@ -808,14 +807,6 @@ void __init hugetlb_vmemmap_init_early(int nid) paddr = virt_to_phys(m); pfn = PHYS_PFN(paddr); map = pfn_to_page(pfn); - start = (unsigned long)map; - end = start + nr_pages * sizeof(struct page); - - if (vmemmap_populate_hvo(start, end, nid, - HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) - continue; - - memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE); pnum = pfn_to_section_nr(pfn); ns = psize / section_size; @@ -850,28 +841,36 @@ void __init hugetlb_vmemmap_init_late(int nid) h = m->hstate; pfn = PHYS_PFN(phys); nr_pages = pages_per_huge_page(h); + map = pfn_to_page(pfn); + start = (unsigned long)map; + end = start + nr_pages * sizeof(struct page); if (!hugetlb_bootmem_page_zones_valid(nid, m)) { /* * Oops, the hugetlb page spans multiple zones. - * Remove it from the list, and undo HVO. + * Remove it from the list, and populate it normally. */ list_del(&m->list); - map = pfn_to_page(pfn); - - start = (unsigned long)map; - end = start + nr_pages * sizeof(struct page); - - vmemmap_undo_hvo(start, end, nid, - HUGETLB_VMEMMAP_RESERVE_SIZE); - nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE; + vmemmap_populate(start, end, nid, NULL); + nr_mmap = end - start; memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE)); memblock_phys_free(phys, huge_page_size(h)); continue; - } else + } + + if (vmemmap_populate_hvo(start, end, nid, + HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) { + /* Fallback if HVO population fails */ + vmemmap_populate(start, end, nid, NULL); + nr_mmap = end - start; + } else { m->flags |= HUGE_BOOTMEM_ZONES_VALID; + nr_mmap = HUGETLB_VMEMMAP_RESERVE_SIZE; + } + + memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE)); } } #endif diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 37522d6cb398..032a81450838 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -302,59 +302,6 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end, return 
vmemmap_populate_range(start, end, node, altmap, -1, 0); } -/* - * Undo populate_hvo, and replace it with a normal base page mapping. - * Used in memory init in case a HVO mapping needs to be undone. - * - * This can happen when it is discovered that a memblock allocated - * hugetlb page spans multiple zones, which can only be verified - * after zones have been initialized. - * - * We know that: - * 1) The first @headsize / PAGE_SIZE vmemmap pages were individually - * allocated through memblock, and mapped. - * - * 2) The rest of the vmemmap pages are mirrors of the last head page. - */ -int __meminit vmemmap_undo_hvo(unsigned long addr, unsigned long end, - int node, unsigned long headsize) -{ - unsigned long maddr, pfn; - pte_t *pte; - int headpages; - - /* - * Should only be called early in boot, so nothing will - * be accessing these page structures. - */ - WARN_ON(!early_boot_irqs_disabled); - - headpages = headsize >> PAGE_SHIFT; - - /* - * Clear mirrored mappings for tail page structs. - */ - for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) { - pte = virt_to_kpte(maddr); - pte_clear(&init_mm, maddr, pte); - } - - /* - * Clear and free mappings for head page and first tail page - * structs. - */ - for (maddr = addr; headpages-- > 0; maddr += PAGE_SIZE) { - pte = virt_to_kpte(maddr); - pfn = pte_pfn(ptep_get(pte)); - pte_clear(&init_mm, maddr, pte); - memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE); - } - - flush_tlb_kernel_range(addr, end); - - return vmemmap_populate(addr, end, node, NULL); -} - /* * Write protect the mirrored tail page structs for HVO. This will be * called from the hugetlb code when gathering and initializing the From c0b495b91a47b6c5ee54cf00e620dbadeb884253 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:48 +0000 Subject: [PATCH 112/369] mm/hugetlb: refactor code around vmemmap_walk To prepare for removing fake head pages, the vmemmap_walk code is being reworked. 
The reuse_page and reuse_addr variables are being eliminated. There will no longer be an expectation regarding the reuse address in relation to the operated range. Instead, the caller will provide head and tail vmemmap pages. Currently, vmemmap_head and vmemmap_tail are set to the same page, but this will change in the future. The only functional change is that __hugetlb_vmemmap_optimize_folio() will abandon optimization if memory allocation fails. Link: https://lkml.kernel.org/r/20260227194302.274384-11-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Hildenbrand (arm) Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 226 +++++++++++++++++-------------------------- 1 file changed, 90 insertions(+), 136 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 935ec5829be9..3628fb5b2a28 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -25,8 +25,8 @@ * * @remap_pte: called for each lowest-level entry (PTE). * @nr_walked: the number of walked pte. - * @reuse_page: the page which is reused for the tail vmemmap pages. - * @reuse_addr: the virtual address of the @reuse_page page. + * @vmemmap_head: the page to be installed as first in the vmemmap range + * @vmemmap_tail: the page to be installed as non-first in the vmemmap range * @vmemmap_pages: the list head of the vmemmap pages that can be freed * or is mapped from. 
* @flags: used to modify behavior in vmemmap page table walking @@ -35,11 +35,13 @@ struct vmemmap_remap_walk { void (*remap_pte)(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk); + unsigned long nr_walked; - struct page *reuse_page; - unsigned long reuse_addr; + struct page *vmemmap_head; + struct page *vmemmap_tail; struct list_head *vmemmap_pages; + /* Skip the TLB flush when we split the PMD */ #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) /* Skip the TLB flush when we remap the PTE */ @@ -141,14 +143,7 @@ static int vmemmap_pte_entry(pte_t *pte, unsigned long addr, { struct vmemmap_remap_walk *vmemmap_walk = walk->private; - /* - * The reuse_page is found 'first' in page table walking before - * starting remapping. - */ - if (!vmemmap_walk->reuse_page) - vmemmap_walk->reuse_page = pte_page(ptep_get(pte)); - else - vmemmap_walk->remap_pte(pte, addr, vmemmap_walk); + vmemmap_walk->remap_pte(pte, addr, vmemmap_walk); vmemmap_walk->nr_walked++; return 0; @@ -208,18 +203,12 @@ static void free_vmemmap_page_list(struct list_head *list) static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { - /* - * Remap the tail pages as read-only to catch illegal write operation - * to the tail pages. - */ - pgprot_t pgprot = PAGE_KERNEL_RO; struct page *page = pte_page(ptep_get(pte)); pte_t entry; /* Remapping the head page requires r/w */ - if (unlikely(addr == walk->reuse_addr)) { - pgprot = PAGE_KERNEL; - list_del(&walk->reuse_page->lru); + if (unlikely(walk->nr_walked == 0 && walk->vmemmap_head)) { + list_del(&walk->vmemmap_head->lru); /* * Makes sure that preceding stores to the page contents from @@ -227,53 +216,50 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, * write. */ smp_wmb(); + + entry = mk_pte(walk->vmemmap_head, PAGE_KERNEL); + } else { + /* + * Remap the tail pages as read-only to catch illegal write + * operation to the tail pages. 
+ */ + entry = mk_pte(walk->vmemmap_tail, PAGE_KERNEL_RO); } - entry = mk_pte(walk->reuse_page, pgprot); list_add(&page->lru, walk->vmemmap_pages); set_pte_at(&init_mm, addr, pte, entry); } -/* - * How many struct page structs need to be reset. When we reuse the head - * struct page, the special metadata (e.g. page->flags or page->mapping) - * cannot copy to the tail struct page structs. The invalid value will be - * checked in the free_tail_page_prepare(). In order to avoid the message - * of "corrupted mapping in tail page". We need to reset at least 4 (one - * head struct page struct and three tail struct page structs) struct page - * structs. - */ -#define NR_RESET_STRUCT_PAGE 4 - -static inline void reset_struct_pages(struct page *start) -{ - struct page *from = start + NR_RESET_STRUCT_PAGE; - - BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page)); - memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE); -} - static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { - pgprot_t pgprot = PAGE_KERNEL; struct page *page; - void *to; - - BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page); + struct page *from, *to; page = list_first_entry(walk->vmemmap_pages, struct page, lru); list_del(&page->lru); + + /* + * Initialize tail pages in the newly allocated vmemmap page. + * + * There is folio-scope metadata that is encoded in the first few + * tail pages. + * + * Use the value last tail page in the page with the head page + * to initialize the rest of tail pages. + */ + from = compound_head((struct page *)addr) + + PAGE_SIZE / sizeof(struct page) - 1; to = page_to_virt(page); - copy_page(to, (void *)walk->reuse_addr); - reset_struct_pages(to); + for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++, to++) + *to = *from; /* * Makes sure that preceding stores to the page contents become visible * before the set_pte_at() write. 
*/ smp_wmb(); - set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); + set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL)); } /** @@ -283,33 +269,28 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. - * @reuse: reuse address. - * * Return: %0 on success, negative error code otherwise. */ -static int vmemmap_remap_split(unsigned long start, unsigned long end, - unsigned long reuse) +static int vmemmap_remap_split(unsigned long start, unsigned long end) { struct vmemmap_remap_walk walk = { .remap_pte = NULL, .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH, }; - /* See the comment in the vmemmap_remap_free(). */ - BUG_ON(start - reuse != PAGE_SIZE); - - return vmemmap_remap_range(reuse, end, &walk); + return vmemmap_remap_range(start, end, &walk); } /** * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) - * to the page which @reuse is mapped to, then free vmemmap - * which the range are mapped to. + * to use @vmemmap_head/tail, then free vmemmap which + * the range are mapped to. * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. - * @reuse: reuse address. + * @vmemmap_head: the page to be installed as first in the vmemmap range + * @vmemmap_tail: the page to be installed as non-first in the vmemmap range * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers * responsibility to free pages. * @flags: modifications to vmemmap_remap_walk flags @@ -317,69 +298,38 @@ static int vmemmap_remap_split(unsigned long start, unsigned long end, * Return: %0 on success, negative error code otherwise. 
*/ static int vmemmap_remap_free(unsigned long start, unsigned long end, - unsigned long reuse, + struct page *vmemmap_head, + struct page *vmemmap_tail, struct list_head *vmemmap_pages, unsigned long flags) { int ret; struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_remap_pte, - .reuse_addr = reuse, + .vmemmap_head = vmemmap_head, + .vmemmap_tail = vmemmap_tail, .vmemmap_pages = vmemmap_pages, .flags = flags, }; - int nid = page_to_nid((struct page *)reuse); - gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; + + ret = vmemmap_remap_range(start, end, &walk); + if (!ret || !walk.nr_walked) + return ret; + + end = start + walk.nr_walked * PAGE_SIZE; /* - * Allocate a new head vmemmap page to avoid breaking a contiguous - * block of struct page memory when freeing it back to page allocator - * in free_vmemmap_page_list(). This will allow the likely contiguous - * struct page backing memory to be kept contiguous and allowing for - * more allocations of hugepages. Fallback to the currently - * mapped head page in case should it fail to allocate. + * vmemmap_pages contains pages from the previous vmemmap_remap_range() + * call which failed. These are pages which were removed from + * the vmemmap. They will be restored in the following call. */ - walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0); - if (walk.reuse_page) { - copy_page(page_to_virt(walk.reuse_page), - (void *)walk.reuse_addr); - list_add(&walk.reuse_page->lru, vmemmap_pages); - memmap_pages_add(1); - } + walk = (struct vmemmap_remap_walk) { + .remap_pte = vmemmap_restore_pte, + .vmemmap_pages = vmemmap_pages, + .flags = 0, + }; - /* - * In order to make remapping routine most efficient for the huge pages, - * the routine of vmemmap page table walking has the following rules - * (see more details from the vmemmap_pte_range()): - * - * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE) - * should be continuous. 
- * - The @reuse address is part of the range [@reuse, @end) that we are - * walking which is passed to vmemmap_remap_range(). - * - The @reuse address is the first in the complete range. - * - * So we need to make sure that @start and @reuse meet the above rules. - */ - BUG_ON(start - reuse != PAGE_SIZE); - - ret = vmemmap_remap_range(reuse, end, &walk); - if (ret && walk.nr_walked) { - end = reuse + walk.nr_walked * PAGE_SIZE; - /* - * vmemmap_pages contains pages from the previous - * vmemmap_remap_range call which failed. These - * are pages which were removed from the vmemmap. - * They will be restored in the following call. - */ - walk = (struct vmemmap_remap_walk) { - .remap_pte = vmemmap_restore_pte, - .reuse_addr = reuse, - .vmemmap_pages = vmemmap_pages, - .flags = 0, - }; - - vmemmap_remap_range(reuse, end, &walk); - } + vmemmap_remap_range(start, end, &walk); return ret; } @@ -416,29 +366,24 @@ out: * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. - * @reuse: reuse address. * @flags: modifications to vmemmap_remap_walk flags * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, - unsigned long reuse, unsigned long flags) + unsigned long flags) { LIST_HEAD(vmemmap_pages); struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_restore_pte, - .reuse_addr = reuse, .vmemmap_pages = &vmemmap_pages, .flags = flags, }; - /* See the comment in the vmemmap_remap_free(). 
*/ - BUG_ON(start - reuse != PAGE_SIZE); - if (alloc_vmemmap_page_list(start, end, &vmemmap_pages)) return -ENOMEM; - return vmemmap_remap_range(reuse, end, &walk); + return vmemmap_remap_range(start, end, &walk); } DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); @@ -455,8 +400,7 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio, unsigned long flags) { int ret; - unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; - unsigned long vmemmap_reuse; + unsigned long vmemmap_start, vmemmap_end; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); @@ -467,18 +411,18 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, if (flags & VMEMMAP_SYNCHRONIZE_RCU) synchronize_rcu(); + vmemmap_start = (unsigned long)&folio->page; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); - vmemmap_reuse = vmemmap_start; + vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* * The pages which the vmemmap virtual address range [@vmemmap_start, - * @vmemmap_end) are mapped to are freed to the buddy allocator, and - * the range is mapped to the page which @vmemmap_reuse is mapped to. + * @vmemmap_end) are mapped to are freed to the buddy allocator. * When a HugeTLB page is freed to the buddy allocator, previously * discarded vmemmap pages must be allocated and remapping. 
*/ - ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags); + ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, flags); if (!ret) { folio_clear_hugetlb_vmemmap_optimized(folio); static_branch_dec(&hugetlb_optimize_vmemmap_key); @@ -566,9 +510,9 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct list_head *vmemmap_pages, unsigned long flags) { - int ret = 0; - unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; - unsigned long vmemmap_reuse; + unsigned long vmemmap_start, vmemmap_end; + struct page *vmemmap_head, *vmemmap_tail; + int nid, ret = 0; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); @@ -593,18 +537,30 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, */ folio_set_hugetlb_vmemmap_optimized(folio); + nid = folio_nid(folio); + vmemmap_head = alloc_pages_node(nid, GFP_KERNEL, 0); + if (!vmemmap_head) { + ret = -ENOMEM; + goto out; + } + + copy_page(page_to_virt(vmemmap_head), folio); + list_add(&vmemmap_head->lru, vmemmap_pages); + memmap_pages_add(1); + + vmemmap_tail = vmemmap_head; + vmemmap_start = (unsigned long)&folio->page; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); - vmemmap_reuse = vmemmap_start; - vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* - * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end) - * to the page which @vmemmap_reuse is mapped to. Add pages previously - * mapping the range to vmemmap_pages list so that they can be freed by - * the caller. + * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end). + * Add pages previously mapping the range to vmemmap_pages list so that + * they can be freed by the caller. 
*/ - ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, + ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, + vmemmap_head, vmemmap_tail, vmemmap_pages, flags); +out: if (ret) { static_branch_dec(&hugetlb_optimize_vmemmap_key); folio_clear_hugetlb_vmemmap_optimized(folio); @@ -633,21 +589,19 @@ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio) { - unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; - unsigned long vmemmap_reuse; + unsigned long vmemmap_start, vmemmap_end; if (!vmemmap_should_optimize_folio(h, folio)) return 0; + vmemmap_start = (unsigned long)&folio->page; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); - vmemmap_reuse = vmemmap_start; - vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* * Split PMDs on the vmemmap virtual address range [@vmemmap_start, * @vmemmap_end] */ - return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse); + return vmemmap_remap_split(vmemmap_start, vmemmap_end); } static void __hugetlb_vmemmap_optimize_folios(struct hstate *h, From 76351f2f0c27b36600caa8767bc384745f51c7de Mon Sep 17 00:00:00 2001 From: "Kiryl Shutsemau (Meta)" Date: Fri, 27 Feb 2026 19:42:49 +0000 Subject: [PATCH 113/369] x86/vdso: undefine CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP for vdso32 The 32-bit VDSO build on x86_64 uses fake_32bit_build.h to undefine various kernel configuration options that are not suitable for the VDSO context or may cause build issues when including kernel headers. Undefine CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP in fake_32bit_build.h to prepare for change in HugeTLB Vmemmap Optimization. 
Link: https://lkml.kernel.org/r/20260227194302.274384-12-kas@kernel.org Signed-off-by: Kiryl Shutsemau (Meta) Acked-by: David Hildenbrand (Arm) Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/x86/entry/vdso/vdso32/fake_32bit_build.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/entry/vdso/vdso32/fake_32bit_build.h b/arch/x86/entry/vdso/vdso32/fake_32bit_build.h index db1b15f686e3..bc3e549795c3 100644 --- a/arch/x86/entry/vdso/vdso32/fake_32bit_build.h +++ b/arch/x86/entry/vdso/vdso32/fake_32bit_build.h @@ -11,6 +11,7 @@ #undef CONFIG_PGTABLE_LEVELS #undef CONFIG_ILLEGAL_POINTER_VALUE #undef CONFIG_SPARSEMEM_VMEMMAP +#undef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP #undef CONFIG_NR_CPUS #undef CONFIG_PARAVIRT_XXL From 622026e87c4019e609010811757e31193cc23847 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:50 +0000 Subject: [PATCH 114/369] mm/hugetlb: remove fake head pages HugeTLB Vmemmap Optimization (HVO) reduces memory usage by freeing most vmemmap pages for huge pages and remapping the freed range to a single page containing the struct page metadata. With the new mask-based compound_info encoding (for power-of-2 struct page sizes), all tail pages of the same order are now identical regardless of which compound page they belong to. This means the tail pages can be truly shared without fake heads. Allocate a single page of initialized tail struct pages per zone per order in the vmemmap_tails[] array in struct zone. All huge pages of that order in the zone share this tail page, mapped read-only into their vmemmap. 
The head page remains unique per huge page. Redefine MAX_FOLIO_ORDER using ilog2(). The define has to produce a compile-time constant as it is used to specify the vmemmap_tail array size. For some reason, the compiler is not able to evaluate get_order() at compile time, but ilog2() works. Avoid PUD_ORDER to define MAX_FOLIO_ORDER as it adds a header dependency which generates a hard-to-break include loop. This eliminates fake heads while maintaining the same memory savings, and simplifies compound_head() by removing fake head detection. Link: https://lkml.kernel.org/r/20260227194302.274384-13-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Vlastimil Babka (SUSE) Acked-by: David Hildenbrand (Arm) Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 +- include/linux/mmzone.h | 19 +++++++++-- mm/hugetlb_vmemmap.c | 73 ++++++++++++++++++++++++++++++++++++++++-- mm/internal.h | 9 ++++++ mm/sparse-vmemmap.c | 57 +++++++++++++++++++++++++++------ 5 files changed, 146 insertions(+), 15 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d7e53532a109..19619e5efeba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4479,7 +4479,8 @@ int vmemmap_populate_hugepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); -int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node, +int vmemmap_populate_hvo(unsigned long start, unsigned long end, + unsigned int order, struct zone *zone, unsigned long headsize); void
vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node, unsigned long headsize); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0bef68e41f19..5c3ae0348754 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -81,13 +81,17 @@ * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit. */ -#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) +#ifdef CONFIG_64BIT +#define MAX_FOLIO_ORDER (ilog2(SZ_16G) - PAGE_SHIFT) +#else +#define MAX_FOLIO_ORDER (ilog2(SZ_1G) - PAGE_SHIFT) +#endif #else /* * Without hugetlb, gigantic folios that are bigger than a single PUD are * currently impossible. */ -#define MAX_FOLIO_ORDER PUD_ORDER +#define MAX_FOLIO_ORDER (PUD_SHIFT - PAGE_SHIFT) #endif #define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) @@ -103,6 +107,14 @@ is_power_of_2(sizeof(struct page)) ? \ MAX_FOLIO_NR_PAGES * sizeof(struct page) : 0) +/* + * vmemmap optimization (like HVO) is only possible for page orders that fill + * two or more pages with struct pages. + */ +#define VMEMMAP_TAIL_MIN_ORDER (ilog2(2 * PAGE_SIZE / sizeof(struct page))) +#define __NR_VMEMMAP_TAILS (MAX_FOLIO_ORDER - VMEMMAP_TAIL_MIN_ORDER + 1) +#define NR_VMEMMAP_TAILS (__NR_VMEMMAP_TAILS > 0 ? 
__NR_VMEMMAP_TAILS : 0) + enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, @@ -1113,6 +1125,9 @@ struct zone { /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP + struct page *vmemmap_tails[NR_VMEMMAP_TAILS]; +#endif } ____cacheline_internodealigned_in_smp; enum pgdat_flags { diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 3628fb5b2a28..92330f172eb7 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -19,6 +19,7 @@ #include #include "hugetlb_vmemmap.h" +#include "internal.h" /** * struct vmemmap_remap_walk - walk vmemmap page table @@ -505,6 +506,32 @@ static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio * return true; } +static struct page *vmemmap_get_tail(unsigned int order, struct zone *zone) +{ + const unsigned int idx = order - VMEMMAP_TAIL_MIN_ORDER; + struct page *tail, *p; + int node = zone_to_nid(zone); + + tail = READ_ONCE(zone->vmemmap_tails[idx]); + if (likely(tail)) + return tail; + + tail = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!tail) + return NULL; + + p = page_to_virt(tail); + for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++) + init_compound_tail(p + i, NULL, order, zone); + + if (cmpxchg(&zone->vmemmap_tails[idx], NULL, tail)) { + __free_page(tail); + tail = READ_ONCE(zone->vmemmap_tails[idx]); + } + + return tail; +} + static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio, struct list_head *vmemmap_pages, @@ -520,6 +547,11 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, if (!vmemmap_should_optimize_folio(h, folio)) return ret; + nid = folio_nid(folio); + vmemmap_tail = vmemmap_get_tail(h->order, folio_zone(folio)); + if (!vmemmap_tail) + return -ENOMEM; + static_branch_inc(&hugetlb_optimize_vmemmap_key); if (flags & VMEMMAP_SYNCHRONIZE_RCU) @@ -537,7 +569,6 @@ static int 
__hugetlb_vmemmap_optimize_folio(const struct hstate *h, */ folio_set_hugetlb_vmemmap_optimized(folio); - nid = folio_nid(folio); vmemmap_head = alloc_pages_node(nid, GFP_KERNEL, 0); if (!vmemmap_head) { ret = -ENOMEM; @@ -548,7 +579,6 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, list_add(&vmemmap_head->lru, vmemmap_pages); memmap_pages_add(1); - vmemmap_tail = vmemmap_head; vmemmap_start = (unsigned long)&folio->page; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); @@ -776,11 +806,26 @@ void __init hugetlb_vmemmap_init_early(int nid) } } +static struct zone *pfn_to_zone(unsigned nid, unsigned long pfn) +{ + struct zone *zone; + enum zone_type zone_type; + + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { + zone = &NODE_DATA(nid)->node_zones[zone_type]; + if (zone_spans_pfn(zone, pfn)) + return zone; + } + + return NULL; +} + void __init hugetlb_vmemmap_init_late(int nid) { struct huge_bootmem_page *m, *tm; unsigned long phys, nr_pages, start, end; unsigned long pfn, nr_mmap; + struct zone *zone = NULL; struct hstate *h; void *map; @@ -814,7 +859,12 @@ void __init hugetlb_vmemmap_init_late(int nid) continue; } - if (vmemmap_populate_hvo(start, end, nid, + if (!zone || !zone_spans_pfn(zone, pfn)) + zone = pfn_to_zone(nid, pfn); + if (WARN_ON_ONCE(!zone)) + continue; + + if (vmemmap_populate_hvo(start, end, huge_page_order(h), zone, HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) { /* Fallback if HVO population fails */ vmemmap_populate(start, end, nid, NULL); @@ -842,10 +892,27 @@ static const struct ctl_table hugetlb_vmemmap_sysctls[] = { static int __init hugetlb_vmemmap_init(void) { const struct hstate *h; + struct zone *zone; /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES); + for_each_zone(zone) { + for (int i = 0; i < NR_VMEMMAP_TAILS; i++) { + struct page *tail, *p; + unsigned int order; + + tail = zone->vmemmap_tails[i]; + if (!tail) + 
continue; + + order = i + VMEMMAP_TAIL_MIN_ORDER; + p = page_to_virt(tail); + for (int j = 0; j < PAGE_SIZE / sizeof(struct page); j++) + init_compound_tail(p + j, NULL, order, zone); + } + } + for_each_hstate(h) { if (hugetlb_vmemmap_optimizable(h)) { register_sysctl_init("vm", hugetlb_vmemmap_sysctls); diff --git a/mm/internal.h b/mm/internal.h index 9cfbd8e41914..84167b0570c9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -905,6 +905,15 @@ static inline void prep_compound_tail(struct page *tail, set_page_private(tail, 0); } +static inline void init_compound_tail(struct page *tail, + const struct page *head, unsigned int order, struct zone *zone) +{ + atomic_set(&tail->_mapcount, -1); + set_page_node(tail, zone_to_nid(zone)); + set_page_zone(tail, zone_idx(zone)); + prep_compound_tail(tail, head, order); +} + void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern bool free_pages_prepare(struct page *page, unsigned int order); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 032a81450838..842ed2f0bce6 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -325,16 +325,54 @@ void vmemmap_wrprotect_hvo(unsigned long addr, unsigned long end, } } -/* - * Populate vmemmap pages HVO-style. The first page contains the head - * page and needed tail pages, the other ones are mirrors of the first - * page. - */ -int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end, - int node, unsigned long headsize) +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +static __meminit struct page *vmemmap_get_tail(unsigned int order, struct zone *zone) +{ + struct page *p, *tail; + unsigned int idx; + int node = zone_to_nid(zone); + + if (WARN_ON_ONCE(order < VMEMMAP_TAIL_MIN_ORDER)) + return NULL; + if (WARN_ON_ONCE(order > MAX_FOLIO_ORDER)) + return NULL; + + idx = order - VMEMMAP_TAIL_MIN_ORDER; + tail = zone->vmemmap_tails[idx]; + if (tail) + return tail; + + /* + * Only allocate the page, but do not initialize it. 
+ * + * Any initialization done here will be overwritten by memmap_init(). + * + * hugetlb_vmemmap_init() will take care of initialization after + * memmap_init(). + */ + + p = vmemmap_alloc_block_zero(PAGE_SIZE, node); + if (!p) + return NULL; + + tail = virt_to_page(p); + zone->vmemmap_tails[idx] = tail; + + return tail; +} + +int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end, + unsigned int order, struct zone *zone, + unsigned long headsize) { - pte_t *pte; unsigned long maddr; + struct page *tail; + pte_t *pte; + int node = zone_to_nid(zone); + + tail = vmemmap_get_tail(order, zone); + if (!tail) + return -ENOMEM; for (maddr = addr; maddr < addr + headsize; maddr += PAGE_SIZE) { pte = vmemmap_populate_address(maddr, node, NULL, -1, 0); @@ -346,8 +384,9 @@ int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end, * Reuse the last page struct page mapped above for the rest. */ return vmemmap_populate_range(maddr, end, node, NULL, - pte_pfn(ptep_get(pte)), 0); + page_to_pfn(tail), 0); } +#endif void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, unsigned long addr, unsigned long next) From 32c440d67e6cd96a715007d0e62eb970b0c49abc Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:51 +0000 Subject: [PATCH 115/369] mm: drop fake head checks With fake head pages eliminated in the previous commit, remove the supporting infrastructure: - page_fixed_fake_head(): no longer needed to detect fake heads; - page_is_fake_head(): no longer needed; - page_count_writable(): no longer needed for RCU protection; - RCU read_lock in page_ref_add_unless(): no longer needed; This substantially simplifies compound_head() and page_ref_add_unless(), removing both branches and RCU overhead from these hot paths. RCU was required to serialize allocation of hugetlb page against get_page_unless_zero() and prevent writing to read-only fake head. It is redundant without fake heads. 
See bd225530a4c7 ("mm/hugetlb_vmemmap: fix race with speculative PFN walkers") for more details. synchronize_rcu() in mm/hugetlb_vmemmap.c will be removed by a separate patch. Link: https://lkml.kernel.org/r/20260227194302.274384-14-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Acked-by: David Hildenbrand (Arm) Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 93 ++------------------------------------ include/linux/page_ref.h | 8 +--- 2 files changed, 4 insertions(+), 97 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 43876b108f0a..b8eef2181598 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -221,102 +221,15 @@ static __always_inline bool compound_info_has_mask(void) return is_power_of_2(sizeof(struct page)); } -#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); -/* - * Return the real head page struct iff the @page is a fake head page, otherwise - * return the @page itself. See Documentation/mm/vmemmap_dedup.rst. - */ -static __always_inline const struct page *page_fixed_fake_head(const struct page *page) -{ - /* Fake heads only exists if compound_info_has_mask() is true */ - if (!compound_info_has_mask()) - return page; - - if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) - return page; - - /* - * Only addresses aligned with PAGE_SIZE of struct page may be fake head - * struct page. The alignment check aims to avoid access the fields ( - * e.g. compound_info) of the @page[1]. 
It can avoid touch a (possibly) - * cold cacheline in some cases. - */ - if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && - test_bit(PG_head, &page->flags.f)) { - /* - * We can safely access the field of the @page[1] with PG_head - * because the @page is a compound page composed with at least - * two contiguous pages. - */ - unsigned long info = READ_ONCE(page[1].compound_info); - - /* See set_compound_head() */ - if (likely(info & 1)) { - unsigned long p = (unsigned long)page; - - return (const struct page *)(p & info); - } - } - return page; -} - -static __always_inline bool page_count_writable(const struct page *page, int u) -{ - if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) - return true; - - /* - * The refcount check is ordered before the fake-head check to prevent - * the following race: - * CPU 1 (HVO) CPU 2 (speculative PFN walker) - * - * page_ref_freeze() - * synchronize_rcu() - * rcu_read_lock() - * page_is_fake_head() is false - * vmemmap_remap_pte() - * XXX: struct page[] becomes r/o - * - * page_ref_unfreeze() - * page_ref_count() is not zero - * - * atomic_add_unless(&page->_refcount) - * XXX: try to modify r/o struct page[] - * - * The refcount check also prevents modification attempts to other (r/o) - * tail pages that are not fake heads. 
- */ - if (atomic_read_acquire(&page->_refcount) == u) - return false; - - return page_fixed_fake_head(page) == page; -} -#else -static inline const struct page *page_fixed_fake_head(const struct page *page) -{ - return page; -} - -static inline bool page_count_writable(const struct page *page, int u) -{ - return true; -} -#endif - -static __always_inline int page_is_fake_head(const struct page *page) -{ - return page_fixed_fake_head(page) != page; -} - static __always_inline unsigned long _compound_head(const struct page *page) { unsigned long info = READ_ONCE(page->compound_info); /* Bit 0 encodes PageTail() */ if (!(info & 1)) - return (unsigned long)page_fixed_fake_head(page); + return (unsigned long)page; /* * If compound_info_has_mask() is false, the rest of compound_info is @@ -396,7 +309,7 @@ static __always_inline void clear_compound_head(struct page *page) static __always_inline int PageTail(const struct page *page) { - return READ_ONCE(page->compound_info) & 1 || page_is_fake_head(page); + return READ_ONCE(page->compound_info) & 1; } static __always_inline int PageCompound(const struct page *page) @@ -928,7 +841,7 @@ static __always_inline bool folio_test_head(const struct folio *folio) static __always_inline int PageHead(const struct page *page) { PF_POISONED_CHECK(page); - return test_bit(PG_head, &page->flags.f) && !page_is_fake_head(page); + return test_bit(PG_head, &page->flags.f); } __SETPAGEFLAG(Head, head, PF_ANY) diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 544150d1d5fd..490d0ad6e56d 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -230,13 +230,7 @@ static inline int folio_ref_dec_return(struct folio *folio) static inline bool page_ref_add_unless(struct page *page, int nr, int u) { - bool ret = false; - - rcu_read_lock(); - /* avoid writing to the vmemmap area being remapped */ - if (page_count_writable(page, u)) - ret = atomic_add_unless(&page->_refcount, nr, u); - rcu_read_unlock(); + bool ret = 
atomic_add_unless(&page->_refcount, nr, u); if (page_ref_tracepoint_active(page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); From 01b1d0ffb6f7f37f466033cf167959946309ed3d Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:52 +0000 Subject: [PATCH 116/369] hugetlb: remove VMEMMAP_SYNCHRONIZE_RCU The VMEMMAP_SYNCHRONIZE_RCU flag triggered synchronize_rcu() calls to prevent a race between HVO remapping and page_ref_add_unless(). The race could occur when a speculative PFN walker tried to modify the refcount on a struct page that was in the process of being remapped to a fake head. With fake heads eliminated, page_ref_add_unless() no longer needs RCU protection. Remove the flag and synchronize_rcu() calls. Link: https://lkml.kernel.org/r/20260227194302.274384-15-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Reviewed-by: David Hildenbrand (Arm) Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 92330f172eb7..fd1d5d5d12b4 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -47,8 +47,6 @@ struct vmemmap_remap_walk { #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) /* Skip the TLB flush when we remap the PTE */ #define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1) -/* synchronize_rcu() to avoid writes from page_ref_add_unless() */ -#define VMEMMAP_SYNCHRONIZE_RCU BIT(2) unsigned long flags; }; @@ -409,9 +407,6 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, if 
(!folio_test_hugetlb_vmemmap_optimized(folio)) return 0; - if (flags & VMEMMAP_SYNCHRONIZE_RCU) - synchronize_rcu(); - vmemmap_start = (unsigned long)&folio->page; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); @@ -444,7 +439,7 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, */ int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { - return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU); + return __hugetlb_vmemmap_restore_folio(h, folio, 0); } /** @@ -467,14 +462,11 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct folio *folio, *t_folio; long restored = 0; long ret = 0; - unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU; + unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH; list_for_each_entry_safe(folio, t_folio, folio_list, lru) { if (folio_test_hugetlb_vmemmap_optimized(folio)) { ret = __hugetlb_vmemmap_restore_folio(h, folio, flags); - /* only need to synchronize_rcu() once for each batch */ - flags &= ~VMEMMAP_SYNCHRONIZE_RCU; - if (ret) break; restored++; @@ -554,8 +546,6 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, static_branch_inc(&hugetlb_optimize_vmemmap_key); - if (flags & VMEMMAP_SYNCHRONIZE_RCU) - synchronize_rcu(); /* * Very Subtle * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed @@ -613,7 +603,7 @@ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { LIST_HEAD(vmemmap_pages); - __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU); + __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0); free_vmemmap_page_list(&vmemmap_pages); } @@ -641,7 +631,7 @@ static void __hugetlb_vmemmap_optimize_folios(struct hstate *h, struct folio *folio; int nr_to_optimize; LIST_HEAD(vmemmap_pages); - unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU; + unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH; 
nr_to_optimize = 0; list_for_each_entry(folio, folio_list, lru) { @@ -694,8 +684,6 @@ static void __hugetlb_vmemmap_optimize_folios(struct hstate *h, int ret; ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); - /* only need to synchronize_rcu() once for each batch */ - flags &= ~VMEMMAP_SYNCHRONIZE_RCU; /* * Pages to be freed may have been accumulated. If we From da3e2d1ca43de56a83a806237b6be7e91cf07052 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:53 +0000 Subject: [PATCH 117/369] mm/hugetlb: remove hugetlb_optimize_vmemmap_key static key The hugetlb_optimize_vmemmap_key static key was used to guard fake head detection in compound_head() and related functions. It allowed skipping the fake head checks entirely when HVO was not in use. With fake heads eliminated and the detection code removed, the static key serves no purpose. Remove its definition and all increment/decrement calls. Link: https://lkml.kernel.org/r/20260227194302.274384-16-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Acked-by: David Hildenbrand (Arm) Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 2 -- mm/hugetlb_vmemmap.c | 14 ++------------ 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b8eef2181598..f361bd6c814c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -221,8 +221,6 @@ static __always_inline bool compound_info_has_mask(void) return is_power_of_2(sizeof(struct page)); } 
-DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); - static __always_inline unsigned long _compound_head(const struct page *page) { unsigned long info = READ_ONCE(page->compound_info); diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index fd1d5d5d12b4..4a077d231d3a 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -385,9 +385,6 @@ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, return vmemmap_remap_range(start, end, &walk); } -DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); -EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); - static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); static int __init hugetlb_vmemmap_optimize_param(char *buf) { @@ -419,10 +416,8 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, * discarded vmemmap pages must be allocated and remapping. */ ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, flags); - if (!ret) { + if (!ret) folio_clear_hugetlb_vmemmap_optimized(folio); - static_branch_dec(&hugetlb_optimize_vmemmap_key); - } return ret; } @@ -544,8 +539,6 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, if (!vmemmap_tail) return -ENOMEM; - static_branch_inc(&hugetlb_optimize_vmemmap_key); - /* * Very Subtle * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed @@ -581,10 +574,8 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, vmemmap_head, vmemmap_tail, vmemmap_pages, flags); out: - if (ret) { - static_branch_dec(&hugetlb_optimize_vmemmap_key); + if (ret) folio_clear_hugetlb_vmemmap_optimized(folio); - } return ret; } @@ -650,7 +641,6 @@ static void __hugetlb_vmemmap_optimize_folios(struct hstate *h, register_page_bootmem_memmap(pfn_to_section_nr(spfn), &folio->page, HUGETLB_VMEMMAP_RESERVE_SIZE); - static_branch_inc(&hugetlb_optimize_vmemmap_key); continue; } From 66b2a3d9ae460934fef5fd588077730f483e8c8c Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: 
Fri, 27 Feb 2026 19:42:54 +0000 Subject: [PATCH 118/369] mm: remove the branch from compound_head() The compound_head() function is a hot path. For example, the zap path calls it for every leaf page table entry. Rewrite the helper function in a branchless manner to eliminate the risk of CPU branch misprediction. Link: https://lkml.kernel.org/r/20260227194302.274384-17-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand (Arm) Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f361bd6c814c..7223f6f4e2b4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -224,25 +224,32 @@ static __always_inline bool compound_info_has_mask(void) static __always_inline unsigned long _compound_head(const struct page *page) { unsigned long info = READ_ONCE(page->compound_info); + unsigned long mask; + + if (!compound_info_has_mask()) { + /* Bit 0 encodes PageTail() */ + if (info & 1) + return info - 1; - /* Bit 0 encodes PageTail() */ - if (!(info & 1)) return (unsigned long)page; - - /* - * If compound_info_has_mask() is false, the rest of compound_info is - * the pointer to the head page. - */ - if (!compound_info_has_mask()) - return info - 1; + } /* * If compound_info_has_mask() is true the rest of the info encodes * the mask that converts the address of the tail page to the head page. 
* * No need to clear bit 0 in the mask as 'page' always has it clear. + * + * Let's do it in a branchless manner. */ - return (unsigned long)page & info; + + /* Non-tail: -1UL, Tail: 0 */ + mask = (info & 1) - 1; + + /* Non-tail: -1UL, Tail: info */ + mask |= info; + + return (unsigned long)page & mask; } #define compound_head(page) ((typeof(page))_compound_head(page)) From fed8676ca2b0195647046dfc955f86f53ab837f0 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:55 +0000 Subject: [PATCH 119/369] hugetlb: update vmemmap_dedup.rst Update the documentation regarding vmemmap optimization for hugetlb to reflect the changes in how the kernel maps the tail pages. Fake heads no longer exist. Remove their description. [kas@kernel.org: update vmemmap_dedup.rst] Link: https://lkml.kernel.org/r/20260302105630.303492-1-kas@kernel.org Link: https://lkml.kernel.org/r/20260227194302.274384-18-kas@kernel.org Signed-off-by: Kiryl Shutsemau Reviewed-by: Muchun Song Reviewed-by: David Hildenbrand (Arm) Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/mm/vmemmap_dedup.rst | 60 +++++++++++++----------------- 1 file changed, 26 insertions(+), 34 deletions(-) diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst index 1863d88d2dcb..9fa8642ded48 100644 --- a/Documentation/mm/vmemmap_dedup.rst +++ b/Documentation/mm/vmemmap_dedup.rst @@ -124,33 +124,35 @@ Here is how things look before optimization:: | | +-----------+ -The value of page->compound_info is the same for all tail pages. 
The first -page of ``struct page`` (page 0) associated with the HugeTLB page contains the 4 -``struct page`` necessary to describe the HugeTLB. The only use of the remaining -pages of ``struct page`` (page 1 to page 7) is to point to page->compound_info. -Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of ``struct page`` -will be used for each HugeTLB page. This will allow us to free the remaining -7 pages to the buddy allocator. +The first page of ``struct page`` (page 0) associated with the HugeTLB page +contains the 4 ``struct page`` necessary to describe the HugeTLB. The remaining +pages of ``struct page`` (page 1 to page 7) are tail pages. + +The optimization is only applied when the size of the struct page is a power +of 2. In this case, all tail pages of the same order are identical. See +compound_head(). This allows us to remap the tail pages of the vmemmap to a +shared, read-only page. The head page is also remapped to a new page. This +allows the original vmemmap pages to be freed. 
Here is how things look after remapping:: - HugeTLB struct pages(8 pages) page frame(8 pages) - +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ - | | | 0 | -------------> | 0 | - | | +-----------+ +-----------+ - | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ - | | +-----------+ | | | | | | - | | | 2 | -----------------+ | | | | | - | | +-----------+ | | | | | - | | | 3 | -------------------+ | | | | - | | +-----------+ | | | | - | | | 4 | ---------------------+ | | | - | PMD | +-----------+ | | | - | level | | 5 | -----------------------+ | | - | mapping | +-----------+ | | - | | | 6 | -------------------------+ | - | | +-----------+ | - | | | 7 | ---------------------------+ + HugeTLB struct pages(8 pages) page frame (new) + +-----------+ ---virt_to_page---> +-----------+ mapping to +----------------+ + | | | 0 | -------------> | 0 | + | | +-----------+ +----------------+ + | | | 1 | ------┐ + | | +-----------+ | + | | | 2 | ------┼ +----------------------------+ + | | +-----------+ | | A single, per-zone page | + | | | 3 | ------┼------> | frame shared among all | + | | +-----------+ | | hugepages of the same size | + | | | 4 | ------┼ +----------------------------+ + | | +-----------+ | + | | | 5 | ------┼ + | PMD | +-----------+ | + | level | | 6 | ------┼ + | mapping | +-----------+ | + | | | 7 | ------┘ | | +-----------+ | | | | @@ -172,16 +174,6 @@ The contiguous bit is used to increase the mapping size at the pmd and pte (last) level. So this type of HugeTLB page can be optimized only when its size of the ``struct page`` structs is greater than **1** page. -Notice: The head vmemmap page is not freed to the buddy allocator and all -tail vmemmap pages are mapped to the head vmemmap page frame. So we can see -more than one ``struct page`` struct with ``PG_head`` (e.g. 8 per 2 MB HugeTLB -page) associated with each HugeTLB page. The ``compound_head()`` can handle -this correctly. 
There is only **one** head ``struct page``, the tail -``struct page`` with ``PG_head`` are fake head ``struct page``. We need an -approach to distinguish between those two different types of ``struct page`` so -that ``compound_head()`` can return the real head ``struct page`` when the -parameter is the tail ``struct page`` but with ``PG_head``. - Device DAX ========== From 8231e4c040fb0ddd37d66093281d197fd5000297 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Fri, 27 Feb 2026 19:42:56 +0000 Subject: [PATCH 120/369] mm/slab: use compound_head() in page_slab() page_slab() contained an open-coded implementation of compound_head(). Replace the duplicated code with a direct call to compound_head(). Link: https://lkml.kernel.org/r/20260227194302.274384-19-kas@kernel.org Signed-off-by: Kiryl Shutsemau Acked-by: David Hildenbrand (Arm) Reviewed-by: Vlastimil Babka Cc: Albert Ou Cc: Alexandre Ghiti Cc: Baoquan He Cc: Christoph Lameter Cc: David Rientjes Cc: Frank van der Linden Cc: Harry Yoo Cc: Huacai Chen Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Roman Gushchin Cc: Usama Arif Cc: WANG Xuerui Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/slab.h | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index ccbdbed18c05..77242024e7d5 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -131,19 +131,7 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(struct freelist */ static inline struct slab *page_slab(const struct page *page) { - unsigned long info; - - info = READ_ONCE(page->compound_info); - if (info & 1) { - /* See compound_head() */ - if (compound_info_has_mask()) { - unsigned long p = (unsigned long)page; - page = (struct page *)(p & info); - } else { - page = (struct page *)(info - 1); - } - } - + page = compound_head(page); if 
(data_race(page->page_type >> 24) != PGTY_slab) page = NULL; From e7e1a26b8ddf0c8df117cc5b211a81ce635a0613 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 27 Feb 2026 09:06:20 -0800 Subject: [PATCH 121/369] mm/damon/core: set quota-score histogram with core filters Patch series "mm/damon/core: improve DAMOS quota efficiency for core layer filters". Improve two below problematic behaviors of DAMOS that makes it less efficient when core layer filters are used. DAMOS generates the under-quota regions prioritization-purpose access temperature histogram [1] with only the scheme target access pattern. The DAMOS filters are ignored on the histogram, and this can result in the scheme not applied to eligible regions. For working around this, users had to use separate DAMON contexts. The memory tiering approaches are such examples. DAMOS splits regions that intersect with address filters, so that only filtered-out part of the region is skipped. But, the implementation is skipping the other part of the region that is not filtered out, too. As a result, DAMOS can work slower than expected. Improve the two inefficient behaviors with two patches, respectively. Read the patches for more details about the problem and how those are fixed. This patch (of 2): The histogram for under-quota region prioritization [1] is made for all regions that are eligible for the DAMOS target access pattern. When there are DAMOS filters, the prioritization-threshold access temperature that generated from the histogram could be inaccurate. For example, suppose there are three regions. Each region is 1 GiB. The access temperature of the regions are 100, 50, and 0. And a DAMOS scheme that targets _any_ access temperature with quota 2 GiB is being used. The histogram will look like below: temperature size of regions having >=temperature temperature 0 3 GiB 50 2 GiB 100 1 GiB Based on the histogram and the quota (2 GiB), DAMOS applies the action to only the regions having >=50 temperature. 
This is all good. Let's suppose the region of temperature 50 is excluded by a DAMOS filter. Regardless of the filter, DAMOS will try to apply the action on only regions having >=50 temperature. Because the region of temperature 50 is filtered out, the action is applied to only the region of temperature 100. Worse yet, suppose the filter is excluding regions of temperature 50 and 100. Then no action is really applied to any region, while the region of temperature 0 is there. People used to work around this by utilizing multiple contexts, instead of the core layer DAMOS filters. For example, DAMON-based memory tiering approaches including the quota auto-tuning based one [2] are using a DAMON context per NUMA node. If the above explained issue is effectively alleviated, those can be configured again to run with single context and DAMOS filters for applying the promotion and demotion to only specific NUMA nodes. Alleviate the problem by checking core DAMOS filters when generating the histogram. The reason to check only core filters is the overhead. While core filters are usually for coarse-grained filtering (e.g., target/address filters for process, NUMA, zone level filtering), operation layer filters are usually for fine-grained filtering (e.g., for anon page). Doing this for operation layer filters would cause significant overhead. There is no known use case that is affected by the operation layer filters-distorted histogram problem, though. Do this for only core filters for now. We will revisit this for operation layer filters in future. We might be able to apply a sort of sampling based operation layer filtering. After this fix is applied, for the first case that there is a DAMOS filter excluding the region of temperature 50, the histogram will be like below: temperature size of regions having >=temperature temperature 0 2 GiB 100 1 GiB And DAMOS will set the temperature threshold as 0, allowing both regions of temperatures 0 and 100 be applied. 
For the second case that there is a DAMOS filter excluding the regions of temperature 50 and 100, the histogram will be like below: temperature size of regions having >=temperature temperature 0 1 GiB And DAMOS will set the temperature threshold as 0, allowing the region of temperature 0 be applied. [1] 'Prioritization' section of Documentation/mm/damon/design.rst [2] commit 0e1c773b501f ("mm/damon/core: introduce damos quota goal metrics for memory node utilization") Link: https://lkml.kernel.org/r/20260227170623.95384-1-sj@kernel.org Link: https://lkml.kernel.org/r/20260227170623.95384-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 0e5ada441b05..722dcb8fff7a 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2319,6 +2319,8 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) damon_for_each_region(r, t) { if (!__damos_valid_target(r, s)) continue; + if (damos_core_filter_out(c, t, r, s)) + continue; score = c->ops.get_scheme_score(c, r, s); c->regions_score_histogram[score] += damon_sz_region(r); From 1745ccbd2907db2bdaa843e4abccde4fdaccbe5d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 27 Feb 2026 09:06:21 -0800 Subject: [PATCH 122/369] mm/damon/core: do non-safe region walk on kdamond_apply_schemes() kdamond_apply_schemes() is using damon_for_each_region_safe(), which is safe for deallocation of the region inside the loop. However, the loop internal logic does not deallocate regions. Hence it is only wasting the next pointer. Also, it causes a problem. When an address filter is applied, and there is a region that intersects with the filter, the filter splits the region on the filter boundary. The intention is to let DAMOS apply action to only filtered-in address ranges. However, it is using damon_for_each_region_safe(), which sets the next region before the execution of the iteration. 
Hence, the region that was split and now will be next to the previous region, is simply ignored. As a result, DAMOS applies the action to target regions a bit slower than expected, when the address filter is used. Shouldn't be a big problem but definitely better to be fixed. damos_skip_charged_region() was working around the issue using a double pointer hack. Use damon_for_each_region(), which is safe for this use case. And drop the workaround in damos_skip_charged_region(). Link: https://lkml.kernel.org/r/20260227170623.95384-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 722dcb8fff7a..a97d8799c228 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1717,17 +1717,18 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_region *r, * This function checks if a given region should be skipped or not for the * reason. If only the starting part of the region has previously charged, * this function splits the region into two so that the second one covers the - * area that not charged in the previous charge widnow and saves the second - * region in *rp and returns false, so that the caller can apply DAMON action - * to the second one. + * area that is not charged in the previous charge window, and returns true. The + * caller can see the second one on the next iteration of the region walk. + * Note that this means the caller should use damon_for_each_region() instead + * of damon_for_each_region_safe(). If damon_for_each_region_safe() is used, + * the second region will just be ignored. * - * Return: true if the region should be entirely skipped, false otherwise. + * Return: true if the region should be skipped, false otherwise. 
*/ static bool damos_skip_charged_region(struct damon_target *t, - struct damon_region **rp, struct damos *s, + struct damon_region *r, struct damos *s, unsigned long min_region_sz) { - struct damon_region *r = *rp; struct damos_quota *quota = &s->quota; unsigned long sz_to_skip; @@ -1754,8 +1755,7 @@ static bool damos_skip_charged_region(struct damon_target *t, sz_to_skip = min_region_sz; } damon_split_region_at(t, r, sz_to_skip); - r = damon_next_region(r); - *rp = r; + return true; } quota->charge_target_from = NULL; quota->charge_addr_from = 0; @@ -2014,7 +2014,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (quota->esz && quota->charged_sz >= quota->esz) continue; - if (damos_skip_charged_region(t, &r, s, c->min_region_sz)) + if (damos_skip_charged_region(t, r, s, c->min_region_sz)) continue; if (s->max_nr_snapshots && @@ -2357,7 +2357,7 @@ static void damos_trace_stat(struct damon_ctx *c, struct damos *s) static void kdamond_apply_schemes(struct damon_ctx *c) { struct damon_target *t; - struct damon_region *r, *next_r; + struct damon_region *r; struct damos *s; unsigned long sample_interval = c->attrs.sample_interval ? c->attrs.sample_interval : 1; @@ -2383,7 +2383,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) if (c->ops.target_valid && c->ops.target_valid(t) == false) continue; - damon_for_each_region_safe(r, next_r, t) + damon_for_each_region(r, t) damon_do_apply_schemes(c, t, r); } From 22aa3321992eee0a39fb465e5083f5b8b5e7a82a Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Fri, 27 Feb 2026 20:05:01 +0530 Subject: [PATCH 123/369] khugepaged: remove redundant index check for pmd-folios Claim: folio_order(folio) == HPAGE_PMD_ORDER => folio->index == start. Proof: Both loops in hpage_collapse_scan_file and collapse_file, which iterate on the xarray, have the invariant that start <= folio->index < start + HPAGE_PMD_NR ... 
(i) A folio is always naturally aligned in the pagecache, therefore folio_order == HPAGE_PMD_ORDER => IS_ALIGNED(folio->index, HPAGE_PMD_NR) == true ... (ii) thp_vma_allowable_order -> thp_vma_suitable_order requires that the virtual offsets in the VMA are aligned to the order, => IS_ALIGNED(start, HPAGE_PMD_NR) == true ... (iii) Combining (i), (ii) and (iii), the claim is proven. Therefore, remove this check. While at it, simplify the comments. Link: https://lkml.kernel.org/r/20260227143501.1488110-1-dev.jain@arm.com Signed-off-by: Dev Jain Acked-by: David Hildenbrand (Arm) Reviewed-by: Lance Yang Reviewed-by: Baolin Wang Reviewed-by: Lorenzo Stoakes Reviewed-by: Anshuman Khandual Cc: Barry Song Cc: Liam Howlett Cc: Nico Pache Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 13b0fe50dfc5..ab97423fe837 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2023,9 +2023,7 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr, * we locked the first folio, then a THP might be there already. * This will be discovered on the first iteration. */ - if (folio_order(folio) == HPAGE_PMD_ORDER && - folio->index == start) { - /* Maybe PMD-mapped */ + if (folio_order(folio) == HPAGE_PMD_ORDER) { result = SCAN_PTE_MAPPED_HUGEPAGE; goto out_unlock; } @@ -2353,15 +2351,11 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, continue; } - if (folio_order(folio) == HPAGE_PMD_ORDER && - folio->index == start) { - /* Maybe PMD-mapped */ + if (folio_order(folio) == HPAGE_PMD_ORDER) { result = SCAN_PTE_MAPPED_HUGEPAGE; /* - * For SCAN_PTE_MAPPED_HUGEPAGE, further processing - * by the caller won't touch the page cache, and so - * it's safe to skip LRU and refcount checks before - * returning. + * PMD-sized THP implies that we can only try + * retracting the PTE table. 
*/ folio_put(folio); break; From 99573ef4ac30d4eae7a7937f0c9ea351991e3ccc Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 22:29:52 +0100 Subject: [PATCH 124/369] mm/pagewalk: drop FW_MIGRATION We removed the last user of FW_MIGRATION in commit 912aa825957f ("Revert "mm/ksm: convert break_ksm() from walk_page_range_vma() to folio_walk""). So let's remove FW_MIGRATION and assign FW_ZEROPAGE bit 0. Including leafops.h is no longer required. While at it, convert "expose_page" to "zeropage", as zeropages are now the only remaining use case for not exposing a page. Link: https://lkml.kernel.org/r/20260227212952.190691-1-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Cc: Lorenzo Stoakes Cc: "Liam R. Howlett" Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/pagewalk.h | 8 +------- mm/pagewalk.c | 40 ++++++++-------------------------------- 2 files changed, 9 insertions(+), 39 deletions(-) diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 88e18615dd72..b41d7265c01b 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -148,14 +148,8 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, typedef int __bitwise folio_walk_flags_t; -/* - * Walk migration entries as well. Careful: a large folio might get split - * concurrently. - */ -#define FW_MIGRATION ((__force folio_walk_flags_t)BIT(0)) - /* Walk shared zeropages (small + huge) as well. 
*/ -#define FW_ZEROPAGE ((__force folio_walk_flags_t)BIT(1)) +#define FW_ZEROPAGE ((__force folio_walk_flags_t)BIT(0)) enum folio_walk_level { FW_LEVEL_PTE, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index a94c401ab2cf..cb358558807c 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -5,7 +5,6 @@ #include #include #include -#include #include @@ -841,9 +840,6 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, * VM as documented by vm_normal_page(). If requested, zeropages will be * returned as well. * - * As default, this function only considers present page table entries. - * If requested, it will also consider migration entries. - * * If this function returns NULL it might either indicate "there is nothing" or * "there is nothing suitable". * @@ -854,11 +850,10 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, * that call. * * @fw->page will correspond to the page that is effectively referenced by - * @addr. However, for migration entries and shared zeropages @fw->page is - * set to NULL. Note that large folios might be mapped by multiple page table - * entries, and this function will always only lookup a single entry as - * specified by @addr, which might or might not cover more than a single page of - * the returned folio. + * @addr. However, for shared zeropages @fw->page is set to NULL. Note that + * large folios might be mapped by multiple page table entries, and this + * function will always only lookup a single entry as specified by @addr, which + * might or might not cover more than a single page of the returned folio. 
* * This function must *not* be used as a naive replacement for * get_user_pages() / pin_user_pages(), especially not to perform DMA or @@ -885,7 +880,7 @@ struct folio *folio_walk_start(struct folio_walk *fw, folio_walk_flags_t flags) { unsigned long entry_size; - bool expose_page = true; + bool zeropage = false; struct page *page; pud_t *pudp, pud; pmd_t *pmdp, pmd; @@ -933,10 +928,6 @@ struct folio *folio_walk_start(struct folio_walk *fw, if (page) goto found; } - /* - * TODO: FW_MIGRATION support for PUD migration entries - * once there are relevant users. - */ spin_unlock(ptl); goto not_found; } @@ -970,16 +961,9 @@ pmd_table: } else if ((flags & FW_ZEROPAGE) && is_huge_zero_pmd(pmd)) { page = pfn_to_page(pmd_pfn(pmd)); - expose_page = false; + zeropage = true; goto found; } - } else if ((flags & FW_MIGRATION) && - pmd_is_migration_entry(pmd)) { - const softleaf_t entry = softleaf_from_pmd(pmd); - - page = softleaf_to_page(entry); - expose_page = false; - goto found; } spin_unlock(ptl); goto not_found; @@ -1004,15 +988,7 @@ pte_table: if ((flags & FW_ZEROPAGE) && is_zero_pfn(pte_pfn(pte))) { page = pfn_to_page(pte_pfn(pte)); - expose_page = false; - goto found; - } - } else if (!pte_none(pte)) { - const softleaf_t entry = softleaf_from_pte(pte); - - if ((flags & FW_MIGRATION) && softleaf_is_migration(entry)) { - page = softleaf_to_page(entry); - expose_page = false; + zeropage = true; goto found; } } @@ -1021,7 +997,7 @@ not_found: vma_pgtable_walk_end(vma); return NULL; found: - if (expose_page) + if (!zeropage) /* Note: Offset from the mapped page, not the folio start. */ fw->page = page + ((addr & (entry_size - 1)) >> PAGE_SHIFT); else From 4d267106ab013aa7f21aeab0a5272ac952f31d22 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 27 Feb 2026 06:12:04 +0000 Subject: [PATCH 125/369] mm/debug_vm_pgtable: replace WRITE_ONCE() with pxd_clear() Replace WRITE_ONCE() with generic pxd_clear() to clear out the page table entries as required. 
Besides, this does not cause any functional change as well. Link: https://lkml.kernel.org/r/20260227061204.2215395-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Suggested-by: Ryan Roberts Acked-by: David Hildenbrand (Arm) Acked-by: SeongJae Park Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/debug_vm_pgtable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 83cf07269f13..23dc3ee09561 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -445,7 +445,7 @@ static void __init pmd_huge_tests(struct pgtable_debug_args *args) * X86 defined pmd_set_huge() verifies that the given * PMD is not a populated non-leaf entry. */ - WRITE_ONCE(*args->pmdp, __pmd(0)); + pmd_clear(args->pmdp); WARN_ON(!pmd_set_huge(args->pmdp, __pfn_to_phys(args->fixed_pmd_pfn), args->page_prot)); WARN_ON(!pmd_clear_huge(args->pmdp)); pmd = pmdp_get(args->pmdp); @@ -465,7 +465,7 @@ static void __init pud_huge_tests(struct pgtable_debug_args *args) * X86 defined pud_set_huge() verifies that the given * PUD is not a populated non-leaf entry. */ - WRITE_ONCE(*args->pudp, __pud(0)); + pud_clear(args->pudp); WARN_ON(!pud_set_huge(args->pudp, __pfn_to_phys(args->fixed_pud_pfn), args->page_prot)); WARN_ON(!pud_clear_huge(args->pudp)); pud = pudp_get(args->pudp); From 3d56d7317b271a1a5030ebb135c58aedc4c0fd36 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 27 Feb 2026 04:03:00 +0000 Subject: [PATCH 126/369] mm: replace READ_ONCE() in pud_trans_unstable() Replace READ_ONCE() with the existing standard page table accessor for PUD aka pudp_get() in pud_trans_unstable(). This does not create any functional change for platforms that do not override pudp_get(), which still defaults to READ_ONCE(). 
Link: https://lkml.kernel.org/r/20260227040300.2091901-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: David Hildenbrand (Arm) Acked-by: SeongJae Park Reviewed-by: Lorenzo Stoakes Cc: Mike Rapoport Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 776993d4567b..d2767a4c027b 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -2004,7 +2004,7 @@ static inline int pud_trans_unstable(pud_t *pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) - pud_t pudval = READ_ONCE(*pud); + pud_t pudval = pudp_get(pud); if (pud_none(pudval) || pud_trans_huge(pudval)) return 1; From 51d8c78be0c27ddb91bc2c0263941d8b30a47d3b Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Tue, 24 Feb 2026 18:53:16 +0530 Subject: [PATCH 127/369] mm/kasan: fix double free for kasan pXds kasan_free_pxd() assumes the page table is always struct page aligned. But that's not always the case for all architectures. E.g. In case of powerpc with 64K pagesize, PUD table (of size 4096) comes from slab cache named pgtable-2^9. Hence instead of page_to_virt(pxd_page()) let's just directly pass the start of the pxd table which is passed as the 1st argument. 
This fixes the below double free kasan issue seen with PMEM: radix-mmu: Mapped 0x0000047d10000000-0x0000047f90000000 with 2.00 MiB pages ================================================================== BUG: KASAN: double-free in kasan_remove_zero_shadow+0x9c4/0xa20 Free of addr c0000003c38e0000 by task ndctl/2164 CPU: 34 UID: 0 PID: 2164 Comm: ndctl Not tainted 6.19.0-rc1-00048-gea1013c15392 #157 VOLUNTARY Hardware name: IBM,9080-HEX POWER10 (architected) 0x800200 0xf000006 of:IBM,FW1060.00 (NH1060_012) hv:phyp pSeries Call Trace: dump_stack_lvl+0x88/0xc4 (unreliable) print_report+0x214/0x63c kasan_report_invalid_free+0xe4/0x110 check_slab_allocation+0x100/0x150 kmem_cache_free+0x128/0x6e0 kasan_remove_zero_shadow+0x9c4/0xa20 memunmap_pages+0x2b8/0x5c0 devm_action_release+0x54/0x70 release_nodes+0xc8/0x1a0 devres_release_all+0xe0/0x140 device_unbind_cleanup+0x30/0x120 device_release_driver_internal+0x3e4/0x450 unbind_store+0xfc/0x110 drv_attr_store+0x78/0xb0 sysfs_kf_write+0x114/0x140 kernfs_fop_write_iter+0x264/0x3f0 vfs_write+0x3bc/0x7d0 ksys_write+0xa4/0x190 system_call_exception+0x190/0x480 system_call_vectored_common+0x15c/0x2ec ---- interrupt: 3000 at 0x7fff93b3d3f4 NIP: 00007fff93b3d3f4 LR: 00007fff93b3d3f4 CTR: 0000000000000000 REGS: c0000003f1b07e80 TRAP: 3000 Not tainted (6.19.0-rc1-00048-gea1013c15392) MSR: 800000000280f033 CR: 48888208 XER: 00000000 <...> NIP [00007fff93b3d3f4] 0x7fff93b3d3f4 LR [00007fff93b3d3f4] 0x7fff93b3d3f4 ---- interrupt: 3000 The buggy address belongs to the object at c0000003c38e0000 which belongs to the cache pgtable-2^9 of size 4096 The buggy address is located 0 bytes inside of 4096-byte region [c0000003c38e0000, c0000003c38e1000) The buggy address belongs to the physical page: page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x3c38c head: order:2 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 memcg:c0000003bfd63e01 flags: 0x63ffff800000040(head|node=6|zone=0|lastcpupid=0x7ffff) page_type: 
f5(slab) raw: 063ffff800000040 c000000140058980 5deadbeef0000122 0000000000000000 raw: 0000000000000000 0000000080200020 00000000f5000000 c0000003bfd63e01 head: 063ffff800000040 c000000140058980 5deadbeef0000122 0000000000000000 head: 0000000000000000 0000000080200020 00000000f5000000 c0000003bfd63e01 head: 063ffff800000002 c00c000000f0e301 00000000ffffffff 00000000ffffffff head: ffffffffffffffff 0000000000000000 00000000ffffffff 0000000000000004 page dumped because: kasan: bad access detected [ 138.953636] [ T2164] Memory state around the buggy address: [ 138.953643] [ T2164] c0000003c38dff00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 138.953652] [ T2164] c0000003c38dff80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 138.953661] [ T2164] >c0000003c38e0000: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 138.953669] [ T2164] ^ [ 138.953675] [ T2164] c0000003c38e0080: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 138.953684] [ T2164] c0000003c38e0100: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 138.953692] [ T2164] ================================================================== [ 138.953701] [ T2164] Disabling lock debugging due to kernel taint Link: https://lkml.kernel.org/r/2f9135c7866c6e0d06e960993b8a5674a9ebc7ec.1771938394.git.ritesh.list@gmail.com Fixes: 0207df4fa1a8 ("kernel/memremap, kasan: make ZONE_DEVICE with work with KASAN") Signed-off-by: Ritesh Harjani (IBM) Reported-by: Venkat Rao Bagalkote Reviewed-by: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: "Ritesh Harjani (IBM)" Cc: Vincenzo Frascino Cc: Signed-off-by: Andrew Morton --- mm/kasan/init.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/kasan/init.c b/mm/kasan/init.c index f084e7a5df1e..9c880f607c6a 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -292,7 +292,7 @@ static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd) return; } - pte_free_kernel(&init_mm, (pte_t *)page_to_virt(pmd_page(*pmd))); + 
pte_free_kernel(&init_mm, pte_start); pmd_clear(pmd); } @@ -307,7 +307,7 @@ static void kasan_free_pmd(pmd_t *pmd_start, pud_t *pud) return; } - pmd_free(&init_mm, (pmd_t *)page_to_virt(pud_page(*pud))); + pmd_free(&init_mm, pmd_start); pud_clear(pud); } @@ -322,7 +322,7 @@ static void kasan_free_pud(pud_t *pud_start, p4d_t *p4d) return; } - pud_free(&init_mm, (pud_t *)page_to_virt(p4d_page(*p4d))); + pud_free(&init_mm, pud_start); p4d_clear(p4d); } @@ -337,7 +337,7 @@ static void kasan_free_p4d(p4d_t *p4d_start, pgd_t *pgd) return; } - p4d_free(&init_mm, (p4d_t *)page_to_virt(pgd_page(*pgd))); + p4d_free(&init_mm, p4d_start); pgd_clear(pgd); } From b1029f29eb1d5fbf07fa8db9b5e7ab6d9813ad67 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 28 Feb 2026 14:28:25 -0800 Subject: [PATCH 128/369] mm/damon/core: split regions for min_nr_regions Patch series "mm/damon: strictly respect min_nr_regions". DAMON core respects min_nr_regions only at merge operation. DAMON API callers are therefore responsible to respect or ignore that. Only vaddr ops is respecting that, but only for initial start time. DAMON sysfs interface allows users to setup the initial regions that DAMON core also respects. But, again, it works for only the initial time. Users setting the regions for min_nr_regions can be difficult and inefficient, when the min_nr_regions value is high. There was actually a report [1] from a user. The use case was page granular access monitoring with a large aggregation interval. Make the following three changes for resolving the issue. First (patch 1), make DAMON core split regions at the beginning and every aggregation interval, to respect the min_nr_regions. Second (patch 2), drop the vaddr's split operations and related code that are no more needed. Third (patch 3), add a kunit test for the newly introduced function. 
This patch (of 3): DAMON core layer respects the min_nr_regions parameter by setting the maximum size of each region as total monitoring region size divided by the parameter value. And the limit is applied by preventing merge of regions that result in a region larger than the maximum size. The limit is updated per ops update interval, because vaddr updates the monitoring regions on the ops update callback. It does nothing for the beginning state. That's because the users can set the initial monitoring regions as they want. That is, if the users really care about the min_nr_regions, they are supposed to set the initial monitoring regions to have more than min_nr_regions regions. The virtual address space operation set, vaddr, has an exceptional case. Users can ask the ops set to configure the initial regions on its own. For the case, vaddr sets up the initial regions to meet the min_nr_regions. So, vaddr has exceptional support, but basically users are required to set the regions on their own if they want min_nr_regions to be respected. When 'min_nr_regions' is high, such initial setup is difficult. If DAMON sysfs interface is used for that, the memory for saving the initial setup is also a waste. Even if the user forgives the setup, DAMON will eventually make more than min_nr_regions regions by splitting operations. But it will take time. If the aggregation interval is long, the delay could be problematic. There was actually a report [1] of the case. The reporter wanted to do page granular monitoring with a large aggregation interval. Also, DAMON is doing nothing for online changes on monitoring regions and min_nr_regions. For example, the user can remove a monitoring region or increase min_nr_regions while DAMON is running. Split regions larger than the size at the beginning of the kdamond main loop, to fix the initial setup issue. Also do the split every aggregation interval, for online changes. This means the behavior is slightly changed. 
It is difficult to imagine a use case that actually depends on the old behavior, though. So this change is arguably fine. Note that the size limit is aligned by damon_ctx->min_region_sz and cannot be zero. That is, if min_nr_regions is larger than the total size of monitoring regions divided by ->min_region_sz, that cannot be respected. Link: https://lkml.kernel.org/r/20260228222831.7232-1-sj@kernel.org Link: https://lkml.kernel.org/r/20260228222831.7232-2-sj@kernel.org Link: https://lore.kernel.org/CAC5umyjmJE9SBqjbetZZecpY54bHpn2AvCGNv3aF6J=1cfoPXQ@mail.gmail.com [1] Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- mm/damon/core.c | 45 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index a97d8799c228..71ccea40368d 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1316,6 +1316,40 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) return sz; } +static void damon_split_region_at(struct damon_target *t, + struct damon_region *r, unsigned long sz_r); + +/* + * damon_apply_min_nr_regions() - Make effect of min_nr_regions parameter. + * @ctx: monitoring context. + * + * This function implements min_nr_regions (minimum number of damon_region + * objects in the given monitoring context) behavior. It first calculates + * maximum size of each region for enforcing the min_nr_regions as total size + * of the regions divided by the min_nr_regions. After that, this function + * splits regions to ensure all regions are equal to or smaller than the size + * limit. Finally, this function returns the maximum size limit. + * + * Returns: maximum size of each region for convincing min_nr_regions. 
+ */ +static unsigned long damon_apply_min_nr_regions(struct damon_ctx *ctx) +{ + unsigned long max_region_sz = damon_region_sz_limit(ctx); + struct damon_target *t; + struct damon_region *r, *next; + + max_region_sz = ALIGN(max_region_sz, ctx->min_region_sz); + damon_for_each_target(t, ctx) { + damon_for_each_region_safe(r, next, t) { + while (damon_sz_region(r) > max_region_sz) { + damon_split_region_at(t, r, max_region_sz); + r = damon_next_region(r); + } + } + } + return max_region_sz; +} + static int kdamond_fn(void *data); /* @@ -1672,9 +1706,6 @@ static void kdamond_tune_intervals(struct damon_ctx *c) damon_set_attrs(c, &new_attrs); } -static void damon_split_region_at(struct damon_target *t, - struct damon_region *r, unsigned long sz_r); - static bool __damos_valid_target(struct damon_region *r, struct damos *s) { unsigned long sz; @@ -2763,7 +2794,7 @@ static int kdamond_fn(void *data) if (!ctx->regions_score_histogram) goto done; - sz_limit = damon_region_sz_limit(ctx); + sz_limit = damon_apply_min_nr_regions(ctx); while (!kdamond_need_stop(ctx)) { /* @@ -2788,10 +2819,13 @@ static int kdamond_fn(void *data) if (ctx->ops.check_accesses) max_nr_accesses = ctx->ops.check_accesses(ctx); - if (ctx->passed_sample_intervals >= next_aggregation_sis) + if (ctx->passed_sample_intervals >= next_aggregation_sis) { kdamond_merge_regions(ctx, max_nr_accesses / 10, sz_limit); + /* online updates might be made */ + sz_limit = damon_apply_min_nr_regions(ctx); + } /* * do kdamond_call() and kdamond_apply_schemes() after @@ -2850,7 +2884,6 @@ static int kdamond_fn(void *data) sample_interval; if (ctx->ops.update) ctx->ops.update(ctx); - sz_limit = damon_region_sz_limit(ctx); } } done: From 442d87c7db9e9e2a569a49d38f404b8b556b8719 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 28 Feb 2026 14:28:26 -0800 Subject: [PATCH 129/369] mm/damon/vaddr: do not split regions for min_nr_regions The previous commit made DAMON core split regions at the beginning for 
min_nr_regions. The virtual address space operation set (vaddr) does similar work on its own, for a case user delegates entire initial monitoring regions setup to vaddr. It is unnecessary now, as DAMON core will do similar work for any case. Remove the duplicated work in vaddr. Also, remove a helper function that was being used only for the work, and the test code of the helper function. Link: https://lkml.kernel.org/r/20260228222831.7232-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- mm/damon/tests/vaddr-kunit.h | 76 ------------------------------------ mm/damon/vaddr.c | 70 +-------------------------------- 2 files changed, 2 insertions(+), 144 deletions(-) diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index cfae870178bf..98e734d77d51 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -252,88 +252,12 @@ static void damon_test_apply_three_regions4(struct kunit *test) new_three_regions, expected, ARRAY_SIZE(expected)); } -static void damon_test_split_evenly_fail(struct kunit *test, - unsigned long start, unsigned long end, unsigned int nr_pieces) -{ - struct damon_target *t = damon_new_target(); - struct damon_region *r; - - if (!t) - kunit_skip(test, "target alloc fail"); - - r = damon_new_region(start, end); - if (!r) { - damon_free_target(t); - kunit_skip(test, "region alloc fail"); - } - - damon_add_region(r, t); - KUNIT_EXPECT_EQ(test, - damon_va_evenly_split_region(t, r, nr_pieces), -EINVAL); - KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1u); - - damon_for_each_region(r, t) { - KUNIT_EXPECT_EQ(test, r->ar.start, start); - KUNIT_EXPECT_EQ(test, r->ar.end, end); - } - - damon_free_target(t); -} - -static void damon_test_split_evenly_succ(struct kunit *test, - unsigned long start, unsigned long end, unsigned int nr_pieces) -{ - struct damon_target *t = damon_new_target(); - struct damon_region *r; - unsigned long expected_width = (end - start) 
/ nr_pieces; - unsigned long i = 0; - - if (!t) - kunit_skip(test, "target alloc fail"); - r = damon_new_region(start, end); - if (!r) { - damon_free_target(t); - kunit_skip(test, "region alloc fail"); - } - damon_add_region(r, t); - KUNIT_EXPECT_EQ(test, - damon_va_evenly_split_region(t, r, nr_pieces), 0); - KUNIT_EXPECT_EQ(test, damon_nr_regions(t), nr_pieces); - - damon_for_each_region(r, t) { - if (i == nr_pieces - 1) { - KUNIT_EXPECT_EQ(test, - r->ar.start, start + i * expected_width); - KUNIT_EXPECT_EQ(test, r->ar.end, end); - break; - } - KUNIT_EXPECT_EQ(test, - r->ar.start, start + i++ * expected_width); - KUNIT_EXPECT_EQ(test, r->ar.end, start + i * expected_width); - } - damon_free_target(t); -} - -static void damon_test_split_evenly(struct kunit *test) -{ - KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5), - -EINVAL); - - damon_test_split_evenly_fail(test, 0, 100, 0); - damon_test_split_evenly_succ(test, 0, 100, 10); - damon_test_split_evenly_succ(test, 5, 59, 5); - damon_test_split_evenly_succ(test, 4, 6, 1); - damon_test_split_evenly_succ(test, 0, 3, 2); - damon_test_split_evenly_fail(test, 5, 6, 2); -} - static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_three_regions_in_vmas), KUNIT_CASE(damon_test_apply_three_regions1), KUNIT_CASE(damon_test_apply_three_regions2), KUNIT_CASE(damon_test_apply_three_regions3), KUNIT_CASE(damon_test_apply_three_regions4), - KUNIT_CASE(damon_test_split_evenly), {}, }; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 4d6d8251d419..b069dbc7e3d2 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -53,52 +53,6 @@ static struct mm_struct *damon_get_mm(struct damon_target *t) return mm; } -/* - * Functions for the initial monitoring target regions construction - */ - -/* - * Size-evenly split a region into 'nr_pieces' small regions - * - * Returns 0 on success, or negative error code otherwise. 
- */ -static int damon_va_evenly_split_region(struct damon_target *t, - struct damon_region *r, unsigned int nr_pieces) -{ - unsigned long sz_orig, sz_piece, orig_end; - struct damon_region *n = NULL, *next; - unsigned long start; - unsigned int i; - - if (!r || !nr_pieces) - return -EINVAL; - - if (nr_pieces == 1) - return 0; - - orig_end = r->ar.end; - sz_orig = damon_sz_region(r); - sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION_SZ); - - if (!sz_piece) - return -EINVAL; - - r->ar.end = r->ar.start + sz_piece; - next = damon_next_region(r); - for (start = r->ar.end, i = 1; i < nr_pieces; start += sz_piece, i++) { - n = damon_new_region(start, start + sz_piece); - if (!n) - return -ENOMEM; - damon_insert_region(n, r, next, t); - r = n; - } - /* complement last region for possible rounding error */ - if (n) - n->ar.end = orig_end; - - return 0; -} - static unsigned long sz_range(struct damon_addr_range *r) { return r->end - r->start; @@ -240,10 +194,8 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, struct damon_target *t) { struct damon_target *ti; - struct damon_region *r; struct damon_addr_range regions[3]; - unsigned long sz = 0, nr_pieces; - int i, tidx = 0; + int tidx = 0; if (damon_va_three_regions(t, regions)) { damon_for_each_target(ti, ctx) { @@ -255,25 +207,7 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, return; } - for (i = 0; i < 3; i++) - sz += regions[i].end - regions[i].start; - if (ctx->attrs.min_nr_regions) - sz /= ctx->attrs.min_nr_regions; - if (sz < DAMON_MIN_REGION_SZ) - sz = DAMON_MIN_REGION_SZ; - - /* Set the initial three regions of the target */ - for (i = 0; i < 3; i++) { - r = damon_new_region(regions[i].start, regions[i].end); - if (!r) { - pr_err("%d'th init region creation failed\n", i); - return; - } - damon_add_region(r, t); - - nr_pieces = (regions[i].end - regions[i].start) / sz; - damon_va_evenly_split_region(t, r, nr_pieces); - } + damon_set_regions(t, regions, 3, DAMON_MIN_REGION_SZ); } 
/* Initialize '->regions_list' of every target (task) */ From ca6969e074dc1d2a3ac9f7e00de75769eb3cde64 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 28 Feb 2026 14:28:27 -0800 Subject: [PATCH 130/369] mm/damon/test/core-kunit: add damon_apply_min_nr_regions() test Add a kunit test for the functionality of damon_apply_min_nr_regions(). Link: https://lkml.kernel.org/r/20260228222831.7232-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 52 +++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 596f33ec2d81..fcc1336b234c 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -1239,6 +1239,57 @@ static void damon_test_set_filters_default_reject(struct kunit *test) damos_free_filter(target_filter); } +static void damon_test_apply_min_nr_regions_for(struct kunit *test, + unsigned long sz_regions, unsigned long min_region_sz, + unsigned long min_nr_regions, + unsigned long max_region_sz_expect, + unsigned long nr_regions_expect) +{ + struct damon_ctx *ctx; + struct damon_target *t; + struct damon_region *r; + unsigned long max_region_size; + + ctx = damon_new_ctx(); + if (!ctx) + kunit_skip(test, "ctx alloc fail\n"); + t = damon_new_target(); + if (!t) { + damon_destroy_ctx(ctx); + kunit_skip(test, "target alloc fail\n"); + } + damon_add_target(ctx, t); + r = damon_new_region(0, sz_regions); + if (!r) { + damon_destroy_ctx(ctx); + kunit_skip(test, "region alloc fail\n"); + } + damon_add_region(r, t); + + ctx->min_region_sz = min_region_sz; + ctx->attrs.min_nr_regions = min_nr_regions; + max_region_size = damon_apply_min_nr_regions(ctx); + + KUNIT_EXPECT_EQ(test, max_region_size, max_region_sz_expect); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), nr_regions_expect); + + damon_destroy_ctx(ctx); +} + +static void damon_test_apply_min_nr_regions(struct kunit 
*test) +{ + /* common, expected setup */ + damon_test_apply_min_nr_regions_for(test, 10, 1, 10, 1, 10); + /* no zero size limit */ + damon_test_apply_min_nr_regions_for(test, 10, 1, 15, 1, 10); + /* max size should be aligned by min_region_sz */ + damon_test_apply_min_nr_regions_for(test, 10, 2, 2, 6, 2); + /* + * when min_nr_regions and min_region_sz conflicts, min_region_sz wins. + */ + damon_test_apply_min_nr_regions_for(test, 10, 2, 10, 2, 5); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -1265,6 +1316,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damos_test_filter_out), KUNIT_CASE(damon_test_feed_loop_next_input), KUNIT_CASE(damon_test_set_filters_default_reject), + KUNIT_CASE(damon_test_apply_min_nr_regions), {}, }; From a373f371166df56eb3ec043d72dafc70a7d46536 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 27 Feb 2026 18:07:58 +0100 Subject: [PATCH 131/369] mm/page_alloc: effectively disable pcp with CONFIG_SMP=n Patch series "mm/page_alloc: pcp locking cleanup". This is a followup to the hotfix 038a102535eb ("mm/page_alloc: prevent pcp corruption with SMP=n"), to simplify the code and deal with the original issue properly. The previous RFC attempt [1] argued for changing the UP spinlock implementation, which was discouraged, but thanks to David's off-list suggestion, we can achieve the goal without changing the spinlock implementation. The main change in Patch 1 relies on the fact that on UP we don't need the pcp lists for scalability, so just make them always bypassed during alloc/free by making the pcp trylock an unconditional failure. The various drain paths that use pcp_spin_lock_maybe_irqsave() continue to exist but will never do any work in practice. In Patch 2 we can again remove the irq saving from them that commit 038a102535eb added. 
Besides simpler code with all the ugly UP_flags removed, we get less bloat with CONFIG_SMP=n for mm/page_alloc.o as a result: add/remove: 25/28 grow/shrink: 4/5 up/down: 2105/-6665 (-4560) Function old new delta get_page_from_freelist 5689 7248 +1559 free_unref_folios 2006 2324 +318 make_alloc_exact 270 286 +16 __zone_watermark_ok 306 322 +16 drain_pages_zone.isra 119 109 -10 decay_pcp_high 181 149 -32 setup_pcp_cacheinfo 193 147 -46 __free_frozen_pages 1339 1089 -250 alloc_pages_bulk_noprof 1054 419 -635 free_frozen_page_commit 907 - -907 try_to_claim_block 1975 - -1975 __rmqueue_pcplist 2614 - -2614 Total: Before=54624, After=50064, chg -8.35% This patch (of 3): The page allocator has been using a locking scheme for its percpu page caches (pcp) based on spin_trylock() with no _irqsave() part. The trick is that if we interrupt the locked section, we fail the trylock and just fallback to the slowpath taking the zone lock. That's more expensive, but rare, so we don't need to pay the irqsave/restore cost all the time in the fastpaths. It's similar to but not exactly local_trylock_t (which is also newer anyway) because in some cases we do lock the pcp of a non-local cpu to drain it, in a way that's cheaper than using IPI or queue_work_on(). The complication of this scheme has been UP non-debug spinlock implementation which assumes spin_trylock() can't fail on UP and has no state to track whether it's locked. It just doesn't anticipate this usage scenario. So to work around that we disable IRQs only on UP, complicating the implementation. Also recently we found years old bug in where we didn't disable IRQs in related paths - see 038a102535eb ("mm/page_alloc: prevent pcp corruption with SMP=n"). We can avoid this UP complication by realizing that we do not need the pcp caching for scalability on UP in the first place. Removing it completely with #ifdefs is not worth the trouble either. Just make pcp_spin_trylock() return NULL unconditionally on CONFIG_SMP=n. 
This makes the slowpaths unconditional, and we can remove the IRQ save/restore handling in pcp_spin_trylock()/unlock() completely. Link: https://lkml.kernel.org/r/20260227-b4-pcp-locking-cleanup-v1-0-f7e22e603447@kernel.org Link: https://lkml.kernel.org/r/20260227-b4-pcp-locking-cleanup-v1-1-f7e22e603447@kernel.org Link: https://lore.kernel.org/all/d762c46b-36f0-471a-b5b4-23c8cf5628ae@suse.cz/ [1] Signed-off-by: Vlastimil Babka (SUSE) Suggested-by: David Hildenbrand (Arm) Acked-by: Johannes Weiner Cc: Brendan Jackman Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 92 ++++++++++++++++++------------------------------- 1 file changed, 34 insertions(+), 58 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f4f9a98bb425..7fa2d0f10460 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -94,23 +94,6 @@ typedef int __bitwise fpi_t; static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) -#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) -/* - * On SMP, spin_trylock is sufficient protection. - * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP. - * Pass flags to a no-op inline function to typecheck and silence the unused - * variable warning. - */ -static inline void __pcp_trylock_noop(unsigned long *flags) { } -#define pcp_trylock_prepare(flags) __pcp_trylock_noop(&(flags)) -#define pcp_trylock_finish(flags) __pcp_trylock_noop(&(flags)) -#else - -/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */ -#define pcp_trylock_prepare(flags) local_irq_save(flags) -#define pcp_trylock_finish(flags) local_irq_restore(flags) -#endif - /* * Locking a pcp requires a PCP lookup followed by a spinlock. 
To avoid * a migration causing the wrong PCP to be locked and remote memory being @@ -149,31 +132,28 @@ static inline void __pcp_trylock_noop(unsigned long *flags) { } pcpu_task_unpin(); \ }) -/* struct per_cpu_pages specific helpers. */ -#define pcp_spin_trylock(ptr, UP_flags) \ -({ \ - struct per_cpu_pages *__ret; \ - pcp_trylock_prepare(UP_flags); \ - __ret = pcpu_spin_trylock(struct per_cpu_pages, lock, ptr); \ - if (!__ret) \ - pcp_trylock_finish(UP_flags); \ - __ret; \ -}) +/* struct per_cpu_pages specific helpers.*/ +#ifdef CONFIG_SMP +#define pcp_spin_trylock(ptr) \ + pcpu_spin_trylock(struct per_cpu_pages, lock, ptr) -#define pcp_spin_unlock(ptr, UP_flags) \ -({ \ - pcpu_spin_unlock(lock, ptr); \ - pcp_trylock_finish(UP_flags); \ -}) +#define pcp_spin_unlock(ptr) \ + pcpu_spin_unlock(lock, ptr) /* - * With the UP spinlock implementation, when we spin_lock(&pcp->lock) (for i.e. - * a potentially remote cpu drain) and get interrupted by an operation that - * attempts pcp_spin_trylock(), we can't rely on the trylock failure due to UP - * spinlock assumptions making the trylock a no-op. So we have to turn that - * spin_lock() to a spin_lock_irqsave(). This works because on UP there are no - * remote cpu's so we can only be locking the only existing local one. + * On CONFIG_SMP=n the UP implementation of spin_trylock() never fails and thus + * is not compatible with our locking scheme. However we do not need pcp for + * scalability in the first place, so just make all the trylocks fail and take + * the slow path unconditionally. 
*/ +#else +#define pcp_spin_trylock(ptr) \ + NULL + +#define pcp_spin_unlock(ptr) \ + BUG_ON(1) +#endif + #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) static inline void __flags_noop(unsigned long *flags) { } #define pcp_spin_lock_maybe_irqsave(ptr, flags) \ @@ -2858,7 +2838,7 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, */ static bool free_frozen_page_commit(struct zone *zone, struct per_cpu_pages *pcp, struct page *page, int migratetype, - unsigned int order, fpi_t fpi_flags, unsigned long *UP_flags) + unsigned int order, fpi_t fpi_flags) { int high, batch; int to_free, to_free_batched; @@ -2918,9 +2898,9 @@ static bool free_frozen_page_commit(struct zone *zone, if (to_free == 0 || pcp->count == 0) break; - pcp_spin_unlock(pcp, *UP_flags); + pcp_spin_unlock(pcp); - pcp = pcp_spin_trylock(zone->per_cpu_pageset, *UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (!pcp) { ret = false; break; @@ -2932,7 +2912,7 @@ static bool free_frozen_page_commit(struct zone *zone, * returned in an unlocked state. 
*/ if (smp_processor_id() != cpu) { - pcp_spin_unlock(pcp, *UP_flags); + pcp_spin_unlock(pcp); ret = false; break; } @@ -2964,7 +2944,6 @@ static bool free_frozen_page_commit(struct zone *zone, static void __free_frozen_pages(struct page *page, unsigned int order, fpi_t fpi_flags) { - unsigned long UP_flags; struct per_cpu_pages *pcp; struct zone *zone; unsigned long pfn = page_to_pfn(page); @@ -3000,12 +2979,12 @@ static void __free_frozen_pages(struct page *page, unsigned int order, add_page_to_zone_llist(zone, page, order); return; } - pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { if (!free_frozen_page_commit(zone, pcp, page, migratetype, - order, fpi_flags, &UP_flags)) + order, fpi_flags)) return; - pcp_spin_unlock(pcp, UP_flags); + pcp_spin_unlock(pcp); } else { free_one_page(zone, page, pfn, order, fpi_flags); } @@ -3026,7 +3005,6 @@ void free_frozen_pages_nolock(struct page *page, unsigned int order) */ void free_unref_folios(struct folio_batch *folios) { - unsigned long UP_flags; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; int i, j; @@ -3069,7 +3047,7 @@ void free_unref_folios(struct folio_batch *folios) if (zone != locked_zone || is_migrate_isolate(migratetype)) { if (pcp) { - pcp_spin_unlock(pcp, UP_flags); + pcp_spin_unlock(pcp); locked_zone = NULL; pcp = NULL; } @@ -3088,7 +3066,7 @@ void free_unref_folios(struct folio_batch *folios) * trylock is necessary as folios may be getting freed * from IRQ or SoftIRQ context after an IO completion. 
*/ - pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (unlikely(!pcp)) { free_one_page(zone, &folio->page, pfn, order, FPI_NONE); @@ -3106,14 +3084,14 @@ void free_unref_folios(struct folio_batch *folios) trace_mm_page_free_batched(&folio->page); if (!free_frozen_page_commit(zone, pcp, &folio->page, - migratetype, order, FPI_NONE, &UP_flags)) { + migratetype, order, FPI_NONE)) { pcp = NULL; locked_zone = NULL; } } if (pcp) - pcp_spin_unlock(pcp, UP_flags); + pcp_spin_unlock(pcp); folio_batch_reinit(folios); } @@ -3371,10 +3349,9 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct per_cpu_pages *pcp; struct list_head *list; struct page *page; - unsigned long UP_flags; /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ - pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (!pcp) return NULL; @@ -3386,7 +3363,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, pcp->free_count >>= 1; list = &pcp->lists[order_to_pindex(migratetype, order)]; page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); - pcp_spin_unlock(pcp, UP_flags); + pcp_spin_unlock(pcp); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, 1); @@ -5067,7 +5044,6 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, struct page **page_array) { struct page *page; - unsigned long UP_flags; struct zone *zone; struct zoneref *z; struct per_cpu_pages *pcp; @@ -5161,7 +5137,7 @@ retry_this_zone: goto failed; /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. 
*/ - pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (!pcp) goto failed; @@ -5180,7 +5156,7 @@ retry_this_zone: if (unlikely(!page)) { /* Try and allocate at least one page */ if (!nr_account) { - pcp_spin_unlock(pcp, UP_flags); + pcp_spin_unlock(pcp); goto failed; } break; @@ -5192,7 +5168,7 @@ retry_this_zone: page_array[nr_populated++] = page; } - pcp_spin_unlock(pcp, UP_flags); + pcp_spin_unlock(pcp); __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account); From 0a2c52a9a2f55e80fee6a10a846cadab590dc918 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 27 Feb 2026 18:07:59 +0100 Subject: [PATCH 132/369] mm/page_alloc: remove IRQ saving/restoring from pcp locking Effectively revert commit 038a102535eb ("mm/page_alloc: prevent pcp corruption with SMP=n"). The original problem is now avoided by pcp_spin_trylock() always failing on CONFIG_SMP=n, so we do not need to disable IRQs anymore. It's not a complete revert, because keeping the pcp_spin_(un)lock() wrappers is useful. Rename them from _maybe_irqsave/restore to _nopin. The difference from pcp_spin_trylock()/pcp_spin_unlock() is that the _nopin variants don't perform pcpu_task_pin/unpin(). 
Link: https://lkml.kernel.org/r/20260227-b4-pcp-locking-cleanup-v1-2-f7e22e603447@kernel.org Signed-off-by: Vlastimil Babka (SUSE) Acked-by: Johannes Weiner Cc: Brendan Jackman Cc: David Hildenbrand (Arm) Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 46 ++++++++++++++++------------------------------ 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7fa2d0f10460..be367516c59b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -154,24 +154,14 @@ static DEFINE_MUTEX(pcp_batch_high_lock); BUG_ON(1) #endif -#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) -static inline void __flags_noop(unsigned long *flags) { } -#define pcp_spin_lock_maybe_irqsave(ptr, flags) \ -({ \ - __flags_noop(&(flags)); \ - spin_lock(&(ptr)->lock); \ -}) -#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \ -({ \ - spin_unlock(&(ptr)->lock); \ - __flags_noop(&(flags)); \ -}) -#else -#define pcp_spin_lock_maybe_irqsave(ptr, flags) \ - spin_lock_irqsave(&(ptr)->lock, flags) -#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \ - spin_unlock_irqrestore(&(ptr)->lock, flags) -#endif +/* + * In some cases we do not need to pin the task to the CPU because we are + * already given a specific cpu's pcp pointer. 
+ */ +#define pcp_spin_lock_nopin(ptr) \ + spin_lock(&(ptr)->lock) +#define pcp_spin_unlock_nopin(ptr) \ + spin_unlock(&(ptr)->lock) #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); @@ -2568,7 +2558,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) { int high_min, to_drain, to_drain_batched, batch; - unsigned long UP_flags; bool todo = false; high_min = READ_ONCE(pcp->high_min); @@ -2588,9 +2577,9 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) to_drain = pcp->count - pcp->high; while (to_drain > 0) { to_drain_batched = min(to_drain, batch); - pcp_spin_lock_maybe_irqsave(pcp, UP_flags); + pcp_spin_lock_nopin(pcp); free_pcppages_bulk(zone, to_drain_batched, pcp, 0); - pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); + pcp_spin_unlock_nopin(pcp); todo = true; to_drain -= to_drain_batched; @@ -2607,15 +2596,14 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) */ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { - unsigned long UP_flags; int to_drain, batch; batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); if (to_drain > 0) { - pcp_spin_lock_maybe_irqsave(pcp, UP_flags); + pcp_spin_lock_nopin(pcp); free_pcppages_bulk(zone, to_drain, pcp, 0); - pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); + pcp_spin_unlock_nopin(pcp); } } #endif @@ -2626,11 +2614,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) static void drain_pages_zone(unsigned int cpu, struct zone *zone) { struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - unsigned long UP_flags; int count; do { - pcp_spin_lock_maybe_irqsave(pcp, UP_flags); + pcp_spin_lock_nopin(pcp); count = pcp->count; if (count) { int to_drain = min(count, @@ -2639,7 +2626,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) free_pcppages_bulk(zone, to_drain, pcp, 0); count -= to_drain; } - 
pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); + pcp_spin_unlock_nopin(pcp); } while (count); } @@ -6123,7 +6110,6 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu) { struct per_cpu_pages *pcp; struct cpu_cacheinfo *cci; - unsigned long UP_flags; pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); cci = get_cpu_cacheinfo(cpu); @@ -6134,12 +6120,12 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu) * This can reduce zone lock contention without hurting * cache-hot pages sharing. */ - pcp_spin_lock_maybe_irqsave(pcp, UP_flags); + pcp_spin_lock_nopin(pcp); if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) pcp->flags |= PCPF_FREE_HIGH_BATCH; else pcp->flags &= ~PCPF_FREE_HIGH_BATCH; - pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); + pcp_spin_unlock_nopin(pcp); } void setup_pcp_cacheinfo(unsigned int cpu) From e9c01915ae2b2bc9f02dbe994aca826a1021f0a2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 27 Feb 2026 18:08:00 +0100 Subject: [PATCH 133/369] mm/page_alloc: remove pcpu_spin_* wrappers We only ever use pcpu_spin_trylock()/unlock() with struct per_cpu_pages so refactor the helpers to remove the generic layer. No functional change intended. Link: https://lkml.kernel.org/r/20260227-b4-pcp-locking-cleanup-v1-3-f7e22e603447@kernel.org Signed-off-by: Vlastimil Babka (SUSE) Suggested-by: Matthew Wilcox Acked-by: Johannes Weiner Cc: Brendan Jackman Cc: David Hildenbrand (Arm) Cc: Mel Gorman Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index be367516c59b..f11f38ba2e12 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -111,35 +111,29 @@ static DEFINE_MUTEX(pcp_batch_high_lock); #endif /* - * Generic helper to lookup and a per-cpu variable with an embedded spinlock. 
- * Return value should be used with equivalent unlock helper. + * A helper to lookup and trylock pcp with embedded spinlock. + * The return value should be used with the unlock helper. + * NULL return value means the trylock failed. */ -#define pcpu_spin_trylock(type, member, ptr) \ +#ifdef CONFIG_SMP +#define pcp_spin_trylock(ptr) \ ({ \ - type *_ret; \ + struct per_cpu_pages *_ret; \ pcpu_task_pin(); \ _ret = this_cpu_ptr(ptr); \ - if (!spin_trylock(&_ret->member)) { \ + if (!spin_trylock(&_ret->lock)) { \ pcpu_task_unpin(); \ _ret = NULL; \ } \ _ret; \ }) -#define pcpu_spin_unlock(member, ptr) \ +#define pcp_spin_unlock(ptr) \ ({ \ - spin_unlock(&ptr->member); \ + spin_unlock(&ptr->lock); \ pcpu_task_unpin(); \ }) -/* struct per_cpu_pages specific helpers.*/ -#ifdef CONFIG_SMP -#define pcp_spin_trylock(ptr) \ - pcpu_spin_trylock(struct per_cpu_pages, lock, ptr) - -#define pcp_spin_unlock(ptr) \ - pcpu_spin_unlock(lock, ptr) - /* * On CONFIG_SMP=n the UP implementation of spin_trylock() never fails and thus * is not compatible with our locking scheme. However we do not need pcp for From 28266ac94a50e585c267a79d9ef5c2803d4dcd7a Mon Sep 17 00:00:00 2001 From: Gladyshev Ilya Date: Sun, 1 Mar 2026 13:19:39 +0000 Subject: [PATCH 134/369] mm: make ref_unless functions unless_zero only There are no users of (folio/page)_ref_add_unless(page, nr, u) with u != 0 [1] and all current users are "internal" for page refcounting API. This allows us to safely drop this parameter and reduce function semantics to the "unless zero" cases only. If needed, these functions for the u!=0 cases can be trivially reintroduced later using the same atomic_add_unless operations as before. [1]: The last user was dropped in v5.18 kernel, commit 27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount"). There is no trace of discussion as to why this cleanup wasn't done earlier. 
Link: https://lkml.kernel.org/r/a0c89b49d38c671a0bdd35069d15ee13e08314d2.1772370066.git.gladyshev.ilya1@h-partners.com Co-developed-by: Gorbunov Ivan Signed-off-by: Gorbunov Ivan Signed-off-by: Gladyshev Ilya Acked-by: David Hildenbrand (Arm) Acked-by: Kiryl Shutsemau Acked-by: Zi Yan Reviewed-by: Lorenzo Stoakes Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- include/linux/page_ref.h | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 19619e5efeba..08b743aab92a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1506,7 +1506,7 @@ static inline int folio_put_testzero(struct folio *folio) */ static inline bool get_page_unless_zero(struct page *page) { - return page_ref_add_unless(page, 1, 0); + return page_ref_add_unless_zero(page, 1); } static inline struct folio *folio_get_nontail_page(struct page *page) diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 490d0ad6e56d..94d3f0e71c06 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -228,18 +228,18 @@ static inline int folio_ref_dec_return(struct folio *folio) return page_ref_dec_return(&folio->page); } -static inline bool page_ref_add_unless(struct page *page, int nr, int u) +static inline bool page_ref_add_unless_zero(struct page *page, int nr) { - bool ret = atomic_add_unless(&page->_refcount, nr, u); + bool ret = atomic_add_unless(&page->_refcount, nr, 0); if (page_ref_tracepoint_active(page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); return ret; } -static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u) +static inline bool folio_ref_add_unless_zero(struct folio *folio, int nr) { - return page_ref_add_unless(&folio->page, nr, u); + return page_ref_add_unless_zero(&folio->page, nr); } /** @@ -255,12 +255,12 @@ 
static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u) */ static inline bool folio_try_get(struct folio *folio) { - return folio_ref_add_unless(folio, 1, 0); + return folio_ref_add_unless_zero(folio, 1); } static inline bool folio_ref_try_add(struct folio *folio, int count) { - return folio_ref_add_unless(folio, count, 0); + return folio_ref_add_unless_zero(folio, count); } static inline int page_ref_freeze(struct page *page, int count) From 7a197d346a44384a1a858a98ef03766840e561d4 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Mon, 2 Mar 2026 13:10:15 -0700 Subject: [PATCH 135/369] Documentation: fix a hugetlbfs reservation statement Documentation/mm/hugetlbfs_reserv.rst has if (resv_needed <= (resv_huge_pages - free_huge_pages)) resv_huge_pages += resv_needed; which describes this code in gather_surplus_pages() needed = (h->resv_huge_pages + delta) - h->free_huge_pages; if (needed <= 0) { h->resv_huge_pages += delta; return 0; } which means if there are enough free hugepages to account for the new reservation, simply update the global reservation count without further action. But the description is backwards, it should be if (resv_needed <= (free_huge_pages - resv_huge_pages)) instead. 
Link: https://lkml.kernel.org/r/20260302201015.1824798-1-jane.chu@oracle.com Fixes: 70bc0dc578b3 ("Documentation: vm, add hugetlbfs reservation overview") Signed-off-by: Jane Chu Cc: David Hildenbrand Cc: Hillf Danton Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Oscar Salvador Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/mm/hugetlbfs_reserv.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/mm/hugetlbfs_reserv.rst b/Documentation/mm/hugetlbfs_reserv.rst index 4914fbf07966..a49115db18c7 100644 --- a/Documentation/mm/hugetlbfs_reserv.rst +++ b/Documentation/mm/hugetlbfs_reserv.rst @@ -155,7 +155,7 @@ are enough free huge pages to accommodate the reservation. If there are, the global reservation count resv_huge_pages is adjusted something like the following:: - if (resv_needed <= (resv_huge_pages - free_huge_pages)) + if (resv_needed <= (free_huge_pages - resv_huge_pages)) resv_huge_pages += resv_needed; Note that the global lock hugetlb_lock is held when checking and adjusting From 0edd78cd4d40a752dc6d1bc661ce297c40baea29 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 2 Mar 2026 12:47:39 +0100 Subject: [PATCH 136/369] mm/vmalloc: fix incorrect size reporting on allocation failure When __vmalloc_area_node() fails to allocate pages, the failure message may report an incorrect allocation size, for example: vmalloc error: size 0, failed to allocate pages, ... This happens because the warning prints area->nr_pages * PAGE_SIZE. At this point, area->nr_pages may be zero or partly populated thus it is not valid. Report the originally requested allocation size instead by using nr_small_pages * PAGE_SIZE, which reflects the actual number of pages being requested by user.
Link: https://lkml.kernel.org/r/20260302114740.2668450-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Mikulas Patocka Reviewed-by: Vishal Moola (Oracle) Cc: Baoquan He Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b2c2ed650840..634d8c782cca 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3894,7 +3894,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (!fatal_signal_pending(current) && page_order == 0) warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocate pages", - area->nr_pages * PAGE_SIZE); + nr_small_pages * PAGE_SIZE); goto fail; } From 3caedb3b99eabe9f67b7b6c704ab8a92fe35dcec Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 2 Mar 2026 12:47:40 +0100 Subject: [PATCH 137/369] vmalloc: support __GFP_RETRY_MAYFAIL and __GFP_NORETRY __GFP_RETRY_MAYFAIL and __GFP_NORETRY haven't been supported so far because their semantic (i.e. to not trigger OOM killer) is not possible with the existing vmalloc page table allocation which is allowing for the OOM killer. Example: __vmalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); vmalloc_test/55 invoked oom-killer: gfp_mask=0x40dc0( GFP_KERNEL|__GFP_ZERO|__GFP_COMP), order=0, oom_score_adj=0 active_anon:0 inactive_anon:0 isolated_anon:0 active_file:0 inactive_file:0 isolated_file:0 unevictable:0 dirty:0 writeback:0 slab_reclaimable:700 slab_unreclaimable:33708 mapped:0 shmem:0 pagetables:5174 sec_pagetables:0 bounce:0 kernel_misc_reclaimable:0 free:850 free_pcp:319 free_cma:0 CPU: 4 UID: 0 PID: 639 Comm: vmalloc_test/55 ... Hardware name: QEMU Standard PC (i440FX + PIIX, ... Call Trace: dump_stack_lvl+0x5d/0x80 dump_header+0x43/0x1b3 out_of_memory.cold+0x8/0x78 __alloc_pages_slowpath.constprop.0+0xef5/0x1130 __alloc_frozen_pages_noprof+0x312/0x330 alloc_pages_mpol+0x7d/0x160 alloc_pages_noprof+0x50/0xa0 __pte_alloc_kernel+0x1e/0x1f0 ... 
There are usecases for these modifiers when a large allocation request should rather fail than trigger OOM killer which wouldn't be able to handle the situation anyway [1]. While we cannot change existing page table allocation code easily we can piggy back on scoped NOWAIT allocation for them that we already have in place. The rationale is that the bulk of the consumed memory is sitting in pages backing the vmalloc allocation. Page tables are only participating a tiny fraction. Moreover page tables for virtually allocated areas are never reclaimed so the longer the system runs the less likely they are. It makes sense to allow an approximation of __GFP_RETRY_MAYFAIL and __GFP_NORETRY even if the page table allocation part is much weaker. This doesn't break the failure mode while it allows for the no OOM semantic. [1] https://lore.kernel.org/all/32bd9bed-a939-69c4-696d-f7f9a5fe31d8@redhat.com/T/#u Link: https://lkml.kernel.org/r/20260302114740.2668450-2-urezki@gmail.com Signed-off-by: Michal Hocko Signed-off-by: Uladzislau Rezki (Sony) Tested-by: Uladzislau Rezki (Sony) Cc: Baoquan He Cc: Mikulas Patocka Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/vmalloc.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 634d8c782cca..c607307c657a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3797,6 +3797,8 @@ static void defer_vm_area_cleanup(struct vm_struct *area) * non-blocking (no __GFP_DIRECT_RECLAIM) - memalloc_noreclaim_save() * GFP_NOFS - memalloc_nofs_save() * GFP_NOIO - memalloc_noio_save() + * __GFP_RETRY_MAYFAIL, __GFP_NORETRY - memalloc_noreclaim_save() + * to prevent OOMs * * Returns a flag cookie to pair with restore.
*/ @@ -3805,7 +3807,8 @@ memalloc_apply_gfp_scope(gfp_t gfp_mask) { unsigned int flags = 0; - if (!gfpflags_allow_blocking(gfp_mask)) + if (!gfpflags_allow_blocking(gfp_mask) || + (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_NORETRY))) flags = memalloc_noreclaim_save(); else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) flags = memalloc_nofs_save(); @@ -3933,7 +3936,8 @@ fail: * GFP_KERNEL_ACCOUNT. Xfs uses __GFP_NOLOCKDEP. */ #define GFP_VMALLOC_SUPPORTED (GFP_KERNEL | GFP_ATOMIC | GFP_NOWAIT |\ - __GFP_NOFAIL | __GFP_ZERO | __GFP_NORETRY |\ + __GFP_NOFAIL | __GFP_ZERO |\ + __GFP_NORETRY | __GFP_RETRY_MAYFAIL |\ GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\ GFP_USER | __GFP_NOLOCKDEP) @@ -3964,12 +3968,15 @@ static gfp_t vmalloc_fix_flags(gfp_t flags) * virtual range with protection @prot. * * Supported GFP classes: %GFP_KERNEL, %GFP_ATOMIC, %GFP_NOWAIT, - * %GFP_NOFS and %GFP_NOIO. Zone modifiers are not supported. + * %__GFP_RETRY_MAYFAIL, %__GFP_NORETRY, %GFP_NOFS and %GFP_NOIO. + * Zone modifiers are not supported. * Please note %GFP_ATOMIC and %GFP_NOWAIT are supported only * by __vmalloc(). * - * Retry modifiers: only %__GFP_NOFAIL is supported; %__GFP_NORETRY - * and %__GFP_RETRY_MAYFAIL are not supported. + * Retry modifiers: only %__GFP_NOFAIL is fully supported; + * %__GFP_NORETRY and %__GFP_RETRY_MAYFAIL are supported with limitation, + * i.e. page tables are allocated with NOWAIT semantic so they might fail + * under moderate memory pressure. * * %__GFP_NOWARN can be used to suppress failure messages. * From 9de209c183de3a52220b37262e2a0e0d52719de5 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Tue, 3 Mar 2026 13:04:16 +0100 Subject: [PATCH 138/369] kasan: docs: SLUB is the only remaining slab implementation We have only the SLUB implementation left in the kernel (referred to as "slab"). Therefore, there is nothing special regarding KASAN modes when it comes to the slab allocator anymore. 
Drop the stale comment regarding differing SLUB vs. SLAB support. Link: https://lkml.kernel.org/r/20260303120416.62580-1-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Vlastimil Babka (SUSE) Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Vincenzo Frascino Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/dev-tools/kasan.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index a034700da7c4..4968b2aa60c8 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -75,9 +75,6 @@ Software Tag-Based KASAN supports slab, page_alloc, vmalloc, and stack memory. Hardware Tag-Based KASAN supports slab, page_alloc, and non-executable vmalloc memory. -For slab, both software KASAN modes support SLUB and SLAB allocators, while -Hardware Tag-Based KASAN only supports SLUB. - Usage ----- From c48ad5a4b8fef0f9c85d3e0531be4968860d3792 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:32 +0100 Subject: [PATCH 139/369] mm/madvise: drop range checks in madvise_free_single_vma() Patch series "mm: cleanups around unmapping / zapping". A bunch of cleanups around unmapping and zapping. Mostly simplifications, code movements, documentation and renaming of zapping functions. 
With this series, we'll have the following high-level zap/unmap functions (excluding high-level folio zapping): * unmap_vmas() for actual unmapping (vmas will go away) * zap_vma(): zap all page table entries in a vma * zap_vma_for_reaping(): zap_vma() that must not block * zap_vma_range(): zap a range of page table entries * zap_vma_range_batched(): zap_vma_range() with more options and batching * zap_special_vma_range(): limited zap_vma_range() for modules * __zap_vma_range(): internal helper Patch #1 is not about unmapping/zapping, but I stumbled over it while verifying MADV_DONTNEED range handling. Patch #16 is related to [1], but makes sense even independent of that. This patch (of 16): madvise_vma_behavior()-> madvise_dontneed_free()->madvise_free_single_vma() is only called from madvise_walk_vmas() (a) After try_vma_read_lock() confirmed that the whole range falls into a single VMA (see is_vma_lock_sufficient()). (b) After adjusting the range to the VMA in the loop afterwards. madvise_dontneed_free() might drop the MM lock when handling userfaultfd, but it properly looks up the VMA again to adjust the range. So in madvise_free_single_vma(), the given range should always fall into a single VMA and should also span at least one page. Let's drop the error checks. The code now matches what we do in madvise_dontneed_single_vma(), where we call zap_vma_range_batched() that documents: "The range must fit into one VMA.". Although that function still adjusts that range, we'll change that soon. 
Link: https://lkml.kernel.org/r/20260227200848.114019-1-david@kernel.org Link: https://lkml.kernel.org/r/20260227200848.114019-2-david@kernel.org Link: https://lore.kernel.org/r/aYSKyr7StGpGKNqW@google.com [1] Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/madvise.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index dbb69400786d..1313166c5514 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -799,9 +799,10 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior) { struct mm_struct *mm = madv_behavior->mm; struct vm_area_struct *vma = madv_behavior->vma; - unsigned long start_addr = madv_behavior->range.start; - unsigned long end_addr = madv_behavior->range.end; - struct mmu_notifier_range range; + struct mmu_notifier_range range = { + .start = madv_behavior->range.start, + .end = madv_behavior->range.end, + }; struct 
mmu_gather *tlb = madv_behavior->tlb; struct mm_walk_ops walk_ops = { .pmd_entry = madvise_free_pte_range, @@ -811,12 +812,6 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior) if (!vma_is_anonymous(vma)) return -EINVAL; - range.start = max(vma->vm_start, start_addr); - if (range.start >= vma->vm_end) - return -EINVAL; - range.end = min(vma->vm_end, end_addr); - if (range.end <= vma->vm_start) - return -EINVAL; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, range.start, range.end); From de008c9ba5684f14e83bcf86cd45fb0e4e6c4d82 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:33 +0100 Subject: [PATCH 140/369] mm/memory: remove "zap_details" parameter from zap_page_range_single() Nobody except memory.c should really set that parameter to non-NULL. So let's just drop it and make unmap_mapping_range_vma() use zap_page_range_single_batched() instead. [david@kernel.org: format on a single line] Link: https://lkml.kernel.org/r/8a27e9ac-2025-4724-a46d-0a7c90894ba7@kernel.org Link: https://lkml.kernel.org/r/20260227200848.114019-3-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Puranjay Mohan Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- arch/s390/mm/gmap_helpers.c | 2 +- drivers/android/binder_alloc.c | 2 +- include/linux/mm.h | 5 ++--- kernel/bpf/arena.c | 3 +-- kernel/events/core.c | 2 +- mm/madvise.c | 3 +-- mm/memory.c | 16 ++++++++++------ net/ipv4/tcp.c | 5 ++--- rust/kernel/mm/virt.rs | 4 +--- 9 files changed, 20 insertions(+), 22 deletions(-) diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index dea83e3103e5..ae2d59a19313 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -89,7 +89,7 @@ void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned lo if (!vma) return; if (!is_vm_hugetlb_page(vma)) - zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr, NULL); + zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr); vmaddr = vma->vm_end; } } diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 241f16a9b63d..dd2046bd5cde 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1185,7 +1185,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, if (vma) { trace_binder_unmap_user_start(alloc, index); - zap_page_range_single(vma, page_addr, PAGE_SIZE, NULL); + zap_page_range_single(vma, page_addr, PAGE_SIZE); trace_binder_unmap_user_end(alloc, index); 
} diff --git a/include/linux/mm.h b/include/linux/mm.h index 08b743aab92a..6512d70c5852 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2804,11 +2804,10 @@ struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, - unsigned long size, struct zap_details *details); + unsigned long size); static inline void zap_vma_pages(struct vm_area_struct *vma) { - zap_page_range_single(vma, vma->vm_start, - vma->vm_end - vma->vm_start, NULL); + zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start); } struct mmu_notifier_range; diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index f355cf1c1a16..19cca936eb9d 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -656,8 +656,7 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) guard(mutex)(&arena->lock); /* iterate link list under lock */ list_for_each_entry(vml, &arena->vma_list, head) - zap_page_range_single(vml->vma, uaddr, - PAGE_SIZE * page_cnt, NULL); + zap_page_range_single(vml->vma, uaddr, PAGE_SIZE * page_cnt); } static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) diff --git a/kernel/events/core.c b/kernel/events/core.c index 89b40e439717..2ecdaabf1b4d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7213,7 +7213,7 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) #ifdef CONFIG_MMU /* Clear any partial mappings on error. 
*/ if (err) - zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL); + zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE); #endif return err; diff --git a/mm/madvise.c b/mm/madvise.c index 1313166c5514..e4a2728593a8 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1193,8 +1193,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) * OK some of the range have non-guard pages mapped, zap * them. This leaves existing guard pages in place. */ - zap_page_range_single(vma, range->start, - range->end - range->start, NULL); + zap_page_range_single(vma, range->start, range->end - range->start); } /* diff --git a/mm/memory.c b/mm/memory.c index f78ab3869f8d..fbd02d5bd520 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2203,17 +2203,16 @@ void zap_page_range_single_batched(struct mmu_gather *tlb, * @vma: vm_area_struct holding the applicable pages * @address: starting address of pages to zap * @size: number of bytes to zap - * @details: details of shared cache invalidation * * The range must fit into one VMA. */ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, - unsigned long size, struct zap_details *details) + unsigned long size) { struct mmu_gather tlb; tlb_gather_mmu(&tlb, vma->vm_mm); - zap_page_range_single_batched(&tlb, vma, address, size, details); + zap_page_range_single_batched(&tlb, vma, address, size, NULL); tlb_finish_mmu(&tlb); } @@ -2235,7 +2234,7 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, !(vma->vm_flags & VM_PFNMAP)) return; - zap_page_range_single(vma, address, size, NULL); + zap_page_range_single(vma, address, size); } EXPORT_SYMBOL_GPL(zap_vma_ptes); @@ -3003,7 +3002,7 @@ static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long add * maintain page reference counts, and callers may free * pages due to the error. So zap it early. 
*/ - zap_page_range_single(vma, addr, size, NULL); + zap_page_range_single(vma, addr, size); return error; } @@ -4226,7 +4225,12 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details) { - zap_page_range_single(vma, start_addr, end_addr - start_addr, details); + struct mmu_gather tlb; + + tlb_gather_mmu(&tlb, vma->vm_mm); + zap_page_range_single_batched(&tlb, vma, start_addr, + end_addr - start_addr, details); + tlb_finish_mmu(&tlb); } static inline void unmap_mapping_range_tree(struct rb_root_cached *root, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 202a4e57a218..89c962672e51 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2105,7 +2105,7 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, maybe_zap_len = total_bytes_to_map - /* All bytes to map */ *length + /* Mapped or pending */ (pages_remaining * PAGE_SIZE); /* Failed map. */ - zap_page_range_single(vma, *address, maybe_zap_len, NULL); + zap_page_range_single(vma, *address, maybe_zap_len); err = 0; } @@ -2270,8 +2270,7 @@ static int tcp_zerocopy_receive(struct sock *sk, total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); if (total_bytes_to_map) { if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT)) - zap_page_range_single(vma, address, total_bytes_to_map, - NULL); + zap_page_range_single(vma, address, total_bytes_to_map); zc->length = total_bytes_to_map; zc->recv_skip_hint = 0; } else { diff --git a/rust/kernel/mm/virt.rs b/rust/kernel/mm/virt.rs index da21d65ccd20..6bfd91cfa1f4 100644 --- a/rust/kernel/mm/virt.rs +++ b/rust/kernel/mm/virt.rs @@ -123,9 +123,7 @@ impl VmaRef { // SAFETY: By the type invariants, the caller has read access to this VMA, which is // sufficient for this method call. This method has no requirements on the vma flags. The // address range is checked to be within the vma. 
- unsafe { - bindings::zap_page_range_single(self.as_ptr(), address, size, core::ptr::null_mut()) - }; + unsafe { bindings::zap_page_range_single(self.as_ptr(), address, size) }; } /// If the [`VM_MIXEDMAP`] flag is set, returns a [`VmaMixedMap`] to this VMA, otherwise From 75c5ae05e3d98d2cb4eeef40bf1467f2edc14bd2 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:34 +0100 Subject: [PATCH 141/369] mm/memory: inline unmap_mapping_range_vma() into unmap_mapping_range_tree() Let's remove the number of unmap-related functions that cause confusion by inlining unmap_mapping_range_vma() into its single caller. The end result looks pretty readable. Link: https://lkml.kernel.org/r/20260227200848.114019-4-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/memory.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index fbd02d5bd520..f1c5d6b01a62 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4221,18 +4221,6 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) return wp_page_copy(vmf); } -static void unmap_mapping_range_vma(struct vm_area_struct *vma, - unsigned long start_addr, unsigned long end_addr, - struct zap_details *details) -{ - struct mmu_gather tlb; - - tlb_gather_mmu(&tlb, vma->vm_mm); - zap_page_range_single_batched(&tlb, vma, start_addr, - end_addr - start_addr, details); - tlb_finish_mmu(&tlb); -} - static inline void unmap_mapping_range_tree(struct rb_root_cached *root, pgoff_t first_index, pgoff_t last_index, @@ -4240,17 +4228,20 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, { struct vm_area_struct *vma; pgoff_t vba, vea, zba, zea; + unsigned long start, size; + struct mmu_gather tlb; vma_interval_tree_foreach(vma, root, first_index, last_index) { vba = vma->vm_pgoff; vea = vba + vma_pages(vma) - 1; zba = max(first_index, vba); zea = min(last_index, vea); + start = ((zba - vba) << PAGE_SHIFT) + vma->vm_start; + size = (zea - zba + 1) << PAGE_SHIFT; - unmap_mapping_range_vma(vma, - ((zba - vba) << PAGE_SHIFT) + 
vma->vm_start, - ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, - details); + tlb_gather_mmu(&tlb, vma->vm_mm); + zap_page_range_single_batched(&tlb, vma, start, size, details); + tlb_finish_mmu(&tlb); } } From 599a59e6037838ea7cd6264d7980ea63de244994 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:35 +0100 Subject: [PATCH 142/369] mm/memory: simplify calculation in unmap_mapping_range_tree() Let's simplify the calculation a bit further to make it easier to get, reusing vma_last_pgoff() which we move from interval_tree.c to mm.h. Link: https://lkml.kernel.org/r/20260227200848.114019-5-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ mm/interval_tree.c | 5 ----- mm/memory.c | 12 +++++------- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6512d70c5852..771d021b7948 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3969,6 +3969,11 @@ static inline unsigned long vma_pages(const struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } +static inline unsigned long vma_last_pgoff(struct vm_area_struct *vma) +{ + return vma->vm_pgoff + vma_pages(vma) - 1; +} + static inline unsigned long vma_desc_size(const struct vm_area_desc *desc) { return desc->end - desc->start; diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 32e390c42c53..32bcfbfcf15f 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -15,11 +15,6 @@ static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) return v->vm_pgoff; } -static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) -{ - return v->vm_pgoff + vma_pages(v) - 1; -} - INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, unsigned long, shared.rb_subtree_last, vma_start_pgoff, vma_last_pgoff, /* empty */, vma_interval_tree) diff --git a/mm/memory.c b/mm/memory.c index f1c5d6b01a62..24b768885379 100644 --- a/mm/memory.c +++ 
b/mm/memory.c @@ -4227,17 +4227,15 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, struct zap_details *details) { struct vm_area_struct *vma; - pgoff_t vba, vea, zba, zea; unsigned long start, size; struct mmu_gather tlb; vma_interval_tree_foreach(vma, root, first_index, last_index) { - vba = vma->vm_pgoff; - vea = vba + vma_pages(vma) - 1; - zba = max(first_index, vba); - zea = min(last_index, vea); - start = ((zba - vba) << PAGE_SHIFT) + vma->vm_start; - size = (zea - zba + 1) << PAGE_SHIFT; + const pgoff_t start_idx = max(first_index, vma->vm_pgoff); + const pgoff_t end_idx = min(last_index, vma_last_pgoff(vma)) + 1; + + start = vma->vm_start + ((start_idx - vma->vm_pgoff) << PAGE_SHIFT); + size = (end_idx - start_idx) << PAGE_SHIFT; tlb_gather_mmu(&tlb, vma->vm_mm); zap_page_range_single_batched(&tlb, vma, start, size, details); From f52f202ddebb17d7e2001ecbe863115c5a9c8c0b Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:36 +0100 Subject: [PATCH 143/369] mm/oom_kill: use MMU_NOTIFY_CLEAR in __oom_reap_task_mm() In commit 7269f999934b ("mm/mmu_notifier: use correct mmu_notifier events for each invalidation") we converted all MMU_NOTIFY_UNMAP to MMU_NOTIFY_CLEAR, except the ones that actually perform munmap() or mremap() as documented. __oom_reap_task_mm() behaves much more like MADV_DONTNEED. So use MMU_NOTIFY_CLEAR as well. This is a preparation for further changes. Link: https://lkml.kernel.org/r/20260227200848.114019-6-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/oom_kill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 5c6c95c169ee..0ba56fcd10d5 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -551,7 +551,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm) struct mmu_notifier_range range; struct mmu_gather tlb; - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vma->vm_start, vma->vm_end); tlb_gather_mmu(&tlb, mm); From ba25127a8f0cb501c2d8e8879e38d203d044e50c Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:37 +0100 Subject: [PATCH 144/369] mm/oom_kill: factor out zapping of VMA into zap_vma_for_reaping() Let's factor it out so we can turn unmap_page_range() into a static function instead, and so oom reaping has a clean interface to call. Note that hugetlb is not supported, because it would require a bunch of hugetlb-specific further actions (see zap_page_range_single_batched()). 
Link: https://lkml.kernel.org/r/20260227200848.114019-7-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/internal.h | 5 +---- mm/memory.c | 36 ++++++++++++++++++++++++++++++++---- mm/oom_kill.c | 15 +-------------- 3 files changed, 34 insertions(+), 22 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 84167b0570c9..b0ac179d3a5d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -536,13 +536,10 @@ static inline void sync_with_folio_pmd_zap(struct mm_struct *mm, pmd_t *pmdp) } struct zap_details; -void unmap_page_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, - unsigned long addr, unsigned long end, - struct zap_details *details); void zap_page_range_single_batched(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long size, struct zap_details *details); +int zap_vma_for_reaping(struct vm_area_struct *vma); int 
folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, gfp_t gfp); diff --git a/mm/memory.c b/mm/memory.c index 24b768885379..dbc9a6d0074c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2054,10 +2054,9 @@ static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, return addr; } -void unmap_page_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, - unsigned long addr, unsigned long end, - struct zap_details *details) +static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct zap_details *details) { pgd_t *pgd; unsigned long next; @@ -2115,6 +2114,35 @@ static void unmap_single_vma(struct mmu_gather *tlb, } } +/** + * zap_vma_for_reaping - zap all page table entries in the vma without blocking + * @vma: The vma to zap. + * + * Zap all page table entries in the vma without blocking for use by the oom + * killer. Hugetlb vmas are not supported. + * + * Returns: 0 on success, -EBUSY if we would have to block. + */ +int zap_vma_for_reaping(struct vm_area_struct *vma) +{ + struct mmu_notifier_range range; + struct mmu_gather tlb; + + VM_WARN_ON_ONCE(is_vm_hugetlb_page(vma)); + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, + vma->vm_start, vma->vm_end); + tlb_gather_mmu(&tlb, vma->vm_mm); + if (mmu_notifier_invalidate_range_start_nonblock(&range)) { + tlb_finish_mmu(&tlb); + return -EBUSY; + } + unmap_page_range(&tlb, vma, range.start, range.end, NULL); + mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb); + return 0; +} + /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0ba56fcd10d5..54b7a8fe5136 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -548,21 +548,8 @@ static bool __oom_reap_task_mm(struct mm_struct *mm) * count elevated without a good reason. 
*/ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { - struct mmu_notifier_range range; - struct mmu_gather tlb; - - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, - mm, vma->vm_start, - vma->vm_end); - tlb_gather_mmu(&tlb, mm); - if (mmu_notifier_invalidate_range_start_nonblock(&range)) { - tlb_finish_mmu(&tlb); + if (zap_vma_for_reaping(vma)) ret = false; - continue; - } - unmap_page_range(&tlb, vma, range.start, range.end, NULL); - mmu_notifier_invalidate_range_end(&range); - tlb_finish_mmu(&tlb); } } From 19e48cb98b808983541fac258741610174fe1fca Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:38 +0100 Subject: [PATCH 145/369] mm/memory: rename unmap_single_vma() to __zap_vma_range() Let's rename it to better fit our new naming scheme. Link: https://lkml.kernel.org/r/20260227200848.114019-8-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index dbc9a6d0074c..ca718b0029c4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2074,7 +2074,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, } -static void unmap_single_vma(struct mmu_gather *tlb, +static void __zap_vma_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details) { @@ -2177,7 +2177,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap) unsigned long start = unmap->vma_start; unsigned long end = unmap->vma_end; hugetlb_zap_begin(vma, &start, &end); - unmap_single_vma(tlb, vma, start, end, &details); + __zap_vma_range(tlb, vma, start, end, &details); hugetlb_zap_end(vma, &details); vma = mas_find(unmap->mas, unmap->tree_end - 1); } while (vma); @@ -2213,7 +2213,7 @@ void zap_page_range_single_batched(struct mmu_gather *tlb, * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. 
*/ - unmap_single_vma(tlb, vma, address, end, details); + __zap_vma_range(tlb, vma, address, end, details); mmu_notifier_invalidate_range_end(&range); if (is_vm_hugetlb_page(vma)) { /* From b6c0384a04267350c12875e2fd569a31618e4b11 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:39 +0100 Subject: [PATCH 146/369] mm/memory: move adjusting of address range to unmap_vmas() __zap_vma_range() has two callers, whereby zap_page_range_single_batched() documents that the range must fit into the VMA range. So move adjusting the range to unmap_vmas() where it is actually required and add a safety check in __zap_vma_range() instead. In unmap_vmas(), we'd never expect to have empty ranges (otherwise, why have the vma in there in the first place). __zap_vma_range() will no longer be called with start == end, so cleanup the function a bit. While at it, simplify the overly long comment to its core message. We will no longer call uprobe_munmap() for start == end, which actually seems to be the right thing to do. Note that hugetlb_zap_begin()->...->adjust_range_if_pmd_sharing_possible() cannot result in the range exceeding the vma range. Link: https://lkml.kernel.org/r/20260227200848.114019-9-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/memory.c | 58 +++++++++++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 35 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index ca718b0029c4..7e5d52534ee9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2073,44 +2073,28 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_end_vma(tlb, vma); } - -static void __zap_vma_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, struct zap_details *details) +static void __zap_vma_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct zap_details *details) { - unsigned long start = max(vma->vm_start, start_addr); - unsigned long end; - - if (start >= vma->vm_end) - return; - end = min(vma->vm_end, end_addr); - if (end <= vma->vm_start) - return; + VM_WARN_ON_ONCE(start >= end || !range_in_vma(vma, start, end)); if (vma->vm_file) uprobe_munmap(vma, start, end); - if (start != end) { - if (unlikely(is_vm_hugetlb_page(vma))) { - /* - * It is undesirable to test vma->vm_file as it - * should be non-null for valid hugetlb area. - * However, vm_file will be NULL in the error - * cleanup path of mmap_region. 
When - * hugetlbfs ->mmap method fails, - * mmap_region() nullifies vma->vm_file - * before calling this function to clean up. - * Since no pte has actually been setup, it is - * safe to do nothing in this case. - */ - if (vma->vm_file) { - zap_flags_t zap_flags = details ? - details->zap_flags : 0; - __unmap_hugepage_range(tlb, vma, start, end, - NULL, zap_flags); - } - } else - unmap_page_range(tlb, vma, start, end, details); + if (unlikely(is_vm_hugetlb_page(vma))) { + zap_flags_t zap_flags = details ? details->zap_flags : 0; + + /* + * vm_file will be NULL when we fail early while instantiating + * a new mapping. In this case, no pages were mapped yet and + * there is nothing to do. + */ + if (!vma->vm_file) + return; + __unmap_hugepage_range(tlb, vma, start, end, NULL, zap_flags); + } else { + unmap_page_range(tlb, vma, start, end, details); } } @@ -2174,8 +2158,9 @@ void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap) unmap->vma_start, unmap->vma_end); mmu_notifier_invalidate_range_start(&range); do { - unsigned long start = unmap->vma_start; - unsigned long end = unmap->vma_end; + unsigned long start = max(vma->vm_start, unmap->vma_start); + unsigned long end = min(vma->vm_end, unmap->vma_end); + hugetlb_zap_begin(vma, &start, &end); __zap_vma_range(tlb, vma, start, end, &details); hugetlb_zap_end(vma, &details); @@ -2204,6 +2189,9 @@ void zap_page_range_single_batched(struct mmu_gather *tlb, VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm); + if (unlikely(!size)) + return; + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); hugetlb_zap_begin(vma, &range.start, &range.end); From a97bc13d15f472c7f8ede1b38660fb55b6dab68d Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:40 +0100 Subject: [PATCH 147/369] mm/memory: convert details->even_cows into details->skip_cows The current semantics are confusing: simply because someone specifies an empty zap_detail struct suddenly makes 
should_zap_cows() behave differently. The default should be to also zap CoW'ed anonymous pages. Really only unmap_mapping_pages() and friends want to skip zapping of these anon folios. So let's invert the meaning; turn the confusing "reclaim_pt" check that overrides other properties in should_zap_cows() into a safety check. Note that the only caller that sets reclaim_pt=true is madvise_dontneed_single_vma(), which wants to zap any pages. Link: https://lkml.kernel.org/r/20260227200848.114019-10-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/madvise.c | 1 - mm/memory.c | 12 ++++++------ 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 771d021b7948..cb4f5fbccaf0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2767,7 +2767,7 @@ extern void pagefault_out_of_memory(void); */ struct zap_details { 
struct folio *single_folio; /* Locked folio to be unmapped */ - bool even_cows; /* Zap COWed private pages too? */ + bool skip_cows; /* Do not zap COWed private pages */ bool reclaim_pt; /* Need reclaim page tables? */ zap_flags_t zap_flags; /* Extra flags for zapping */ }; diff --git a/mm/madvise.c b/mm/madvise.c index e4a2728593a8..e86228682842 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -853,7 +853,6 @@ static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior) struct madvise_behavior_range *range = &madv_behavior->range; struct zap_details details = { .reclaim_pt = true, - .even_cows = true, }; zap_page_range_single_batched( diff --git a/mm/memory.c b/mm/memory.c index 7e5d52534ee9..c66b7b8b47eb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1554,11 +1554,13 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) static inline bool should_zap_cows(struct zap_details *details) { /* By default, zap all pages */ - if (!details || details->reclaim_pt) + if (!details) return true; + VM_WARN_ON_ONCE(details->skip_cows && details->reclaim_pt); + /* Or, we zap COWed pages only if the caller wants to */ - return details->even_cows; + return !details->skip_cows; } /* Decides whether we should zap this folio with the folio pointer specified */ @@ -2149,8 +2151,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap) struct mmu_notifier_range range; struct zap_details details = { .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP, - /* Careful - we need to zap private pages too! 
*/ - .even_cows = true, }; vma = unmap->first; @@ -4282,7 +4282,7 @@ void unmap_mapping_folio(struct folio *folio) first_index = folio->index; last_index = folio_next_index(folio) - 1; - details.even_cows = false; + details.skip_cows = true; details.single_folio = folio; details.zap_flags = ZAP_FLAG_DROP_MARKER; @@ -4312,7 +4312,7 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t first_index = start; pgoff_t last_index = start + nr - 1; - details.even_cows = even_cows; + details.skip_cows = !even_cows; if (last_index < first_index) last_index = ULONG_MAX; From 5f10cbbddc2bd80a5944f1c783830f7ebf648ad2 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:41 +0100 Subject: [PATCH 148/369] mm/memory: use __zap_vma_range() in zap_vma_for_reaping() Let's call __zap_vma_range() instead of unmap_page_range() to prepare for further cleanups. To keep the existing behavior, whereby we do not call uprobe_munmap() which could block, add a new "reaping" member to zap_details and use it. Likely we should handle the possible blocking in uprobe_munmap() differently, but for now keep it unchanged. Link: https://lkml.kernel.org/r/20260227200848.114019-11-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + mm/memory.c | 13 +++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index cb4f5fbccaf0..488a144c9161 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2769,6 +2769,7 @@ struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ bool skip_cows; /* Do not zap COWed private pages */ bool reclaim_pt; /* Need reclaim page tables? */ + bool reaping; /* Reaping, do not block. */ zap_flags_t zap_flags; /* Extra flags for zapping */ }; diff --git a/mm/memory.c b/mm/memory.c index c66b7b8b47eb..d1fd3cdd677a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2079,14 +2079,18 @@ static void __zap_vma_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct zap_details *details) { + const bool reaping = details && details->reaping; + VM_WARN_ON_ONCE(start >= end || !range_in_vma(vma, start, end)); - if (vma->vm_file) + /* uprobe_munmap() might sleep, so skip it when reaping. */ + if (vma->vm_file && !reaping) uprobe_munmap(vma, start, end); if (unlikely(is_vm_hugetlb_page(vma))) { zap_flags_t zap_flags = details ? 
details->zap_flags : 0; + VM_WARN_ON_ONCE(reaping); /* * vm_file will be NULL when we fail early while instantiating * a new mapping. In this case, no pages were mapped yet and @@ -2111,11 +2115,12 @@ static void __zap_vma_range(struct mmu_gather *tlb, struct vm_area_struct *vma, */ int zap_vma_for_reaping(struct vm_area_struct *vma) { + struct zap_details details = { + .reaping = true, + }; struct mmu_notifier_range range; struct mmu_gather tlb; - VM_WARN_ON_ONCE(is_vm_hugetlb_page(vma)); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, vma->vm_start, vma->vm_end); tlb_gather_mmu(&tlb, vma->vm_mm); @@ -2123,7 +2128,7 @@ int zap_vma_for_reaping(struct vm_area_struct *vma) tlb_finish_mmu(&tlb); return -EBUSY; } - unmap_page_range(&tlb, vma, range.start, range.end, NULL); + __zap_vma_range(&tlb, vma, range.start, range.end, &details); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); return 0; From 3a31d08d242aeb104814c93a1b93d09e483ddf8e Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:42 +0100 Subject: [PATCH 149/369] mm/memory: inline unmap_page_range() into __zap_vma_range() Let's inline it into the single caller to reduce the number of confusing unmap/zap helpers. Get rid of the unnecessary BUG_ON(). [david@kernel.org: call the local variable simply "addr", per Lorenzo] Link: https://lkml.kernel.org/r/f7732d1c-0e85-4a14-948a-912c417018b5@kernel.org Link: https://lkml.kernel.org/r/20260227200848.114019-12-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/memory.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index d1fd3cdd677a..8c77a765036f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2056,25 +2056,6 @@ static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, return addr; } -static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, - unsigned long addr, unsigned long end, - struct zap_details *details) -{ - pgd_t *pgd; - unsigned long next; - - BUG_ON(addr >= end); - tlb_start_vma(tlb, vma); - pgd = pgd_offset(vma->vm_mm, addr); - do { - next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) - continue; - next = zap_p4d_range(tlb, vma, pgd, addr, next, details); - } while (pgd++, addr = next, addr != end); - tlb_end_vma(tlb, vma); -} - static void __zap_vma_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct zap_details *details) @@ -2100,7 +2081,18 @@ static void __zap_vma_range(struct mmu_gather *tlb, struct vm_area_struct *vma, return; __unmap_hugepage_range(tlb, vma, start, end, NULL, zap_flags); } else { - unmap_page_range(tlb, vma, start, end, details); + unsigned long next, addr = start; + pgd_t *pgd; + + tlb_start_vma(tlb, vma); + pgd = 
pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + next = zap_p4d_range(tlb, vma, pgd, addr, next, details); + } while (pgd++, addr = next, addr != end); + tlb_end_vma(tlb, vma); } } From 32bc7fe4a6f4d359b6de96cbc106d2cac695154e Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:43 +0100 Subject: [PATCH 150/369] mm: rename zap_vma_pages() to zap_vma() Let's rename it to an even simpler name. While at it, add some simplistic kernel doc. Link: https://lkml.kernel.org/r/20260227200848.114019-13-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- arch/powerpc/platforms/book3s/vas-api.c | 2 +- arch/powerpc/platforms/pseries/vas.c | 2 +- include/linux/mm.h | 6 +++++- lib/vdso/datastore.c | 2 +- mm/page-writeback.c | 2 +- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c index ea4ffa63f043..e96d79db69fe 100644 --- a/arch/powerpc/platforms/book3s/vas-api.c +++ b/arch/powerpc/platforms/book3s/vas-api.c @@ -414,7 +414,7 @@ static vm_fault_t vas_mmap_fault(struct vm_fault *vmf) /* * When the LPAR lost credits due to core removal or during * migration, invalidate the existing mapping for the current - * paste addresses and set windows in-active (zap_vma_pages in + * paste addresses and set windows in-active (zap_vma() in * reconfig_close_windows()). * New mapping will be done later after migration or new credits * available. So continue to receive faults if the user space diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index ceb0a8788c0a..fa05f04364fe 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -807,7 +807,7 @@ static int reconfig_close_windows(struct vas_caps *vcap, int excess_creds, * is done before the original mmap() and after the ioctl. 
*/ if (vma) - zap_vma_pages(vma); + zap_vma(vma); mutex_unlock(&task_ref->mmap_mutex); mmap_write_unlock(task_ref->mm); diff --git a/include/linux/mm.h b/include/linux/mm.h index 488a144c9161..60c13d40c65c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2806,7 +2806,11 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size); -static inline void zap_vma_pages(struct vm_area_struct *vma) +/** + * zap_vma - zap all page table entries in a vma + * @vma: The vma to zap. + */ +static inline void zap_vma(struct vm_area_struct *vma) { zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start); } diff --git a/lib/vdso/datastore.c b/lib/vdso/datastore.c index a565c30c71a0..222c143aebf7 100644 --- a/lib/vdso/datastore.c +++ b/lib/vdso/datastore.c @@ -121,7 +121,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) mmap_read_lock(mm); for_each_vma(vmi, vma) { if (vma_is_special_mapping(vma, &vdso_vvar_mapping)) - zap_vma_pages(vma); + zap_vma(vma); } mmap_read_unlock(mm); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 1009bb042ba4..8dc47b59ca18 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2645,7 +2645,7 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) * while this function is in progress, although it may have been truncated * before this function is called. Most callers have the folio locked. * A few have the folio blocked from truncation through other means (e.g. - * zap_vma_pages() has it mapped and is holding the page table lock). + * zap_vma() has it mapped and is holding the page table lock). * When called from mark_buffer_dirty(), the filesystem should hold a * reference to the buffer_head that is being marked dirty, which causes * try_to_free_buffers() to fail. 
From 784a742e7b6db236ef1ccfcdbfe29bf6234b3ccb Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:44 +0100 Subject: [PATCH 151/369] mm: rename zap_page_range_single_batched() to zap_vma_range_batched() Let's make the naming more consistent with our new naming scheme. While at it, polish the kerneldoc a bit. Link: https://lkml.kernel.org/r/20260227200848.114019-14-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/internal.h | 2 +- mm/madvise.c | 5 ++--- mm/memory.c | 23 +++++++++++++---------- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index b0ac179d3a5d..6e1162e13289 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -536,7 +536,7 @@ static inline void sync_with_folio_pmd_zap(struct mm_struct *mm, pmd_t *pmdp) } struct zap_details; -void zap_page_range_single_batched(struct mmu_gather *tlb, +void 
zap_vma_range_batched(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long size, struct zap_details *details); int zap_vma_for_reaping(struct vm_area_struct *vma); diff --git a/mm/madvise.c b/mm/madvise.c index e86228682842..a50ec5f90e3e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -855,9 +855,8 @@ static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior) .reclaim_pt = true, }; - zap_page_range_single_batched( - madv_behavior->tlb, madv_behavior->vma, range->start, - range->end - range->start, &details); + zap_vma_range_batched(madv_behavior->tlb, madv_behavior->vma, + range->start, range->end - range->start, &details); return 0; } diff --git a/mm/memory.c b/mm/memory.c index 8c77a765036f..879858e466ef 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2167,17 +2167,20 @@ void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap) } /** - * zap_page_range_single_batched - remove user pages in a given range + * zap_vma_range_batched - zap page table entries in a vma range * @tlb: pointer to the caller's struct mmu_gather - * @vma: vm_area_struct holding the applicable pages - * @address: starting address of pages to remove - * @size: number of bytes to remove - * @details: details of shared cache invalidation + * @vma: the vma covering the range to zap + * @address: starting address of the range to zap + * @size: number of bytes to zap + * @details: details specifying zapping behavior * - * @tlb shouldn't be NULL. The range must fit into one VMA. If @vma is for - * hugetlb, @tlb is flushed and re-initialized by this function. + * @tlb must not be NULL. The provided address range must be fully + * contained within @vma. If @vma is for hugetlb, @tlb is flushed and + * re-initialized by this function. + * + * If @details is NULL, this function will zap all page table entries. 
*/ -void zap_page_range_single_batched(struct mmu_gather *tlb, +void zap_vma_range_batched(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { @@ -2225,7 +2228,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, struct mmu_gather tlb; tlb_gather_mmu(&tlb, vma->vm_mm); - zap_page_range_single_batched(&tlb, vma, address, size, NULL); + zap_vma_range_batched(&tlb, vma, address, size, NULL); tlb_finish_mmu(&tlb); } @@ -4251,7 +4254,7 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, size = (end_idx - start_idx) << PAGE_SHIFT; tlb_gather_mmu(&tlb, vma->vm_mm); - zap_page_range_single_batched(&tlb, vma, start, size, details); + zap_vma_range_batched(&tlb, vma, start, size, details); tlb_finish_mmu(&tlb); } } From 0326440c3545c86b6501c7c636fcf018d6e87b8c Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:45 +0100 Subject: [PATCH 152/369] mm: rename zap_page_range_single() to zap_vma_range() Let's rename it to make it better match our new naming scheme. While at it, polish the kerneldoc. [akpm@linux-foundation.org: fix rustfmtcheck] Link: https://lkml.kernel.org/r/20260227200848.114019-15-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Puranjay Mohan Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- arch/s390/mm/gmap_helpers.c | 2 +- drivers/android/binder/page_range.rs | 4 ++-- drivers/android/binder_alloc.c | 2 +- include/linux/mm.h | 4 ++-- kernel/bpf/arena.c | 2 +- kernel/events/core.c | 2 +- mm/madvise.c | 4 ++-- mm/memory.c | 14 +++++++------- net/ipv4/tcp.c | 6 +++--- rust/kernel/mm/virt.rs | 4 ++-- 10 files changed, 22 insertions(+), 22 deletions(-) diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index ae2d59a19313..f8789ffcc05c 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -89,7 +89,7 @@ void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned lo if (!vma) return; if (!is_vm_hugetlb_page(vma)) - zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr); + zap_vma_range(vma, vmaddr, min(end, vma->vm_end) - vmaddr); vmaddr = vma->vm_end; } } diff --git a/drivers/android/binder/page_range.rs b/drivers/android/binder/page_range.rs index 9dfc154e5dd4..8882fd18d9f3 100644 --- a/drivers/android/binder/page_range.rs +++ b/drivers/android/binder/page_range.rs @@ -130,7 +130,7 @@ pub(crate) struct ShrinkablePageRange { pid: Pid, /// The mm for the relevant process. mm: ARef, - /// Used to synchronize calls to `vm_insert_page` and `zap_page_range_single`. 
+ /// Used to synchronize calls to `vm_insert_page` and `zap_vma_range`. #[pin] mm_lock: Mutex<()>, /// Spinlock protecting changes to pages. @@ -762,7 +762,7 @@ unsafe extern "C" fn rust_shrink_free_page( if let Some(unchecked_vma) = mmap_read.vma_lookup(vma_addr) { if let Some(vma) = check_vma(unchecked_vma, range_ptr) { let user_page_addr = vma_addr + (page_index << PAGE_SHIFT); - vma.zap_page_range_single(user_page_addr, PAGE_SIZE); + vma.zap_vma_range(user_page_addr, PAGE_SIZE); } } diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index dd2046bd5cde..e4488ad86a65 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1185,7 +1185,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, if (vma) { trace_binder_unmap_user_start(alloc, index); - zap_page_range_single(vma, page_addr, PAGE_SIZE); + zap_vma_range(vma, page_addr, PAGE_SIZE); trace_binder_unmap_user_end(alloc, index); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 60c13d40c65c..10a5b9ba4eeb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2804,7 +2804,7 @@ struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); -void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, +void zap_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); /** * zap_vma - zap all page table entries in a vma @@ -2812,7 +2812,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, */ static inline void zap_vma(struct vm_area_struct *vma) { - zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start); + zap_vma_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); } struct mmu_notifier_range; diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 19cca936eb9d..08d008cc471e 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ 
-656,7 +656,7 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) guard(mutex)(&arena->lock); /* iterate link list under lock */ list_for_each_entry(vml, &arena->vma_list, head) - zap_page_range_single(vml->vma, uaddr, PAGE_SIZE * page_cnt); + zap_vma_range(vml->vma, uaddr, PAGE_SIZE * page_cnt); } static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) diff --git a/kernel/events/core.c b/kernel/events/core.c index 2ecdaabf1b4d..d5b21077e829 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7213,7 +7213,7 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) #ifdef CONFIG_MMU /* Clear any partial mappings on error. */ if (err) - zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE); + zap_vma_range(vma, vma->vm_start, nr_pages * PAGE_SIZE); #endif return err; diff --git a/mm/madvise.c b/mm/madvise.c index a50ec5f90e3e..afe0f01765c4 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -832,7 +832,7 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior) * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The - * zap_page_range_single call sets things up for shrink_active_list to actually + * zap_vma_range call sets things up for shrink_active_list to actually * free these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for * shrink_active_list to pick up before reclaiming other pages. @@ -1191,7 +1191,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) * OK some of the range have non-guard pages mapped, zap * them. This leaves existing guard pages in place. 
*/ - zap_page_range_single(vma, range->start, range->end - range->start); + zap_vma_range(vma, range->start, range->end - range->start); } /* diff --git a/mm/memory.c b/mm/memory.c index 879858e466ef..dd80fbf6473a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2215,14 +2215,14 @@ void zap_vma_range_batched(struct mmu_gather *tlb, } /** - * zap_page_range_single - remove user pages in a given range - * @vma: vm_area_struct holding the applicable pages - * @address: starting address of pages to zap + * zap_vma_range - zap all page table entries in a vma range + * @vma: the vma covering the range to zap + * @address: starting address of the range to zap * @size: number of bytes to zap * - * The range must fit into one VMA. + * The provided address range must be fully contained within @vma. */ -void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, +void zap_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size) { struct mmu_gather tlb; @@ -2250,7 +2250,7 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, !(vma->vm_flags & VM_PFNMAP)) return; - zap_page_range_single(vma, address, size); + zap_vma_range(vma, address, size); } EXPORT_SYMBOL_GPL(zap_vma_ptes); @@ -3018,7 +3018,7 @@ static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long add * maintain page reference counts, and callers may free * pages due to the error. So zap it early. */ - zap_page_range_single(vma, addr, size); + zap_vma_range(vma, addr, size); return error; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 89c962672e51..9573ce9b0ac1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2105,7 +2105,7 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, maybe_zap_len = total_bytes_to_map - /* All bytes to map */ *length + /* Mapped or pending */ (pages_remaining * PAGE_SIZE); /* Failed map. 
*/ - zap_page_range_single(vma, *address, maybe_zap_len); + zap_vma_range(vma, *address, maybe_zap_len); err = 0; } @@ -2113,7 +2113,7 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, unsigned long leftover_pages = pages_remaining; int bytes_mapped; - /* We called zap_page_range_single, try to reinsert. */ + /* We called zap_vma_range, try to reinsert. */ err = vm_insert_pages(vma, *address, pending_pages, &pages_remaining); @@ -2270,7 +2270,7 @@ static int tcp_zerocopy_receive(struct sock *sk, total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); if (total_bytes_to_map) { if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT)) - zap_page_range_single(vma, address, total_bytes_to_map); + zap_vma_range(vma, address, total_bytes_to_map); zc->length = total_bytes_to_map; zc->recv_skip_hint = 0; } else { diff --git a/rust/kernel/mm/virt.rs b/rust/kernel/mm/virt.rs index 6bfd91cfa1f4..63eb730b0b05 100644 --- a/rust/kernel/mm/virt.rs +++ b/rust/kernel/mm/virt.rs @@ -113,7 +113,7 @@ impl VmaRef { /// kernel goes further in freeing unused page tables, but for the purposes of this operation /// we must only assume that the leaf level is cleared. #[inline] - pub fn zap_page_range_single(&self, address: usize, size: usize) { + pub fn zap_vma_range(&self, address: usize, size: usize) { let (end, did_overflow) = address.overflowing_add(size); if did_overflow || address < self.start() || self.end() < end { // TODO: call WARN_ONCE once Rust version of it is added @@ -123,7 +123,7 @@ impl VmaRef { // SAFETY: By the type invariants, the caller has read access to this VMA, which is // sufficient for this method call. This method has no requirements on the vma flags. The // address range is checked to be within the vma. 
- unsafe { bindings::zap_page_range_single(self.as_ptr(), address, size) }; + unsafe { bindings::zap_vma_range(self.as_ptr(), address, size) }; } /// If the [`VM_MIXEDMAP`] flag is set, returns a [`VmaMixedMap`] to this VMA, otherwise From 52a9e9cd181fab8b03cf4e982533224697669976 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:46 +0100 Subject: [PATCH 153/369] mm: rename zap_vma_ptes() to zap_special_vma_range() zap_vma_ptes() is the only zapping function we export to modules. It's essentially a wrapper around zap_vma_range(), however, with some safety checks: * That the passed range fits fully into the VMA * That it's only used for VM_PFNMAP We will add support for VM_MIXEDMAP next, so use the more-generic term "special vma", although "special" is a bit overloaded. Maybe we'll later just support any VM_SPECIAL flag. While at it, improve the kerneldoc. Link: https://lkml.kernel.org/r/20260227200848.114019-16-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Acked-by: Leon Romanovsky [drivers/infiniband] Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- arch/x86/kernel/cpu/sgx/encl.c | 2 +- drivers/comedi/comedi_fops.c | 2 +- drivers/gpu/drm/i915/i915_mm.c | 4 ++-- drivers/infiniband/core/uverbs_main.c | 6 +++--- drivers/misc/sgi-gru/grumain.c | 2 +- include/linux/mm.h | 2 +- mm/memory.c | 16 +++++++--------- 7 files changed, 16 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index ac60ebde5d9b..3f0222d10f6e 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -1220,7 +1220,7 @@ void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr) ret = sgx_encl_find(encl_mm->mm, addr, &vma); if (!ret && encl == vma->vm_private_data) - zap_vma_ptes(vma, addr, PAGE_SIZE); + zap_special_vma_range(vma, addr, PAGE_SIZE); mmap_read_unlock(encl_mm->mm); diff --git a/drivers/comedi/comedi_fops.c b/drivers/comedi/comedi_fops.c index 48a8a607a84c..b91e0b5ac394 100644 --- a/drivers/comedi/comedi_fops.c +++ b/drivers/comedi/comedi_fops.c @@ -2588,7 +2588,7 @@ static int comedi_mmap(struct file *file, struct vm_area_struct *vma) * remap_pfn_range() because we call remap_pfn_range() in a loop. 
*/ if (retval) - zap_vma_ptes(vma, vma->vm_start, size); + zap_special_vma_range(vma, vma->vm_start, size); #endif if (retval == 0) { diff --git a/drivers/gpu/drm/i915/i915_mm.c b/drivers/gpu/drm/i915/i915_mm.c index c33bd3d83069..fd89e7c7d8d6 100644 --- a/drivers/gpu/drm/i915/i915_mm.c +++ b/drivers/gpu/drm/i915/i915_mm.c @@ -108,7 +108,7 @@ int remap_io_mapping(struct vm_area_struct *vma, err = apply_to_page_range(r.mm, addr, size, remap_pfn, &r); if (unlikely(err)) { - zap_vma_ptes(vma, addr, (r.pfn - pfn) << PAGE_SHIFT); + zap_special_vma_range(vma, addr, (r.pfn - pfn) << PAGE_SHIFT); return err; } @@ -156,7 +156,7 @@ int remap_io_sg(struct vm_area_struct *vma, err = apply_to_page_range(r.mm, addr, size, remap_sg, &r); if (unlikely(err)) { - zap_vma_ptes(vma, addr, r.pfn << PAGE_SHIFT); + zap_special_vma_range(vma, addr, r.pfn << PAGE_SHIFT); return err; } diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 7b68967a6301..f5837da47299 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -756,7 +756,7 @@ out_zap: * point, so zap it. */ vma->vm_private_data = NULL; - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); + zap_special_vma_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); } static void rdma_umap_close(struct vm_area_struct *vma) @@ -782,7 +782,7 @@ static void rdma_umap_close(struct vm_area_struct *vma) } /* - * Once the zap_vma_ptes has been called touches to the VMA will come here and + * Once the zap_special_vma_range has been called touches to the VMA will come here and * we return a dummy writable zero page for all the pfns. 
*/ static vm_fault_t rdma_umap_fault(struct vm_fault *vmf) @@ -878,7 +878,7 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) continue; list_del_init(&priv->list); - zap_vma_ptes(vma, vma->vm_start, + zap_special_vma_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); if (priv->entry) { diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c index 8d749f345246..278b76cbd281 100644 --- a/drivers/misc/sgi-gru/grumain.c +++ b/drivers/misc/sgi-gru/grumain.c @@ -542,7 +542,7 @@ void gru_unload_context(struct gru_thread_state *gts, int savestate) int ctxnum = gts->ts_ctxnum; if (!is_kernel_context(gts)) - zap_vma_ptes(gts->ts_vma, UGRUADDR(gts), GRU_GSEG_PAGESIZE); + zap_special_vma_range(gts->ts_vma, UGRUADDR(gts), GRU_GSEG_PAGESIZE); cch = get_cch(gru->gs_gru_base_vaddr, ctxnum); gru_dbg(grudev, "gts %p, cbrmap 0x%lx, dsrmap 0x%lx\n", diff --git a/include/linux/mm.h b/include/linux/mm.h index 10a5b9ba4eeb..c516d5177211 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2802,7 +2802,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, pud_t pud); -void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, +void zap_special_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); diff --git a/mm/memory.c b/mm/memory.c index dd80fbf6473a..3dc4664c9af7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2233,17 +2233,15 @@ void zap_vma_range(struct vm_area_struct *vma, unsigned long address, } /** - * zap_vma_ptes - remove ptes mapping the vma - * @vma: vm_area_struct holding ptes to be zapped - * @address: starting address of pages to zap + * zap_special_vma_range - zap all page table entries in a special vma range + * @vma: the vma covering the range to zap + * @address: starting address of 
the range to zap * @size: number of bytes to zap * - * This function only unmaps ptes assigned to VM_PFNMAP vmas. - * - * The entire address range must be fully contained within the vma. - * + * This function does nothing when the provided address range is not fully + * contained in @vma, or when the @vma is not VM_PFNMAP. */ -void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, +void zap_special_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size) { if (!range_in_vma(vma, address, address + size) || @@ -2252,7 +2250,7 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, zap_vma_range(vma, address, size); } -EXPORT_SYMBOL_GPL(zap_vma_ptes); +EXPORT_SYMBOL_GPL(zap_special_vma_range); static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr) { From cf2124a90c365cbe2cbeea006b4273374f8d1ecc Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 27 Feb 2026 21:08:47 +0100 Subject: [PATCH 154/369] mm/memory: support VM_MIXEDMAP in zap_special_vma_range() There is demand for also zapping page table entries by drivers in VM_MIXEDMAP VMAs[1]. Nothing really speaks against supporting VM_MIXEDMAP for driver use. We just don't want arbitrary drivers to zap in ordinary (non-special) VMAs. Link: https://lkml.kernel.org/r/20260227200848.114019-17-david@kernel.org Link: https://lore.kernel.org/r/aYSKyr7StGpGKNqW@google.com [1] Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Alexander Gordeev Cc: Alexei Starovoitov Cc: Alice Ryhl Cc: Andrii Nakryiko Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Arve Cc: "Borislav Petkov (AMD)" Cc: Carlos Llamas Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: Daniel Borkman Cc: Dave Airlie Cc: David Ahern Cc: David Rientjes Cc: David S. 
Miller Cc: Dimitri Sivanich Cc: Eric Dumazet Cc: Gerald Schaefer Cc: Greg Kroah-Hartman Cc: Hartley Sweeten Cc: Heiko Carstens Cc: Ian Abbott Cc: Ingo Molnar Cc: Jakub Kacinski Cc: Jani Nikula Cc: Jann Horn Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jonas Lahtinen Cc: Leon Romanovsky Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Namhyung kim Cc: Neal Cardwell Cc: Paolo Abeni Cc: Pedro Falcato Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Tvrtko Ursulin Cc: Vasily Gorbik Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 3dc4664c9af7..b1c062bf5fc1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2239,13 +2239,13 @@ void zap_vma_range(struct vm_area_struct *vma, unsigned long address, * @size: number of bytes to zap * * This function does nothing when the provided address range is not fully - * contained in @vma, or when the @vma is not VM_PFNMAP. + * contained in @vma, or when the @vma is not VM_PFNMAP or VM_MIXEDMAP. */ void zap_special_vma_range(struct vm_area_struct *vma, unsigned long address, unsigned long size) { if (!range_in_vma(vma, address, address + size) || - !(vma->vm_flags & VM_PFNMAP)) + !(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) return; zap_vma_range(vma, address, size); From 5a970006786a3b10577e762a9a6c0b9353b4e8a4 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 6 Mar 2026 14:43:37 +0800 Subject: [PATCH 155/369] mm: use inline helper functions instead of ugly macros Patch series "support batched checking of the young flag for MGLRU", v3. This is a follow-up to the previous work [1], to support batched checking of the young flag for MGLRU. 
Similarly, batched checking of young flag for large folios can improve performance during large-folio reclamation when MGLRU is enabled. I observed noticeable performance improvements (see patch 5) on an Arm64 machine that supports contiguous PTEs. All mm-selftests are passed. Patch 1 - 3: cleanup patches. Patch 4: add a new generic batched PTE helper: test_and_clear_young_ptes(). Patch 5: support batched young flag checking for MGLRU. Patch 6: implement the Arm64 arch-specific test_and_clear_young_ptes(). This patch (of 6): People have already complained that these *_clear_young_notify() related macros are very ugly, so let's use inline helpers to make them more readable. In addition, we cannot implement these inline helper functions in the mmu_notifier.h file, because some arch-specific files will include the mmu_notifier.h, which introduces header compilation dependencies and causes build errors (e.g., arch/arm64/include/asm/tlbflush.h). Moreover, since these functions are only used in the mm, implementing these inline helpers in the mm/internal.h header seems reasonable. 
Link: https://lkml.kernel.org/r/cover.1772778858.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/ea14af84e7967ccebb25082c28a8669d6da8fe57.1772778858.git.baolin.wang@linux.alibaba.com Link: https://lore.kernel.org/all/cover.1770645603.git.baolin.wang@linux.alibaba.com/ [1] Signed-off-by: Baolin Wang Reviewed-by: Rik van Riel Reviewed-by: Barry Song Acked-by: David Hildenbrand (Arm) Cc: Axel Rasmussen Cc: Catalin Marinas Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Qi Zheng Cc: Ryan Roberts Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: Yuanchu Xie Cc: Alistair Popple Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 54 ------------------------------------ mm/internal.h | 52 ++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 54 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 8450e18a87c2..3705d350c863 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -516,55 +516,6 @@ static inline void mmu_notifier_range_init_owner( range->owner = owner; } -#define clear_flush_young_ptes_notify(__vma, __address, __ptep, __nr) \ -({ \ - int __young; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - unsigned int ___nr = __nr; \ - __young = clear_flush_young_ptes(___vma, ___address, __ptep, ___nr); \ - __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ - ___address, \ - ___address + \ - ___nr * PAGE_SIZE); \ - __young; \ -}) - -#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \ -({ \ - int __young; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \ - __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ - ___address, \ - ___address + \ - 
PMD_SIZE); \ - __young; \ -}) - -#define ptep_clear_young_notify(__vma, __address, __ptep) \ -({ \ - int __young; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\ - __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ - ___address + PAGE_SIZE); \ - __young; \ -}) - -#define pmdp_clear_young_notify(__vma, __address, __pmdp) \ -({ \ - int __young; \ - struct vm_area_struct *___vma = __vma; \ - unsigned long ___address = __address; \ - __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\ - __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ - ___address + PMD_SIZE); \ - __young; \ -}) - #else /* CONFIG_MMU_NOTIFIER */ struct mmu_notifier_range { @@ -652,11 +603,6 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) #define mmu_notifier_range_update_to_read_only(r) false -#define clear_flush_young_ptes_notify clear_flush_young_ptes -#define pmdp_clear_flush_young_notify pmdp_clear_flush_young -#define ptep_clear_young_notify ptep_test_and_clear_young -#define pmdp_clear_young_notify pmdp_test_and_clear_young - static inline void mmu_notifier_synchronize(void) { } diff --git a/mm/internal.h b/mm/internal.h index 6e1162e13289..321b8019de9f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -1796,4 +1797,55 @@ static inline int io_remap_pfn_range_complete(struct vm_area_struct *vma, return remap_pfn_range_complete(vma, addr, pfn, size, prot); } +#ifdef CONFIG_MMU_NOTIFIER +static inline int clear_flush_young_ptes_notify(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) +{ + int young; + + young = clear_flush_young_ptes(vma, addr, ptep, nr); + young |= mmu_notifier_clear_flush_young(vma->vm_mm, addr, + addr + nr * PAGE_SIZE); + return young; +} + +static inline int 
pmdp_clear_flush_young_notify(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + int young; + + young = pmdp_clear_flush_young(vma, addr, pmdp); + young |= mmu_notifier_clear_flush_young(vma->vm_mm, addr, addr + PMD_SIZE); + return young; +} + +static inline int ptep_clear_young_notify(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + int young; + + young = ptep_test_and_clear_young(vma, addr, ptep); + young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE); + return young; +} + +static inline int pmdp_clear_young_notify(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + int young; + + young = pmdp_test_and_clear_young(vma, addr, pmdp); + young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PMD_SIZE); + return young; +} + +#else /* CONFIG_MMU_NOTIFIER */ + +#define clear_flush_young_ptes_notify clear_flush_young_ptes +#define pmdp_clear_flush_young_notify pmdp_clear_flush_young +#define ptep_clear_young_notify ptep_test_and_clear_young +#define pmdp_clear_young_notify pmdp_test_and_clear_young + +#endif /* CONFIG_MMU_NOTIFIER */ + #endif /* __MM_INTERNAL_H */ From 37fb436ff673f33e1ecd2d6f771fd789ed2a6092 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 6 Mar 2026 14:43:38 +0800 Subject: [PATCH 156/369] mm: rename ptep/pmdp_clear_young_notify() to ptep/pmdp_test_and_clear_young_notify() Rename ptep/pmdp_clear_young_notify() to ptep/pmdp_test_and_clear_young_notify() to make the function names consistent. 
Link: https://lkml.kernel.org/r/b3454077ce88745e6f88386b1763721746884565.1772778858.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand (Arm) Suggested-by: David Hildenbrand (Arm) Cc: Alistair Popple Cc: Axel Rasmussen Cc: Barry Song Cc: Catalin Marinas Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Qi Zheng Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/internal.h | 8 ++++---- mm/vmscan.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 321b8019de9f..1b718fdb074e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1819,7 +1819,7 @@ static inline int pmdp_clear_flush_young_notify(struct vm_area_struct *vma, return young; } -static inline int ptep_clear_young_notify(struct vm_area_struct *vma, +static inline int ptep_test_and_clear_young_notify(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { int young; @@ -1829,7 +1829,7 @@ static inline int ptep_clear_young_notify(struct vm_area_struct *vma, return young; } -static inline int pmdp_clear_young_notify(struct vm_area_struct *vma, +static inline int pmdp_test_and_clear_young_notify(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { int young; @@ -1843,8 +1843,8 @@ static inline int pmdp_clear_young_notify(struct vm_area_struct *vma, #define clear_flush_young_ptes_notify clear_flush_young_ptes #define pmdp_clear_flush_young_notify pmdp_clear_flush_young -#define ptep_clear_young_notify ptep_test_and_clear_young -#define pmdp_clear_young_notify pmdp_test_and_clear_young +#define ptep_test_and_clear_young_notify ptep_test_and_clear_young +#define pmdp_test_and_clear_young_notify pmdp_test_and_clear_young #endif /* CONFIG_MMU_NOTIFIER */ diff --git a/mm/vmscan.c 
b/mm/vmscan.c index 4ab461f8c65a..7ab9e1cdccd2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3533,7 +3533,7 @@ restart: if (!folio) continue; - if (!ptep_clear_young_notify(args->vma, addr, pte + i)) + if (!ptep_test_and_clear_young_notify(args->vma, addr, pte + i)) continue; if (last != folio) { @@ -3624,7 +3624,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area if (!folio) goto next; - if (!pmdp_clear_young_notify(vma, addr, pmd + i)) + if (!pmdp_test_and_clear_young_notify(vma, addr, pmd + i)) goto next; if (last != folio) { @@ -4214,7 +4214,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) lockdep_assert_held(pvmw->ptl); VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); - if (!ptep_clear_young_notify(vma, addr, pte)) + if (!ptep_test_and_clear_young_notify(vma, addr, pte)) return false; if (spin_is_contended(pvmw->ptl)) @@ -4260,7 +4260,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) if (!folio) continue; - if (!ptep_clear_young_notify(vma, addr, pte + i)) + if (!ptep_test_and_clear_young_notify(vma, addr, pte + i)) continue; if (last != folio) { From 83ec1286b173e6ff54d0dd0291fbc517d65a8d5b Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 6 Mar 2026 14:43:39 +0800 Subject: [PATCH 157/369] mm: rmap: add a ZONE_DEVICE folio warning in folio_referenced() The folio_referenced() is used to test whether a folio was referenced during reclaim. Moreover, ZONE_DEVICE folios are controlled by their device driver, have a lifetime tied to that driver, and are never placed on the LRU list. That means we should never try to reclaim ZONE_DEVICE folios, so add a warning to catch this unexpected behavior in folio_referenced() to avoid confusion, as discussed in the previous thread[1]. 
[1] https://lore.kernel.org/all/16fb7985-ec0f-4b56-91e7-404c5114f899@kernel.org/ Link: https://lkml.kernel.org/r/64d6fb2a33f7101e1d4aca2c9052e0758b76d492.1772778858.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Alistair Popple Acked-by: David Hildenbrand (Arm) Cc: Axel Rasmussen Cc: Barry Song Cc: Catalin Marinas Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Qi Zheng Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/rmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/rmap.c b/mm/rmap.c index 5fd22ba59d35..cd48f34f11b5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1072,6 +1072,7 @@ int folio_referenced(struct folio *folio, int is_locked, .invalid_vma = invalid_folio_referenced_vma, }; + VM_WARN_ON_ONCE_FOLIO(folio_is_zone_device(folio), folio); *vm_flags = 0; if (!pra.mapcount) return 0; From 6d7237dda44f24bb0dec5dbd2a0ed6be77bf6ef6 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 6 Mar 2026 14:43:40 +0800 Subject: [PATCH 158/369] mm: add a batched helper to clear the young flag for large folios Currently, MGLRU will call ptep_test_and_clear_young_notify() to check and clear the young flag for each PTE sequentially, which is inefficient for large folios reclamation. Moreover, on Arm64 architecture, which supports contiguous PTEs, the Arm64- specific ptep_test_and_clear_young() already implements an optimization to clear the young flags for PTEs within a contiguous range. However, this is not sufficient. Similar to the Arm64 specific clear_flush_young_ptes(), we can extend this to perform batched operations for the entire large folio (which might exceed the contiguous range: CONT_PTE_SIZE). 
Thus, we can introduce a new batched helper: test_and_clear_young_ptes() and its wrapper test_and_clear_young_ptes_notify() which are consistent with the existing functions, to perform batched checking of the young flags for large folios, which can help improve performance during large folio reclamation when MGLRU is enabled. And it will be overridden by the architecture that implements a more efficient batch operation in the following patches. Link: https://lkml.kernel.org/r/23ec671bfcc06cd24ee0fbff8e329402742274a0.1772778858.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Andrew Morton Cc: Alistair Popple Cc: Axel Rasmussen Cc: Barry Song Cc: Catalin Marinas Cc: David Hildenbrand (Arm) Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Qi Zheng Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 37 +++++++++++++++++++++++++++++++++++++ mm/internal.h | 16 +++++++++++----- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index d2767a4c027b..17d961c612fc 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1103,6 +1103,43 @@ static inline int clear_flush_young_ptes(struct vm_area_struct *vma, } #endif +#ifndef test_and_clear_young_ptes +/** + * test_and_clear_young_ptes - Mark PTEs that map consecutive pages of the same + * folio as old + * @vma: The virtual memory area the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear access bit. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_test_and_clear_young(). 
+ * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + * + * Returns: whether any PTE was young. + */ +static inline int test_and_clear_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) +{ + int young = 0; + + for (;;) { + young |= ptep_test_and_clear_young(vma, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } + + return young; +} +#endif + /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. It brings diff --git a/mm/internal.h b/mm/internal.h index 1b718fdb074e..1357dc04f065 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1819,13 +1819,13 @@ static inline int pmdp_clear_flush_young_notify(struct vm_area_struct *vma, return young; } -static inline int ptep_test_and_clear_young_notify(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline int test_and_clear_young_ptes_notify(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) { int young; - young = ptep_test_and_clear_young(vma, addr, ptep); - young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE); + young = test_and_clear_young_ptes(vma, addr, ptep, nr); + young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + nr * PAGE_SIZE); return young; } @@ -1843,9 +1843,15 @@ static inline int pmdp_test_and_clear_young_notify(struct vm_area_struct *vma, #define clear_flush_young_ptes_notify clear_flush_young_ptes #define pmdp_clear_flush_young_notify pmdp_clear_flush_young -#define ptep_test_and_clear_young_notify ptep_test_and_clear_young +#define test_and_clear_young_ptes_notify test_and_clear_young_ptes #define pmdp_test_and_clear_young_notify pmdp_test_and_clear_young 
#endif /* CONFIG_MMU_NOTIFIER */ +static inline int ptep_test_and_clear_young_notify(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + return test_and_clear_young_ptes_notify(vma, addr, ptep, 1); +} + #endif /* __MM_INTERNAL_H */ From 56e5b60b2114dee967c971f08dd29ef193bd3a2d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 6 Mar 2026 14:43:41 +0800 Subject: [PATCH 159/369] mm: support batched checking of the young flag for MGLRU Use the batched helper test_and_clear_young_ptes_notify() to check and clear the young flag to improve the performance during large folio reclamation when MGLRU is enabled. Meanwhile, we can also support batched checking the young and dirty flag when MGLRU walks the mm's pagetable to update the folios' generation counter. Since MGLRU also checks the PTE dirty bit, use folio_pte_batch_flags() with FPB_MERGE_YOUNG_DIRTY set to detect batches of PTEs for a large folio. Then we can remove the ptep_test_and_clear_young_notify() since it has no users now. Note that we also update the 'young' counter and 'mm_stats[MM_LEAF_YOUNG]' counter with the batched count in the lru_gen_look_around() and walk_pte_range(). However, the batched operations may inflate these two counters, because in a large folio not all PTEs may have been accessed. (Additionally, tracking how many PTEs have been accessed within a large folio is not very meaningful, since the mm core actually tracks access/dirty on a per-folio basis, not per page). The impact analysis is as follows: 1. The 'mm_stats[MM_LEAF_YOUNG]' counter has no functional impact and is mainly for debugging. 2. The 'young' counter is used to decide whether to place the current PMD entry into the bloom filters by suitable_to_scan() (so that next time we can check whether it has been accessed again), which may set the hash bit in the bloom filters for a PMD entry that hasn't seen much access. However, bloom filters inherently allow some error, so this effect appears negligible. 
Link: https://lkml.kernel.org/r/378f4acf7d07410aa7c2e4b49d56bb165918eb34.1772778858.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Rik van Riel Acked-by: David Hildenbrand (Arm) Cc: Alistair Popple Cc: Axel Rasmussen Cc: Barry Song Cc: Catalin Marinas Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Qi Zheng Cc: Ryan Roberts Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 5 +++-- mm/internal.h | 6 ------ mm/rmap.c | 28 +++++++++++++-------------- mm/vmscan.c | 43 +++++++++++++++++++++++++++++++----------- 4 files changed, 49 insertions(+), 33 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5c3ae0348754..3f651baf7e2b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -684,7 +684,7 @@ struct lru_gen_memcg { void lru_gen_init_pgdat(struct pglist_data *pgdat); void lru_gen_init_lruvec(struct lruvec *lruvec); -bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw); +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr); void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); @@ -703,7 +703,8 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } -static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, + unsigned int nr) { return false; } diff --git a/mm/internal.h b/mm/internal.h index 1357dc04f065..4ab833b8bcdf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1848,10 +1848,4 @@ static inline int pmdp_test_and_clear_young_notify(struct vm_area_struct *vma, #endif /* CONFIG_MMU_NOTIFIER */ -static inline int ptep_test_and_clear_young_notify(struct vm_area_struct *vma, - unsigned long addr, pte_t 
*ptep) -{ - return test_and_clear_young_ptes_notify(vma, addr, ptep, 1); -} - #endif /* __MM_INTERNAL_H */ diff --git a/mm/rmap.c b/mm/rmap.c index cd48f34f11b5..abe4712a220c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -965,25 +965,20 @@ static bool folio_referenced_one(struct folio *folio, return false; } + if (pvmw.pte && folio_test_large(folio)) { + const unsigned long end_addr = pmd_addr_end(address, vma->vm_end); + const unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT; + pte_t pteval = ptep_get(pvmw.pte); + + nr = folio_pte_batch(folio, pvmw.pte, pteval, max_nr); + } + if (lru_gen_enabled() && pvmw.pte) { - if (lru_gen_look_around(&pvmw)) + if (lru_gen_look_around(&pvmw, nr)) referenced++; } else if (pvmw.pte) { - if (folio_test_large(folio)) { - unsigned long end_addr = pmd_addr_end(address, vma->vm_end); - unsigned int max_nr = (end_addr - address) >> PAGE_SHIFT; - pte_t pteval = ptep_get(pvmw.pte); - - nr = folio_pte_batch(folio, pvmw.pte, - pteval, max_nr); - } - - ptes += nr; if (clear_flush_young_ptes_notify(vma, address, pvmw.pte, nr)) referenced++; - /* Skip the batched PTEs */ - pvmw.pte += nr - 1; - pvmw.address += (nr - 1) * PAGE_SIZE; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (pmdp_clear_flush_young_notify(vma, address, pvmw.pmd)) @@ -993,6 +988,7 @@ static bool folio_referenced_one(struct folio *folio, WARN_ON_ONCE(1); } + ptes += nr; pra->mapcount -= nr; /* * If we are sure that we batched the entire folio, @@ -1002,6 +998,10 @@ static bool folio_referenced_one(struct folio *folio, page_vma_mapped_walk_done(&pvmw); break; } + + /* Skip the batched PTEs */ + pvmw.pte += nr - 1; + pvmw.address += (nr - 1) * PAGE_SIZE; } if (referenced) diff --git a/mm/vmscan.c b/mm/vmscan.c index 7ab9e1cdccd2..3a4a0a81c871 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3499,6 +3499,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 
DEFINE_MAX_SEQ(walk->lruvec); int gen = lru_gen_from_seq(max_seq); + unsigned int nr; pmd_t pmdval; pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl); @@ -3517,11 +3518,13 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, lazy_mmu_mode_enable(); restart: - for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { + for (i = pte_index(start), addr = start; addr != end; i += nr, addr += nr * PAGE_SIZE) { unsigned long pfn; struct folio *folio; - pte_t ptent = ptep_get(pte + i); + pte_t *cur_pte = pte + i; + pte_t ptent = ptep_get(cur_pte); + nr = 1; total++; walk->mm_stats[MM_LEAF_TOTAL]++; @@ -3533,7 +3536,16 @@ restart: if (!folio) continue; - if (!ptep_test_and_clear_young_notify(args->vma, addr, pte + i)) + if (folio_test_large(folio)) { + const unsigned int max_nr = (end - addr) >> PAGE_SHIFT; + + nr = folio_pte_batch_flags(folio, NULL, cur_pte, &ptent, + max_nr, FPB_MERGE_YOUNG_DIRTY); + total += nr - 1; + walk->mm_stats[MM_LEAF_TOTAL] += nr - 1; + } + + if (!test_and_clear_young_ptes_notify(args->vma, addr, cur_pte, nr)) continue; if (last != folio) { @@ -3546,8 +3558,8 @@ restart: if (pte_dirty(ptent)) dirty = true; - young++; - walk->mm_stats[MM_LEAF_YOUNG]++; + young += nr; + walk->mm_stats[MM_LEAF_YOUNG] += nr; } walk_update_folio(walk, last, gen, dirty); @@ -4191,7 +4203,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) * the PTE table to the Bloom filter. This forms a feedback loop between the * eviction and the aging. 
*/ -bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr) { int i; bool dirty; @@ -4214,7 +4226,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) lockdep_assert_held(pvmw->ptl); VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); - if (!ptep_test_and_clear_young_notify(vma, addr, pte)) + if (!test_and_clear_young_ptes_notify(vma, addr, pte, nr)) return false; if (spin_is_contended(pvmw->ptl)) @@ -4248,10 +4260,12 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) pte -= (addr - start) / PAGE_SIZE; - for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { + for (i = 0, addr = start; addr != end; + i += nr, pte += nr, addr += nr * PAGE_SIZE) { unsigned long pfn; - pte_t ptent = ptep_get(pte + i); + pte_t ptent = ptep_get(pte); + nr = 1; pfn = get_pte_pfn(ptent, vma, addr, pgdat); if (pfn == -1) continue; @@ -4260,7 +4274,14 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) if (!folio) continue; - if (!ptep_test_and_clear_young_notify(vma, addr, pte + i)) + if (folio_test_large(folio)) { + const unsigned int max_nr = (end - addr) >> PAGE_SHIFT; + + nr = folio_pte_batch_flags(folio, NULL, pte, &ptent, + max_nr, FPB_MERGE_YOUNG_DIRTY); + } + + if (!test_and_clear_young_ptes_notify(vma, addr, pte, nr)) continue; if (last != folio) { @@ -4273,7 +4294,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) if (pte_dirty(ptent)) dirty = true; - young++; + young += nr; } walk_update_folio(walk, last, gen, dirty); From 9970a9a27ffca8b45c4a242f90adeb979fcaafb0 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 6 Mar 2026 14:43:42 +0800 Subject: [PATCH 160/369] arm64: mm: implement the architecture-specific test_and_clear_young_ptes() Implement the Arm64 architecture-specific test_and_clear_young_ptes() to enable batched checking of young flags, improving performance during large folio reclamation when MGLRU is enabled. 
While we're at it, simplify ptep_test_and_clear_young() by calling test_and_clear_young_ptes(). Since callers guarantee that PTEs are present before calling these functions, we can use pte_cont() to check the CONT_PTE flag instead of pte_valid_cont(). Performance testing: Enable MGLRU, then allocate 10G clean file-backed folios by mmap() in a memory cgroup, and try to reclaim 8G file-backed folios via the memory.reclaim interface. I can observe 60%+ performance improvement on my Arm64 32-core server (and about 15% improvement on my X86 machine). W/o patchset: real 0m0.470s user 0m0.000s sys 0m0.470s W/ patchset: real 0m0.180s user 0m0.001s sys 0m0.179s Link: https://lkml.kernel.org/r/7f891d42a720cc2e57862f3b79e4f774404f313c.1772778858.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Rik van Riel Reviewed-by: David Hildenbrand (Arm) Cc: Alistair Popple Cc: Axel Rasmussen Cc: Barry Song Cc: Catalin Marinas Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Qi Zheng Cc: Ryan Roberts Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index aa4b13da6371..ab451d20e4c5 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1812,16 +1812,22 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, return __ptep_get_and_clear(mm, addr, ptep); } +#define test_and_clear_young_ptes test_and_clear_young_ptes +static inline int test_and_clear_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + unsigned int nr) +{ + if (likely(nr == 1 && !pte_cont(__ptep_get(ptep)))) + return __ptep_test_and_clear_young(vma, addr, 
ptep); + + return contpte_test_and_clear_young_ptes(vma, addr, ptep, nr); +} + #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - pte_t orig_pte = __ptep_get(ptep); - - if (likely(!pte_valid_cont(orig_pte))) - return __ptep_test_and_clear_young(vma, addr, ptep); - - return contpte_test_and_clear_young_ptes(vma, addr, ptep, 1); + return test_and_clear_young_ptes(vma, addr, ptep, 1); } #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH From 9f2541d9b2fc1ee86415b8d41f6a19cb2a582aac Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 2 Mar 2026 14:50:14 -0500 Subject: [PATCH 161/369] mm: memcg: factor out trylock_stock() and unlock_stock() Patch series "memcg: obj stock and slab stat caching cleanups". This is a follow-up to `[PATCH] memcg: fix slab accounting in refill_obj_stock() trylock path`. The way the slab stat cache and the objcg charge cache interact appears a bit too fragile. This series factors those paths apart as much as practical. This patch (of 5): Consolidate the local lock acquisition and the local stock lookup. This allows subsequent patches to use !!stock as an easy way to disambiguate the locked vs. contended cases through the callstack. 
Link: https://lkml.kernel.org/r/20260302195305.620713-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20260302195305.620713-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Vlastimil Babka (SUSE) Reviewed-by: Hao Li Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 87614cfc4a3e..5262533d0828 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2950,6 +2950,19 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) obj_cgroup_put(objcg); } +static struct obj_stock_pcp *trylock_stock(void) +{ + if (local_trylock(&obj_stock.lock)) + return this_cpu_ptr(&obj_stock); + + return NULL; +} + +static void unlock_stock(struct obj_stock_pcp *stock) +{ + local_unlock(&obj_stock.lock); +} + static void __account_obj_stock(struct obj_cgroup *objcg, struct obj_stock_pcp *stock, int nr, struct pglist_data *pgdat, enum node_stat_item idx) @@ -3005,10 +3018,10 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, struct obj_stock_pcp *stock; bool ret = false; - if (!local_trylock(&obj_stock.lock)) + stock = trylock_stock(); + if (!stock) return ret; - stock = this_cpu_ptr(&obj_stock); if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { stock->nr_bytes -= nr_bytes; ret = true; @@ -3017,7 +3030,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, __account_obj_stock(objcg, stock, nr_bytes, pgdat, idx); } - local_unlock(&obj_stock.lock); + unlock_stock(stock); return ret; } @@ -3108,7 +3121,8 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, struct obj_stock_pcp *stock; unsigned int nr_pages = 0; - if (!local_trylock(&obj_stock.lock)) { + stock = trylock_stock(); + if (!stock) { if (pgdat) 
mod_objcg_mlstate(objcg, pgdat, idx, nr_acct); nr_pages = nr_bytes >> PAGE_SHIFT; @@ -3117,7 +3131,6 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, goto out; } - stock = this_cpu_ptr(&obj_stock); if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ drain_obj_stock(stock); obj_cgroup_get(objcg); @@ -3137,7 +3150,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, stock->nr_bytes &= (PAGE_SIZE - 1); } - local_unlock(&obj_stock.lock); + unlock_stock(stock); out: if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); From 9d181e47098d6911f4a5b3d9feff60c2bf6786a2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 2 Mar 2026 14:50:15 -0500 Subject: [PATCH 162/369] mm: memcg: simplify objcg charge size and stock remainder math Use PAGE_ALIGN() and a more natural cache remainder calculation. Link: https://lkml.kernel.org/r/20260302195305.620713-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Vlastimil Babka (SUSE) Reviewed-by: Hao Li Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5262533d0828..5dd61e35f50d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3159,7 +3159,7 @@ out: static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t size, struct pglist_data *pgdat, enum node_stat_item idx) { - unsigned int nr_pages, nr_bytes; + size_t charge_size, remainder; int ret; if (likely(consume_obj_stock(objcg, size, pgdat, idx))) @@ -3188,16 +3188,12 @@ static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data * race. 
*/ - nr_pages = size >> PAGE_SHIFT; - nr_bytes = size & (PAGE_SIZE - 1); + charge_size = PAGE_ALIGN(size); + remainder = charge_size - size; - if (nr_bytes) - nr_pages += 1; - - ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); - if (!ret && (nr_bytes || pgdat)) - refill_obj_stock(objcg, nr_bytes ? PAGE_SIZE - nr_bytes : 0, - false, size, pgdat, idx); + ret = obj_cgroup_charge_pages(objcg, gfp, charge_size >> PAGE_SHIFT); + if (!ret && (remainder || pgdat)) + refill_obj_stock(objcg, remainder, false, size, pgdat, idx); return ret; } From edb6abd31bff0746c687cf5af9ecacefaa90e9cb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 2 Mar 2026 14:50:16 -0500 Subject: [PATCH 163/369] mm: memcontrol: split out __obj_cgroup_charge() Move the page charge and remainder calculation into its own function. It will make the slab stat refactor easier to follow. Link: https://lkml.kernel.org/r/20260302195305.620713-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Vlastimil Babka (SUSE) Reviewed-by: Hao Li Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5dd61e35f50d..c01da86e6a2e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3156,10 +3156,24 @@ out: obj_cgroup_uncharge_pages(objcg, nr_pages); } +static int __obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, + size_t size, size_t *remainder) +{ + size_t charge_size; + int ret; + + charge_size = PAGE_ALIGN(size); + ret = obj_cgroup_charge_pages(objcg, gfp, charge_size >> PAGE_SHIFT); + if (!ret) + *remainder = charge_size - size; + + return ret; +} + static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t size, struct pglist_data *pgdat, enum node_stat_item idx) { - size_t charge_size, remainder; + size_t remainder; int ret; if 
(likely(consume_obj_stock(objcg, size, pgdat, idx))) @@ -3188,10 +3202,7 @@ static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data * race. */ - charge_size = PAGE_ALIGN(size); - remainder = charge_size - size; - - ret = obj_cgroup_charge_pages(objcg, gfp, charge_size >> PAGE_SHIFT); + ret = __obj_cgroup_charge(objcg, gfp, size, &remainder); if (!ret && (remainder || pgdat)) refill_obj_stock(objcg, remainder, false, size, pgdat, idx); From 4665aa7e6523c418d6178ef7e23a4159c72b9d3a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 2 Mar 2026 14:50:17 -0500 Subject: [PATCH 164/369] mm: memcontrol: use __account_obj_stock() in the !locked path Make __account_obj_stock() usable for the case where the local trylock failed. Then switch refill_obj_stock() over to it. This consolidates the mod_objcg_mlstate() call into one place and will make the next patch easier to follow. Link: https://lkml.kernel.org/r/20260302195305.620713-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Vlastimil Babka (SUSE) Reviewed-by: Hao Li Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c01da86e6a2e..15f552a85a52 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2969,6 +2969,9 @@ static void __account_obj_stock(struct obj_cgroup *objcg, { int *bytes; + if (!stock) + goto direct; + /* * Save vmstat data in stock and skip vmstat array update unless * accumulating over a page of vmstat data or when pgdat changes. 
@@ -3008,6 +3011,7 @@ static void __account_obj_stock(struct obj_cgroup *objcg, nr = 0; } } +direct: if (nr) mod_objcg_mlstate(objcg, pgdat, idx, nr); } @@ -3124,7 +3128,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, stock = trylock_stock(); if (!stock) { if (pgdat) - mod_objcg_mlstate(objcg, pgdat, idx, nr_acct); + __account_obj_stock(objcg, NULL, nr_acct, pgdat, idx); nr_pages = nr_bytes >> PAGE_SHIFT; nr_bytes = nr_bytes & (PAGE_SIZE - 1); atomic_add(nr_bytes, &objcg->nr_charged_bytes); From 52af721b9421436a5ee8c301043849f804e3e807 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 2 Mar 2026 14:50:18 -0500 Subject: [PATCH 165/369] mm: memcg: separate slab stat accounting from objcg charge cache Cgroup slab metrics are cached per-cpu the same way as the sub-page charge cache. However, the intertwined code to manage those dependent caches right now is quite difficult to follow. Specifically, cached slab stat updates occur in consume() if there was enough charge cache to satisfy the new object. If that fails, whole pages are reserved, and slab stats are updated when the remainder of those pages, after subtracting the size of the new slab object, are put into the charge cache. This already juggles a delicate mix of the object size, the page charge size, and the remainder to put into the byte cache. Doing slab accounting in this path as well is fragile, and has recently caused a bug where the input parameters between the two caches were mixed up. Refactor the consume() and refill() paths into unlocked and locked variants that only do charge caching. Then let the slab path manage its own lock section and open-code charging and accounting. This makes the slab stat cache subordinate to the charge cache: __refill_obj_stock() is called first to prepare it; __account_obj_stock() follows to hitch a ride. 
This results in a minor behavioral change: previously, a mismatching percpu stock would always be drained for the purpose of setting up slab account caching, even if there was no byte remainder to put into the charge cache. Now, the stock is left alone, and slab accounting takes the uncached path if there is a mismatch. This is exceedingly rare, and it was probably never worth draining the whole stock just to cache the slab stat update. Link: https://lkml.kernel.org/r/20260302195305.620713-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Hao Li Acked-by: Vlastimil Babka (SUSE) Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 100 +++++++++++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 39 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 15f552a85a52..47bf034d4b93 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2960,16 +2960,18 @@ static struct obj_stock_pcp *trylock_stock(void) static void unlock_stock(struct obj_stock_pcp *stock) { - local_unlock(&obj_stock.lock); + if (stock) + local_unlock(&obj_stock.lock); } +/* Call after __refill_obj_stock() to ensure stock->cached_objg == objcg */ static void __account_obj_stock(struct obj_cgroup *objcg, struct obj_stock_pcp *stock, int nr, struct pglist_data *pgdat, enum node_stat_item idx) { int *bytes; - if (!stock) + if (!stock || READ_ONCE(stock->cached_objcg) != objcg) goto direct; /* @@ -3016,8 +3018,20 @@ direct: mod_objcg_mlstate(objcg, pgdat, idx, nr); } -static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, - struct pglist_data *pgdat, enum node_stat_item idx) +static bool __consume_obj_stock(struct obj_cgroup *objcg, + struct obj_stock_pcp *stock, + unsigned int nr_bytes) +{ + if (objcg == READ_ONCE(stock->cached_objcg) && + stock->nr_bytes >= nr_bytes) { + stock->nr_bytes -= nr_bytes; + return true; + } + + return false; 
+} + +static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) { struct obj_stock_pcp *stock; bool ret = false; @@ -3026,14 +3040,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, if (!stock) return ret; - if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { - stock->nr_bytes -= nr_bytes; - ret = true; - - if (pgdat) - __account_obj_stock(objcg, stock, nr_bytes, pgdat, idx); - } - + ret = __consume_obj_stock(objcg, stock, nr_bytes); unlock_stock(stock); return ret; @@ -3118,17 +3125,14 @@ static bool obj_stock_flush_required(struct obj_stock_pcp *stock, return flush; } -static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, - bool allow_uncharge, int nr_acct, struct pglist_data *pgdat, - enum node_stat_item idx) +static void __refill_obj_stock(struct obj_cgroup *objcg, + struct obj_stock_pcp *stock, + unsigned int nr_bytes, + bool allow_uncharge) { - struct obj_stock_pcp *stock; unsigned int nr_pages = 0; - stock = trylock_stock(); if (!stock) { - if (pgdat) - __account_obj_stock(objcg, NULL, nr_acct, pgdat, idx); nr_pages = nr_bytes >> PAGE_SHIFT; nr_bytes = nr_bytes & (PAGE_SIZE - 1); atomic_add(nr_bytes, &objcg->nr_charged_bytes); @@ -3146,20 +3150,25 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, } stock->nr_bytes += nr_bytes; - if (pgdat) - __account_obj_stock(objcg, stock, nr_acct, pgdat, idx); - if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) { nr_pages = stock->nr_bytes >> PAGE_SHIFT; stock->nr_bytes &= (PAGE_SIZE - 1); } - unlock_stock(stock); out: if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); } +static void refill_obj_stock(struct obj_cgroup *objcg, + unsigned int nr_bytes, + bool allow_uncharge) +{ + struct obj_stock_pcp *stock = trylock_stock(); + __refill_obj_stock(objcg, stock, nr_bytes, allow_uncharge); + unlock_stock(stock); +} + static int __obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, 
size_t size, size_t *remainder) { @@ -3174,13 +3183,12 @@ static int __obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, return ret; } -static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t size, - struct pglist_data *pgdat, enum node_stat_item idx) +int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) { size_t remainder; int ret; - if (likely(consume_obj_stock(objcg, size, pgdat, idx))) + if (likely(consume_obj_stock(objcg, size))) return 0; /* @@ -3207,20 +3215,15 @@ static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t * race. */ ret = __obj_cgroup_charge(objcg, gfp, size, &remainder); - if (!ret && (remainder || pgdat)) - refill_obj_stock(objcg, remainder, false, size, pgdat, idx); + if (!ret && remainder) + refill_obj_stock(objcg, remainder, false); return ret; } -int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) -{ - return obj_cgroup_charge_account(objcg, gfp, size, NULL, 0); -} - void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) { - refill_obj_stock(objcg, size, true, 0, NULL, 0); + refill_obj_stock(objcg, size, true); } static inline size_t obj_full_size(struct kmem_cache *s) @@ -3235,6 +3238,7 @@ static inline size_t obj_full_size(struct kmem_cache *s) bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, gfp_t flags, size_t size, void **p) { + size_t obj_size = obj_full_size(s); struct obj_cgroup *objcg; struct slab *slab; unsigned long off; @@ -3275,6 +3279,7 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, for (i = 0; i < size; i++) { unsigned long obj_exts; struct slabobj_ext *obj_ext; + struct obj_stock_pcp *stock; slab = virt_to_slab(p[i]); @@ -3294,9 +3299,20 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, * TODO: we could batch this until slab_pgdat(slab) changes * between iterations, with a more complicated undo */ - if 
(obj_cgroup_charge_account(objcg, flags, obj_full_size(s), - slab_pgdat(slab), cache_vmstat_idx(s))) - return false; + stock = trylock_stock(); + if (!stock || !__consume_obj_stock(objcg, stock, obj_size)) { + size_t remainder; + + unlock_stock(stock); + if (__obj_cgroup_charge(objcg, flags, obj_size, &remainder)) + return false; + stock = trylock_stock(); + if (remainder) + __refill_obj_stock(objcg, stock, remainder, false); + } + __account_obj_stock(objcg, stock, obj_size, + slab_pgdat(slab), cache_vmstat_idx(s)); + unlock_stock(stock); obj_exts = slab_obj_exts(slab); get_slab_obj_exts(obj_exts); @@ -3318,6 +3334,7 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, for (int i = 0; i < objects; i++) { struct obj_cgroup *objcg; struct slabobj_ext *obj_ext; + struct obj_stock_pcp *stock; unsigned int off; off = obj_to_index(s, slab, p[i]); @@ -3327,8 +3344,13 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, continue; obj_ext->objcg = NULL; - refill_obj_stock(objcg, obj_size, true, -obj_size, - slab_pgdat(slab), cache_vmstat_idx(s)); + + stock = trylock_stock(); + __refill_obj_stock(objcg, stock, obj_size, true); + __account_obj_stock(objcg, stock, -obj_size, + slab_pgdat(slab), cache_vmstat_idx(s)); + unlock_stock(stock); + obj_cgroup_put(objcg); } } From 417607de1f4e6280f646aa42cad5ed84e9228c01 Mon Sep 17 00:00:00 2001 From: Yuvraj Sakshith Date: Tue, 3 Mar 2026 03:30:28 -0800 Subject: [PATCH 166/369] mm/page_reporting: add PAGE_REPORTING_ORDER_UNSPECIFIED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Allow order zero pages in page reporting", v4. Today, page reporting sets page_reporting_order in two ways: (1) page_reporting.page_reporting_order cmdline parameter (2) Driver can pass order while registering itself. In both cases, order zero is ignored by free page reporting because it is used to set page_reporting_order to a default value, like MAX_PAGE_ORDER. 
In some cases we might want page_reporting_order to be zero. For instance, when virtio-balloon runs inside a guest with tiny memory (say, 16MB), it might not be able to find a order 1 page (or in the worst case order MAX_PAGE_ORDER page) after some uptime. Page reporting should be able to return order zero pages back for optimal memory relinquishment. This patch changes the default fallback value from '0' to '-1' in all possible clients of free page reporting (hv_balloon and virtio-balloon) together with allowing '0' as a valid order in page_reporting_register(). This patch (of 5): Drivers can pass order of pages to be reported while registering itself. Today, this is a magic number, 0. Label this with PAGE_REPORTING_ORDER_UNSPECIFIED and check for it when the driver is being registered. This macro will be used in relevant drivers next. [akpm@linux-foundation.org: tweak whitespace, per David] Link: https://lkml.kernel.org/r/20260303113032.3008371-1-yuvraj.sakshith@oss.qualcomm.com Link: https://lkml.kernel.org/r/20260303113032.3008371-2-yuvraj.sakshith@oss.qualcomm.com Signed-off-by: Yuvraj Sakshith Acked-by: David Hildenbrand (Arm) Reviewed-by: Michael Kelley Acked-by: Michael S. Tsirkin Cc: Brendan Jackman Cc: Dexuan Cui Cc: Eugenio Pérez Cc: Haiyang Zhang Cc: Jason Wang Cc: Johannes Weiner Cc: K. Y. 
Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Liu Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page_reporting.h | 1 + mm/page_reporting.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h index fe648dfa3a7c..d1886c657285 100644 --- a/include/linux/page_reporting.h +++ b/include/linux/page_reporting.h @@ -7,6 +7,7 @@ /* This value should always be a power of 2, see page_reporting_cycle() */ #define PAGE_REPORTING_CAPACITY 32 +#define PAGE_REPORTING_ORDER_UNSPECIFIED 0 struct page_reporting_dev_info { /* function that alters pages to make them "reported" */ diff --git a/mm/page_reporting.c b/mm/page_reporting.c index f0042d5743af..a2da5bf3a065 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -370,7 +370,8 @@ int page_reporting_register(struct page_reporting_dev_info *prdev) */ if (page_reporting_order == -1) { - if (prdev->order > 0 && prdev->order <= MAX_PAGE_ORDER) + if (prdev->order != PAGE_REPORTING_ORDER_UNSPECIFIED && + prdev->order <= MAX_PAGE_ORDER) page_reporting_order = prdev->order; else page_reporting_order = pageblock_order; From f2325daa3a37502156a761cc5c3cc497eb3b0f4c Mon Sep 17 00:00:00 2001 From: Yuvraj Sakshith Date: Tue, 3 Mar 2026 03:30:29 -0800 Subject: [PATCH 167/369] virtio_balloon: set unspecified page reporting order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit virtio_balloon page reporting order is set to MAX_PAGE_ORDER implicitly as vb->prdev.order is never initialised and is auto-set to zero. Explicitly mention usage of default page order by making use of PAGE_REPORTING_ORDER_UNSPECIFIED fallback value. 
Link: https://lkml.kernel.org/r/20260303113032.3008371-3-yuvraj.sakshith@oss.qualcomm.com Signed-off-by: Yuvraj Sakshith Acked-by: David Hildenbrand (Arm) Reviewed-by: Michael Kelley Acked-by: Michael S. Tsirkin Cc: Brendan Jackman Cc: Dexuan Cui Cc: Eugenio Pérez Cc: Haiyang Zhang Cc: Jason Wang Cc: Johannes Weiner Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Liu Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- drivers/virtio/virtio_balloon.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 7f15bf162e88..f6c2dff33f8a 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -1022,6 +1022,8 @@ static int virtballoon_probe(struct virtio_device *vdev) goto out_unregister_oom; } + vb->pr_dev_info.order = PAGE_REPORTING_ORDER_UNSPECIFIED; + /* * The default page reporting order is @pageblock_order, which * corresponds to 512MB in size on ARM64 when 64KB base page From fd4bf4f2875a8d0cf3245da670a097fd0259c183 Mon Sep 17 00:00:00 2001 From: Yuvraj Sakshith Date: Tue, 3 Mar 2026 03:30:30 -0800 Subject: [PATCH 168/369] hv_balloon: set unspecified page reporting order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Explicitly mention page reporting order to be set to default value using PAGE_REPORTING_ORDER_UNSPECIFIED fallback value. Link: https://lkml.kernel.org/r/20260303113032.3008371-4-yuvraj.sakshith@oss.qualcomm.com Signed-off-by: Yuvraj Sakshith Acked-by: David Hildenbrand (Arm) Reviewed-by: Michael Kelley Acked-by: Michael S. Tsirkin Cc: Brendan Jackman Cc: Dexuan Cui Cc: Eugenio Pérez Cc: Haiyang Zhang Cc: Jason Wang Cc: Johannes Weiner Cc: K. Y. 
Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Liu Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- drivers/hv/hv_balloon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index a848400a59a2..9a55f5c43307 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -1663,7 +1663,7 @@ static void enable_page_reporting(void) * We let the page_reporting_order parameter decide the order * in the page_reporting code */ - dm_device.pr_dev_info.order = 0; + dm_device.pr_dev_info.order = PAGE_REPORTING_ORDER_UNSPECIFIED; ret = page_reporting_register(&dm_device.pr_dev_info); if (ret < 0) { dm_device.pr_dev_info.report = NULL; From 5467c292d07ffcd55a7a66af2259855f49e1dd06 Mon Sep 17 00:00:00 2001 From: Yuvraj Sakshith Date: Tue, 3 Mar 2026 03:30:31 -0800 Subject: [PATCH 169/369] mm/page_reporting: change PAGE_REPORTING_ORDER_UNSPECIFIED to -1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PAGE_REPORTING_ORDER_UNSPECIFIED is now set to zero. This means, pages of order zero cannot be reported to a client/driver -- as zero is used to signal a fallback to MAX_PAGE_ORDER. Change PAGE_REPORTING_ORDER_UNSPECIFIED to (-1), so that zero can be used as a valid order with which pages can be reported. Link: https://lkml.kernel.org/r/20260303113032.3008371-5-yuvraj.sakshith@oss.qualcomm.com Signed-off-by: Yuvraj Sakshith Acked-by: David Hildenbrand (Arm) Reviewed-by: Michael Kelley Acked-by: Michael S. Tsirkin Cc: Brendan Jackman Cc: Dexuan Cui Cc: Eugenio Pérez Cc: Haiyang Zhang Cc: Jason Wang Cc: Johannes Weiner Cc: K. Y. 
Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Liu Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page_reporting.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h index d1886c657285..9d4ca5c218a0 100644 --- a/include/linux/page_reporting.h +++ b/include/linux/page_reporting.h @@ -7,7 +7,7 @@ /* This value should always be a power of 2, see page_reporting_cycle() */ #define PAGE_REPORTING_CAPACITY 32 -#define PAGE_REPORTING_ORDER_UNSPECIFIED 0 +#define PAGE_REPORTING_ORDER_UNSPECIFIED -1 struct page_reporting_dev_info { /* function that alters pages to make them "reported" */ From 4a34e46eb5e9c762ba4cea4b3c4d89c8e39b1608 Mon Sep 17 00:00:00 2001 From: Yuvraj Sakshith Date: Tue, 3 Mar 2026 03:30:32 -0800 Subject: [PATCH 170/369] mm/page_reporting: change page_reporting_order to PAGE_REPORTING_ORDER_UNSPECIFIED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit page_reporting_order when uninitialised, holds a magic number -1. Since we now maintain PAGE_REPORTING_ORDER_UNSPECIFIED as -1, which is also a flag, set page_reporting_order to this flag. Link: https://lkml.kernel.org/r/20260303113032.3008371-6-yuvraj.sakshith@oss.qualcomm.com Signed-off-by: Yuvraj Sakshith Acked-by: David Hildenbrand (Arm) Reviewed-by: Michael Kelley Acked-by: Michael S. Tsirkin Cc: Brendan Jackman Cc: Dexuan Cui Cc: Eugenio Pérez Cc: Haiyang Zhang Cc: Jason Wang Cc: Johannes Weiner Cc: K. Y. 
Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Liu Cc: Xuan Zhuo Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_reporting.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_reporting.c b/mm/page_reporting.c index a2da5bf3a065..7418f2e500bb 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -12,7 +12,7 @@ #include "internal.h" /* Initialize to an unsupported value */ -unsigned int page_reporting_order = -1; +unsigned int page_reporting_order = PAGE_REPORTING_ORDER_UNSPECIFIED; static int page_order_update_notify(const char *val, const struct kernel_param *kp) { @@ -369,7 +369,7 @@ int page_reporting_register(struct page_reporting_dev_info *prdev) * pageblock_order. */ - if (page_reporting_order == -1) { + if (page_reporting_order == PAGE_REPORTING_ORDER_UNSPECIFIED) { if (prdev->order != PAGE_REPORTING_ORDER_UNSPECIFIED && prdev->order <= MAX_PAGE_ORDER) page_reporting_order = prdev->order; From 909632714f687560627f3e8c21fb5f5180373afd Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Tue, 3 Mar 2026 15:56:00 +0530 Subject: [PATCH 171/369] mm/oom_kill.c: simpilfy rcu call with guard(rcu) guard(rcu)() simplifies code readability and there is no need of extra goto labels. Thus replacing rcu_read_lock/unlock with guard(rcu)(). 
Link: https://lkml.kernel.org/r/20260303102600.105255-1-maninder1.s@samsung.com Signed-off-by: Maninder Singh Acked-by: Michal Hocko Reviewed-by: Dmitry Ilvokhin Acked-by: Shakeel Butt Cc: David Rientjes Signed-off-by: Andrew Morton --- mm/oom_kill.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 54b7a8fe5136..5f372f6e26fa 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -135,19 +135,16 @@ struct task_struct *find_lock_task_mm(struct task_struct *p) { struct task_struct *t; - rcu_read_lock(); + guard(rcu)(); for_each_thread(p, t) { task_lock(t); if (likely(t->mm)) - goto found; + return t; task_unlock(t); } - t = NULL; -found: - rcu_read_unlock(); - return t; + return NULL; } /* From d9f74cfb5a9b06d287e855f4c388db1eb40f91e3 Mon Sep 17 00:00:00 2001 From: Bing Jiao Date: Tue, 3 Mar 2026 05:25:17 +0000 Subject: [PATCH 172/369] mm/vmscan: fix unintended mtc->nmask mutation in alloc_demote_folio() In alloc_demote_folio(), mtc->nmask is set to NULL for the first allocation. If that succeeds, it returns without restoring mtc->nmask to allowed_mask. For subsequent allocations from the migrate_pages() batch, mtc->nmask will be NULL. If the target node then becomes full, the fallback allocation will use nmask = NULL, allocating from any node allowed by the task cpuset, which for kswapd is all nodes. To address this issue, use a local copy of the mtc structure with nmask = NULL for the first allocation attempt specifically, ensuring the original mtc remains unmodified. 
Link: https://lkml.kernel.org/r/20260303052519.109244-1-bingjiao@google.com Fixes: 320080272892 ("mm/demotion: demote pages according to allocation fallback order") Signed-off-by: Bing Jiao Acked-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Axel Rasmussen Cc: Johannes Weiner Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/vmscan.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 3a4a0a81c871..641a6063f375 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -985,13 +985,11 @@ static void folio_check_dirty_writeback(struct folio *folio, static struct folio *alloc_demote_folio(struct folio *src, unsigned long private) { + struct migration_target_control *mtc, target_nid_mtc; struct folio *dst; - nodemask_t *allowed_mask; - struct migration_target_control *mtc; mtc = (struct migration_target_control *)private; - allowed_mask = mtc->nmask; /* * make sure we allocate from the target node first also trying to * demote or reclaim pages from the target node via kswapd if we are @@ -1001,15 +999,13 @@ static struct folio *alloc_demote_folio(struct folio *src, * a demotion of cold pages from the target memtier. This can result * in the kernel placing hot pages in slower(lower) memory tiers. 
*/ - mtc->nmask = NULL; - mtc->gfp_mask |= __GFP_THISNODE; - dst = alloc_migration_target(src, (unsigned long)mtc); + target_nid_mtc = *mtc; + target_nid_mtc.nmask = NULL; + target_nid_mtc.gfp_mask |= __GFP_THISNODE; + dst = alloc_migration_target(src, (unsigned long)&target_nid_mtc); if (dst) return dst; - mtc->gfp_mask &= ~__GFP_THISNODE; - mtc->nmask = allowed_mask; - return alloc_migration_target(src, (unsigned long)mtc); } From caf55fef6141c29c095cb1ef7ba84af08ff16734 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 5 Mar 2026 19:56:59 +0100 Subject: [PATCH 173/369] kasan: fix bug type classification for SW_TAGS mode kasan_non_canonical_hook() derives orig_addr from kasan_shadow_to_mem(), but the pointer tag may remain in the top byte. In SW_TAGS mode this tagged address is compared against PAGE_SIZE and TASK_SIZE, which leads to incorrect bug classification. As a result, NULL pointer dereferences may be reported as "wild-memory-access". Strip the tag before performing these range checks and use the untagged value when reporting addresses in these ranges. 
Before: [ ] Unable to handle kernel paging request at virtual address ffef800000000000 [ ] KASAN: maybe wild-memory-access in range [0xff00000000000000-0xff0000000000000f] After: [ ] Unable to handle kernel paging request at virtual address ffef800000000000 [ ] KASAN: null-ptr-deref in range [0x0000000000000000-0x000000000000000f] Link: https://lkml.kernel.org/r/20260305185659.20807-1-ryabinin.a.a@gmail.com Signed-off-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Maciej Wieczor-Retman Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/report.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 27efb78eb32d..e804b1e1f886 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -638,7 +638,7 @@ void kasan_report_async(void) */ void kasan_non_canonical_hook(unsigned long addr) { - unsigned long orig_addr; + unsigned long orig_addr, user_orig_addr; const char *bug_type; /* @@ -650,6 +650,9 @@ void kasan_non_canonical_hook(unsigned long addr) orig_addr = (unsigned long)kasan_shadow_to_mem((void *)addr); + /* Strip pointer tag before comparing against userspace ranges */ + user_orig_addr = (unsigned long)set_tag((void *)orig_addr, 0); + /* * For faults near the shadow address for NULL, we can be fairly certain * that this is a KASAN shadow memory access. @@ -661,11 +664,13 @@ void kasan_non_canonical_hook(unsigned long addr) * address, but make it clear that this is not necessarily what's * actually going on. 
*/ - if (orig_addr < PAGE_SIZE) + if (user_orig_addr < PAGE_SIZE) { bug_type = "null-ptr-deref"; - else if (orig_addr < TASK_SIZE) + orig_addr = user_orig_addr; + } else if (user_orig_addr < TASK_SIZE) { bug_type = "probably user-memory-access"; - else if (addr_in_shadow((void *)addr)) + orig_addr = user_orig_addr; + } else if (addr_in_shadow((void *)addr)) bug_type = "probably wild-memory-access"; else bug_type = "maybe wild-memory-access"; From e650bb30ca532901da6def04c7d1de72ae59ea4e Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:14 +0000 Subject: [PATCH 174/369] mm: rename VMA flag helpers to be more readable Patch series "mm: vma flag tweaks". The ongoing work around introducing non-system word VMA flags has introduced a number of helper functions and macros to make life easier when working with these flags and to make conversions from the legacy use of VM_xxx flags more straightforward. This series improves these to reduce confusion as to what they do and to improve consistency and readability. Firstly the series renames vma_flags_test() to vma_flags_test_any() to make it abundantly clear that this function tests whether any of the flags are set (as opposed to vma_flags_test_all()). It then renames vma_desc_test_flags() to vma_desc_test_any() for the same reason. Note that we drop the 'flags' suffix here, as vma_desc_test_any_flags() would be cumbersome and 'test' implies a flag test. Similarly, we rename vma_test_all_flags() to vma_test_all() for consistency. Next, we have a couple of instances (erofs, zonefs) where we are now testing for vma_desc_test_any(desc, VMA_SHARED_BIT) && vma_desc_test_any(desc, VMA_MAYWRITE_BIT). This is silly, so this series introduces vma_desc_test_all() so these callers can instead invoke vma_desc_test_all(desc, VMA_SHARED_BIT, VMA_MAYWRITE_BIT). We then observe that quite a few instances of vma_flags_test_any() and vma_desc_test_any() are in fact only testing against a single flag. 
Using the _any() variant here is just confusing - 'any' of single item reads strangely and is liable to cause confusion. So in these instances the series reintroduces vma_flags_test() and vma_desc_test() as helpers which test against a single flag. The fact that vma_flags_t is a struct and that vma_flag_t utilises sparse to avoid confusion with vm_flags_t makes it impossible for a user to misuse these helpers without it getting flagged somewhere. The series also updates __mk_vma_flags() and functions invoked by it to explicitly mark them always inline to match expectation and to be consistent with other VMA flag helpers. It also renames vma_flag_set() to vma_flags_set_flag() (a function only used by __mk_vma_flags()) to be consistent with other VMA flag helpers. Finally it updates the VMA tests for each of these changes, and introduces explicit tests for vma_flags_test() and vma_desc_test() to assert that they behave as expected. This patch (of 6): On reflection, it's confusing to have vma_flags_test() and vma_desc_test_flags() test whether any comma-separated VMA flag bit is set, while also having vma_flags_test_all() and vma_test_all_flags() separately test whether all flags are set. Firstly, rename vma_flags_test() to vma_flags_test_any() to eliminate this confusion. Secondly, since the VMA descriptor flag functions are becoming rather cumbersome, prefer vma_desc_test*() to vma_desc_test_flags*(), and also rename vma_desc_test_flags() to vma_desc_test_any(). Finally, rename vma_test_all_flags() to vma_test_all() to keep the VMA-specific helper consistent with the VMA descriptor naming convention and to help avoid confusion vs. vma_flags_test_all(). While we're here, also update whitespace to be consistent in helper functions. 
Link: https://lkml.kernel.org/r/cover.1772704455.git.ljs@kernel.org Link: https://lkml.kernel.org/r/0f9cb3c511c478344fac0b3b3b0300bb95be95e9.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Suggested-by: Pedro Falcato Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Vlastimil Babka Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/char/mem.c | 2 +- drivers/dax/device.c | 2 +- fs/erofs/data.c | 4 +-- fs/hugetlbfs/inode.c | 2 +- fs/ntfs3/file.c | 2 +- fs/resctrl/pseudo_lock.c | 2 +- fs/zonefs/file.c | 4 +-- include/linux/dax.h | 4 +-- include/linux/hugetlb_inline.h | 2 +- include/linux/mm.h | 48 +++++++++++++++++---------------- mm/hugetlb.c | 14 +++++----- mm/memory.c | 2 +- mm/secretmem.c | 2 +- mm/shmem.c | 4 +-- tools/testing/vma/include/dup.h | 20 +++++++------- tools/testing/vma/tests/vma.c | 28 +++++++++---------- 16 files changed, 72 insertions(+), 70 deletions(-) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index cca4529431f8..5118787d0954 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -520,7 +520,7 @@ static int mmap_zero_prepare(struct vm_area_desc *desc) #ifndef CONFIG_MMU return -ENOSYS; #endif - if (vma_desc_test_flags(desc, VMA_SHARED_BIT)) + if (vma_desc_test_any(desc, VMA_SHARED_BIT)) return shmem_zero_setup_desc(desc); desc->action.success_hook = mmap_zero_private_success; diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 
528e81240c4d..381021c2e031 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -24,7 +24,7 @@ static int __check_vma(struct dev_dax *dev_dax, vma_flags_t flags, return -ENXIO; /* prevent private mappings from being established */ - if (!vma_flags_test(&flags, VMA_MAYSHARE_BIT)) { + if (!vma_flags_test_any(&flags, VMA_MAYSHARE_BIT)) { dev_info_ratelimited(dev, "%s: %s: fail, attempted private mapping\n", current->comm, func); diff --git a/fs/erofs/data.c b/fs/erofs/data.c index f79ee80627d9..6774d9b5ee82 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -473,8 +473,8 @@ static int erofs_file_mmap_prepare(struct vm_area_desc *desc) if (!IS_DAX(file_inode(desc->file))) return generic_file_readonly_mmap_prepare(desc); - if (vma_desc_test_flags(desc, VMA_SHARED_BIT) && - vma_desc_test_flags(desc, VMA_MAYWRITE_BIT)) + if (vma_desc_test_any(desc, VMA_SHARED_BIT) && + vma_desc_test_any(desc, VMA_MAYWRITE_BIT)) return -EINVAL; desc->vm_ops = &erofs_dax_vm_ops; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 2ec3e4231252..079ffaaf1f6c 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -164,7 +164,7 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) goto out; ret = 0; - if (vma_desc_test_flags(desc, VMA_WRITE_BIT) && inode->i_size < len) + if (vma_desc_test_any(desc, VMA_WRITE_BIT) && inode->i_size < len) i_size_write(inode, len); out: inode_unlock(inode); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 7eecf1e01f74..c5e2181f9f02 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -276,7 +276,7 @@ static int ntfs_file_mmap_prepare(struct vm_area_desc *desc) struct file *file = desc->file; struct inode *inode = file_inode(file); struct ntfs_inode *ni = ntfs_i(inode); - const bool rw = vma_desc_test_flags(desc, VMA_WRITE_BIT); + const bool rw = vma_desc_test_any(desc, VMA_WRITE_BIT); int err; /* Avoid any operation if inode is bad. 
*/ diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c index fa3687d69ebd..79a006c6f26c 100644 --- a/fs/resctrl/pseudo_lock.c +++ b/fs/resctrl/pseudo_lock.c @@ -1044,7 +1044,7 @@ static int pseudo_lock_dev_mmap_prepare(struct vm_area_desc *desc) * Ensure changes are carried directly to the memory being mapped, * do not allow copy-on-write mapping. */ - if (!vma_desc_test_flags(desc, VMA_SHARED_BIT)) { + if (!vma_desc_test_any(desc, VMA_SHARED_BIT)) { mutex_unlock(&rdtgroup_mutex); return -EINVAL; } diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 8a7161fc49e5..9f9273ecf71a 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -333,8 +333,8 @@ static int zonefs_file_mmap_prepare(struct vm_area_desc *desc) * ordering between msync() and page cache writeback. */ if (zonefs_inode_is_seq(file_inode(file)) && - vma_desc_test_flags(desc, VMA_SHARED_BIT) && - vma_desc_test_flags(desc, VMA_MAYWRITE_BIT)) + vma_desc_test_any(desc, VMA_SHARED_BIT) && + vma_desc_test_any(desc, VMA_MAYWRITE_BIT)) return -EINVAL; file_accessed(file); diff --git a/include/linux/dax.h b/include/linux/dax.h index bf103f317cac..535019001577 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -69,7 +69,7 @@ static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - if (!vma_desc_test_flags(desc, VMA_SYNC_BIT)) + if (!vma_desc_test_any(desc, VMA_SYNC_BIT)) return true; if (!IS_DAX(inode)) return false; @@ -115,7 +115,7 @@ static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - return !vma_desc_test_flags(desc, VMA_SYNC_BIT); + return !vma_desc_test_any(desc, VMA_SYNC_BIT); } static inline size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 593f5d4e108b..84afc3c3e2e4 100644 
--- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -13,7 +13,7 @@ static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags) { - return vma_flags_test(flags, VMA_HUGETLB_BIT); + return vma_flags_test_any(flags, VMA_HUGETLB_BIT); } #else diff --git a/include/linux/mm.h b/include/linux/mm.h index c516d5177211..ee7671d6c5eb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1062,7 +1062,7 @@ static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) (const vma_flag_t []){__VA_ARGS__}) /* Test each of to_test flags in flags, non-atomically. */ -static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags, +static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags, vma_flags_t to_test) { const unsigned long *bitmap = flags->__vma_flags; @@ -1074,10 +1074,10 @@ static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags, /* * Test whether any specified VMA flag is set, e.g.: * - * if (vma_flags_test(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } + * if (vma_flags_test_any(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } */ -#define vma_flags_test(flags, ...) \ - vma_flags_test_mask(flags, mk_vma_flags(__VA_ARGS__)) +#define vma_flags_test_any(flags, ...) \ + vma_flags_test_any_mask(flags, mk_vma_flags(__VA_ARGS__)) /* Test that ALL of the to_test flags are set, non-atomically. */ static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, @@ -1098,7 +1098,8 @@ static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__)) /* Set each of the to_set flags in flags, non-atomically. 
*/ -static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set) +static __always_inline void vma_flags_set_mask(vma_flags_t *flags, + vma_flags_t to_set) { unsigned long *bitmap = flags->__vma_flags; const unsigned long *bitmap_to_set = to_set.__vma_flags; @@ -1115,7 +1116,8 @@ static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t t vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__)) /* Clear all of the to-clear flags in flags, non-atomically. */ -static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear) +static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, + vma_flags_t to_clear) { unsigned long *bitmap = flags->__vma_flags; const unsigned long *bitmap_to_clear = to_clear.__vma_flags; @@ -1137,8 +1139,8 @@ static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t * Note: appropriate locks must be held, this function does not acquire them for * you. */ -static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, - vma_flags_t flags) +static inline bool vma_test_all_mask(const struct vm_area_struct *vma, + vma_flags_t flags) { return vma_flags_test_all_mask(&vma->flags, flags); } @@ -1146,10 +1148,10 @@ static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, /* * Helper macro for checking that ALL specified flags are set in a VMA, e.g.: * - * if (vma_test_all_flags(vma, VMA_READ_BIT, VMA_MAYREAD_BIT) { ... } + * if (vma_test_all(vma, VMA_READ_BIT, VMA_MAYREAD_BIT) { ... } */ -#define vma_test_all_flags(vma, ...) \ - vma_test_all_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +#define vma_test_all(vma, ...) \ + vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) /* * Helper to set all VMA flags in a VMA. @@ -1158,7 +1160,7 @@ static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, * you. 
*/ static inline void vma_set_flags_mask(struct vm_area_struct *vma, - vma_flags_t flags) + vma_flags_t flags) { vma_flags_set_mask(&vma->flags, flags); } @@ -1176,25 +1178,25 @@ static inline void vma_set_flags_mask(struct vm_area_struct *vma, vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) /* Helper to test all VMA flags in a VMA descriptor. */ -static inline bool vma_desc_test_flags_mask(const struct vm_area_desc *desc, - vma_flags_t flags) +static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, + vma_flags_t flags) { - return vma_flags_test_mask(&desc->vma_flags, flags); + return vma_flags_test_any_mask(&desc->vma_flags, flags); } /* * Helper macro for testing VMA flags for an input pointer to a struct * vm_area_desc object describing a proposed VMA, e.g.: * - * if (vma_desc_test_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, + * if (vma_desc_test_any(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... } */ -#define vma_desc_test_flags(desc, ...) \ - vma_desc_test_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) +#define vma_desc_test_any(desc, ...) \ + vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) /* Helper to set all VMA flags in a VMA descriptor. */ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, - vma_flags_t flags) + vma_flags_t flags) { vma_flags_set_mask(&desc->vma_flags, flags); } @@ -1211,7 +1213,7 @@ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, /* Helper to clear all VMA flags in a VMA descriptor. 
*/ static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, - vma_flags_t flags) + vma_flags_t flags) { vma_flags_clear_mask(&desc->vma_flags, flags); } @@ -1936,8 +1938,8 @@ static inline bool vma_desc_is_cow_mapping(struct vm_area_desc *desc) { const vma_flags_t *flags = &desc->vma_flags; - return vma_flags_test(flags, VMA_MAYWRITE_BIT) && - !vma_flags_test(flags, VMA_SHARED_BIT); + return vma_flags_test_any(flags, VMA_MAYWRITE_BIT) && + !vma_flags_test_any(flags, VMA_SHARED_BIT); } #ifndef CONFIG_MMU @@ -1956,7 +1958,7 @@ static inline bool is_nommu_shared_mapping(vm_flags_t flags) static inline bool is_nommu_shared_vma_flags(const vma_flags_t *flags) { - return vma_flags_test(flags, VMA_MAYSHARE_BIT, VMA_MAYOVERLAY_BIT); + return vma_flags_test_any(flags, VMA_MAYSHARE_BIT, VMA_MAYOVERLAY_BIT); } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1d41fa3dd43e..fbbe74f94426 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1194,7 +1194,7 @@ static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) { VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - VM_WARN_ON_ONCE(vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)); + VM_WARN_ON_ONCE(vma_desc_test_any(desc, VMA_MAYSHARE_BIT)); desc->private_data = map; } @@ -1202,7 +1202,7 @@ static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *ma static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) { VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - VM_WARN_ON_ONCE(vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)); + VM_WARN_ON_ONCE(vma_desc_test_any(desc, VMA_MAYSHARE_BIT)); desc->private_data = (void *)((unsigned long)desc->private_data | flags); } @@ -6593,7 +6593,7 @@ long hugetlb_reserve_pages(struct inode *inode, * attempt will be made for VM_NORESERVE to allocate a page * without using reserves */ - if (vma_flags_test(&vma_flags, 
VMA_NORESERVE_BIT)) + if (vma_flags_test_any(&vma_flags, VMA_NORESERVE_BIT)) return 0; /* @@ -6602,7 +6602,7 @@ long hugetlb_reserve_pages(struct inode *inode, * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !desc is a shm mapping */ - if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) { + if (!desc || vma_desc_test_any(desc, VMA_MAYSHARE_BIT)) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see @@ -6636,7 +6636,7 @@ long hugetlb_reserve_pages(struct inode *inode, if (err < 0) goto out_err; - if (desc && !vma_desc_test_flags(desc, VMA_MAYSHARE_BIT) && h_cg) { + if (desc && !vma_desc_test_any(desc, VMA_MAYSHARE_BIT) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ @@ -6673,7 +6673,7 @@ long hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) { + if (!desc || vma_desc_test_any(desc, VMA_MAYSHARE_BIT)) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { @@ -6737,7 +6737,7 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: - if (!desc || vma_desc_test_flags(desc, VMA_MAYSHARE_BIT)) + if (!desc || vma_desc_test_any(desc, VMA_MAYSHARE_BIT)) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. 
*/ diff --git a/mm/memory.c b/mm/memory.c index b1c062bf5fc1..f21c804b50bf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2982,7 +2982,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) return -EINVAL; - VM_WARN_ON_ONCE(!vma_test_all_flags_mask(vma, VMA_REMAP_FLAGS)); + VM_WARN_ON_ONCE(!vma_test_all_mask(vma, VMA_REMAP_FLAGS)); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; diff --git a/mm/secretmem.c b/mm/secretmem.c index 11a779c812a7..5f57ac4720d3 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -122,7 +122,7 @@ static int secretmem_mmap_prepare(struct vm_area_desc *desc) { const unsigned long len = vma_desc_size(desc); - if (!vma_desc_test_flags(desc, VMA_SHARED_BIT, VMA_MAYSHARE_BIT)) + if (!vma_desc_test_any(desc, VMA_SHARED_BIT, VMA_MAYSHARE_BIT)) return -EINVAL; vma_desc_set_flags(desc, VMA_LOCKED_BIT, VMA_DONTDUMP_BIT); diff --git a/mm/shmem.c b/mm/shmem.c index 5e7dcf5bc5d3..965a8908200b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3086,7 +3086,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, spin_lock_init(&info->lock); atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; - info->flags = vma_flags_test(&flags, VMA_NORESERVE_BIT) + info->flags = vma_flags_test_any(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; info->i_crtime = inode_get_mtime(inode); info->fsflags = (dir == NULL) ? 0 : @@ -5827,7 +5827,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, unsigned int i_flags) { const unsigned long shmem_flags = - vma_flags_test(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; + vma_flags_test_any(&flags, VMA_NORESERVE_BIT) ? 
SHMEM_F_NORESERVE : 0; struct inode *inode; struct file *res; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 3078ff1487d3..c46b523e428d 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -843,7 +843,7 @@ static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits); #define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ (const vma_flag_t []){__VA_ARGS__}) -static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags, +static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags, vma_flags_t to_test) { const unsigned long *bitmap = flags->__vma_flags; @@ -852,8 +852,8 @@ static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags, return bitmap_intersects(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS); } -#define vma_flags_test(flags, ...) \ - vma_flags_test_mask(flags, mk_vma_flags(__VA_ARGS__)) +#define vma_flags_test_any(flags, ...) \ + vma_flags_test_any_mask(flags, mk_vma_flags(__VA_ARGS__)) static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, vma_flags_t to_test) @@ -889,14 +889,14 @@ static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t #define vma_flags_clear(flags, ...) \ vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__)) -static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma, +static inline bool vma_test_all_mask(const struct vm_area_struct *vma, vma_flags_t flags) { return vma_flags_test_all_mask(&vma->flags, flags); } -#define vma_test_all_flags(vma, ...) \ - vma_test_all_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +#define vma_test_all(vma, ...) \ + vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) { @@ -913,14 +913,14 @@ static inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) 
\ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) -static inline bool vma_desc_test_flags_mask(const struct vm_area_desc *desc, +static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_flags_t flags) { - return vma_flags_test_mask(&desc->vma_flags, flags); + return vma_flags_test_any_mask(&desc->vma_flags, flags); } -#define vma_desc_test_flags(desc, ...) \ - vma_desc_test_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) +#define vma_desc_test_any(desc, ...) \ + vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) diff --git a/tools/testing/vma/tests/vma.c b/tools/testing/vma/tests/vma.c index c54ffc954f11..f031e6dfb474 100644 --- a/tools/testing/vma/tests/vma.c +++ b/tools/testing/vma/tests/vma.c @@ -159,8 +159,8 @@ static bool test_vma_flags_word(void) return true; } -/* Ensure that vma_flags_test() and friends works correctly. */ -static bool test_vma_flags_test(void) +/* Ensure that vma_flags_test_any() and friends works correctly. */ +static bool test_vma_flags_test_any(void) { const vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64, 65); @@ -171,16 +171,16 @@ static bool test_vma_flags_test(void) desc.vma_flags = flags; #define do_test(...) \ - ASSERT_TRUE(vma_flags_test(&flags, __VA_ARGS__)); \ - ASSERT_TRUE(vma_desc_test_flags(&desc, __VA_ARGS__)) + ASSERT_TRUE(vma_flags_test_any(&flags, __VA_ARGS__)); \ + ASSERT_TRUE(vma_desc_test_any(&desc, __VA_ARGS__)) #define do_test_all_true(...) \ ASSERT_TRUE(vma_flags_test_all(&flags, __VA_ARGS__)); \ - ASSERT_TRUE(vma_test_all_flags(&vma, __VA_ARGS__)) + ASSERT_TRUE(vma_test_all(&vma, __VA_ARGS__)) #define do_test_all_false(...) 
\ ASSERT_FALSE(vma_flags_test_all(&flags, __VA_ARGS__)); \ - ASSERT_FALSE(vma_test_all_flags(&vma, __VA_ARGS__)) + ASSERT_FALSE(vma_test_all(&vma, __VA_ARGS__)) /* * Testing for some flags that are present, some that are not - should @@ -200,7 +200,7 @@ static bool test_vma_flags_test(void) * Check _mask variant. We don't need to test extensively as macro * helper is the equivalent. */ - ASSERT_TRUE(vma_flags_test_mask(&flags, flags)); + ASSERT_TRUE(vma_flags_test_any_mask(&flags, flags)); ASSERT_TRUE(vma_flags_test_all_mask(&flags, flags)); /* Single bits. */ @@ -268,9 +268,9 @@ static bool test_vma_flags_clear(void) vma_flags_clear_mask(&flags, mask); vma_flags_clear_mask(&vma.flags, mask); vma_desc_clear_flags_mask(&desc, mask); - ASSERT_FALSE(vma_flags_test(&flags, VMA_EXEC_BIT, 64)); - ASSERT_FALSE(vma_flags_test(&vma.flags, VMA_EXEC_BIT, 64)); - ASSERT_FALSE(vma_desc_test_flags(&desc, VMA_EXEC_BIT, 64)); + ASSERT_FALSE(vma_flags_test_any(&flags, VMA_EXEC_BIT, 64)); + ASSERT_FALSE(vma_flags_test_any(&vma.flags, VMA_EXEC_BIT, 64)); + ASSERT_FALSE(vma_desc_test_any(&desc, VMA_EXEC_BIT, 64)); /* Reset. 
*/ vma_flags_set(&flags, VMA_EXEC_BIT, 64); vma_set_flags(&vma, VMA_EXEC_BIT, 64); @@ -284,9 +284,9 @@ static bool test_vma_flags_clear(void) vma_flags_clear(&flags, __VA_ARGS__); \ vma_flags_clear(&vma.flags, __VA_ARGS__); \ vma_desc_clear_flags(&desc, __VA_ARGS__); \ - ASSERT_FALSE(vma_flags_test(&flags, __VA_ARGS__)); \ - ASSERT_FALSE(vma_flags_test(&vma.flags, __VA_ARGS__)); \ - ASSERT_FALSE(vma_desc_test_flags(&desc, __VA_ARGS__)); \ + ASSERT_FALSE(vma_flags_test_any(&flags, __VA_ARGS__)); \ + ASSERT_FALSE(vma_flags_test_any(&vma.flags, __VA_ARGS__)); \ + ASSERT_FALSE(vma_desc_test_any(&desc, __VA_ARGS__)); \ vma_flags_set(&flags, __VA_ARGS__); \ vma_set_flags(&vma, __VA_ARGS__); \ vma_desc_set_flags(&desc, __VA_ARGS__) @@ -334,6 +334,6 @@ static void run_vma_tests(int *num_tests, int *num_fail) TEST(vma_flags_unchanged); TEST(vma_flags_cleared); TEST(vma_flags_word); - TEST(vma_flags_test); + TEST(vma_flags_test_any); TEST(vma_flags_clear); } From 0b3ed2a495b5c10296d9371502d70ce4398f0c58 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:15 +0000 Subject: [PATCH 175/369] mm: add vma_desc_test_all() and use it erofs and zonefs are using vma_desc_test_any() twice to check whether all of VMA_SHARED_BIT and VMA_MAYWRITE_BIT are set, this is silly, so add vma_desc_test_all() to test all flags and update erofs and zonefs to use it. While we're here, update the helper function comments to be more consistent. Also add the same to the VMA test headers. 
Link: https://lkml.kernel.org/r/568c8f8d6a84ff64014f997517cba7a629f7eed6.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Vlastimil Babka (SUSE) Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/erofs/data.c | 3 +-- fs/zonefs/file.c | 3 +-- include/linux/mm.h | 24 ++++++++++++++++++++---- tools/testing/vma/include/dup.h | 9 +++++++++ 4 files changed, 31 insertions(+), 8 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 6774d9b5ee82..b33dd4d8710e 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -473,8 +473,7 @@ static int erofs_file_mmap_prepare(struct vm_area_desc *desc) if (!IS_DAX(file_inode(desc->file))) return generic_file_readonly_mmap_prepare(desc); - if (vma_desc_test_any(desc, VMA_SHARED_BIT) && - vma_desc_test_any(desc, VMA_MAYWRITE_BIT)) + if (vma_desc_test_all(desc, VMA_SHARED_BIT, VMA_MAYWRITE_BIT)) return -EINVAL; desc->vm_ops = &erofs_dax_vm_ops; diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 9f9273ecf71a..5ada33f70bb4 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -333,8 +333,7 @@ static int zonefs_file_mmap_prepare(struct vm_area_desc *desc) * ordering between msync() and page cache writeback. 
*/ if (zonefs_inode_is_seq(file_inode(file)) && - vma_desc_test_any(desc, VMA_SHARED_BIT) && - vma_desc_test_any(desc, VMA_MAYWRITE_BIT)) + vma_desc_test_all(desc, VMA_SHARED_BIT, VMA_MAYWRITE_BIT)) return -EINVAL; file_accessed(file); diff --git a/include/linux/mm.h b/include/linux/mm.h index ee7671d6c5eb..f964e4050583 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1177,7 +1177,7 @@ static inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) \ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) -/* Helper to test all VMA flags in a VMA descriptor. */ +/* Helper to test any VMA flags in a VMA descriptor. */ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_flags_t flags) { @@ -1185,8 +1185,8 @@ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, } /* - * Helper macro for testing VMA flags for an input pointer to a struct - * vm_area_desc object describing a proposed VMA, e.g.: + * Helper macro for testing whether any VMA flags are set in a VMA descriptor, + * e.g.: * * if (vma_desc_test_any(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... } @@ -1194,6 +1194,22 @@ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, #define vma_desc_test_any(desc, ...) \ vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) +/* Helper to test all VMA flags in a VMA descriptor. */ +static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, + vma_flags_t flags) +{ + return vma_flags_test_all_mask(&desc->vma_flags, flags); +} + +/* + * Helper macro for testing whether ALL VMA flags are set in a VMA descriptor, + * e.g.: + * + * if (vma_desc_test_all(desc, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... } + */ +#define vma_desc_test_all(desc, ...) \ + vma_desc_test_all_mask(desc, mk_vma_flags(__VA_ARGS__)) + /* Helper to set all VMA flags in a VMA descriptor. 
*/ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) @@ -1206,7 +1222,7 @@ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, * vm_area_desc object describing a proposed VMA, e.g.: * * vma_desc_set_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, - * VMA_DONTDUMP_BIT); + * VMA_DONTDUMP_BIT); */ #define vma_desc_set_flags(desc, ...) \ vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index c46b523e428d..59788bc14d75 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -922,6 +922,15 @@ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, #define vma_desc_test_any(desc, ...) \ vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) +static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, + vma_flags_t flags) +{ + return vma_flags_test_all_mask(&desc->vma_flags, flags); +} + +#define vma_desc_test_all(desc, ...) \ + vma_desc_test_all_mask(desc, mk_vma_flags(__VA_ARGS__)) + static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) { From a5eee1128de526ba199bd4c7be39b849223e5001 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:16 +0000 Subject: [PATCH 176/369] mm: always inline __mk_vma_flags() and invoked functions Be explicit about __mk_vma_flags() (which is used by the mk_vma_flags() macro) always being inline, as we rely on the compiler to evaluate the loop in this function and determine that it can replace the code with the an equivalent constant value, e.g. that: __mk_vma_flags(2, (const vma_flag_t []){ VMA_WRITE_BIT, VMA_EXEC_BIT }); Can be replaced with: (1UL << VMA_WRITE_BIT) | (1UL << VMA_EXEC_BIT) = (1UL << 1) | (1UL << 2) = 6 Most likely an 'inline' will suffice for this, but be explicit as we can be. 
Also update all of the functions __mk_vma_flags() ultimately invokes to be always inline too. Note that test_bitmap_const_eval() asserts that the relevant bitmap functions result in build time constant values. Additionally, vma_flag_set() operates on a vma_flags_t type, so it is inconsistently named versus other VMA flags functions. We only use vma_flag_set() in __mk_vma_flags() so we don't need to worry about its new name being rather cumbersome, so rename it to vma_flags_set_flag() to disambiguate it from vma_flags_set(). Also update the VMA test headers to reflect the changes. Link: https://lkml.kernel.org/r/241f49c52074d436edbb9c6a6662a8dc142a8f43.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Vlastimil Babka Cc: Yue Hu Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 +++++--- include/linux/mm_types.h | 2 +- tools/testing/vma/include/custom.h | 5 +++-- tools/testing/vma/include/dup.h | 5 +++-- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f964e4050583..9dcdf13570fb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1030,21 +1030,23 @@ static inline bool vma_test_atomic_flag(struct vm_area_struct *vma, vma_flag_t b } /* Set an individual VMA flag in flags, non-atomically. 
*/ -static inline void vma_flag_set(vma_flags_t *flags, vma_flag_t bit) +static __always_inline void vma_flags_set_flag(vma_flags_t *flags, + vma_flag_t bit) { unsigned long *bitmap = flags->__vma_flags; __set_bit((__force int)bit, bitmap); } -static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) +static __always_inline vma_flags_t __mk_vma_flags(size_t count, + const vma_flag_t *bits) { vma_flags_t flags; int i; vma_flags_clear_all(&flags); for (i = 0; i < count; i++) - vma_flag_set(&flags, bits[i]); + vma_flags_set_flag(&flags, bits[i]); return flags; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7bc82a2b889f..f22aecb047b7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1056,7 +1056,7 @@ struct vm_area_struct { } __randomize_layout; /* Clears all bits in the VMA flags bitmap, non-atomically. */ -static inline void vma_flags_clear_all(vma_flags_t *flags) +static __always_inline void vma_flags_clear_all(vma_flags_t *flags) { bitmap_zero(flags->__vma_flags, NUM_VMA_FLAG_BITS); } diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 802a76317245..833ff4d7f799 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -102,7 +102,8 @@ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) refcount_set(&vma->vm_refcnt, 0); } -static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) +static __always_inline vma_flags_t __mk_vma_flags(size_t count, + const vma_flag_t *bits) { vma_flags_t flags; int i; @@ -114,6 +115,6 @@ static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits) vma_flags_clear_all(&flags); for (i = 0; i < count; i++) if (bits[i] < NUM_VMA_FLAG_BITS) - vma_flag_set(&flags, bits[i]); + vma_flags_set_flag(&flags, bits[i]); return flags; } diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 
59788bc14d75..ef6b9d963acc 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -780,12 +780,13 @@ static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) *bitmap &= ~value; } -static inline void vma_flags_clear_all(vma_flags_t *flags) +static __always_inline void vma_flags_clear_all(vma_flags_t *flags) { bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); } -static inline void vma_flag_set(vma_flags_t *flags, vma_flag_t bit) +static __always_inline void vma_flags_set_flag(vma_flags_t *flags, + vma_flag_t bit) { unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); From 5e6d45d720ca299cc82d84948c4ba622fff64f22 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:17 +0000 Subject: [PATCH 177/369] mm: reintroduce vma_flags_test() as a singular flag test Since we've now renamed vma_flags_test() to vma_flags_test_any() to be very clear as to what we are in fact testing, we now have the opportunity to bring vma_flags_test() back, but for explicitly testing a single VMA flag. This is useful, as often flag tests are against a single flag, and vma_flags_test_any(flags, VMA_READ_BIT) reads oddly and potentially causes confusion. We use sparse to enforce that users won't accidentally pass vm_flags_t to this function without it being flagged so this should make it harder to get this wrong. Of course, passing vma_flags_t to the function is impossible, as it is a struct. Also update the VMA tests to reflect this change. 
Link: https://lkml.kernel.org/r/f33f8d7f16c3f3d286a1dc2cba12c23683073134.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Vlastimil Babka Cc: Yue Hu Signed-off-by: Andrew Morton --- include/linux/mm.h | 17 +++++++++++++++-- mm/hugetlb.c | 2 +- mm/shmem.c | 4 ++-- tools/testing/vma/include/dup.h | 8 ++++++++ 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9dcdf13570fb..9392723a5c50 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1050,6 +1050,19 @@ static __always_inline vma_flags_t __mk_vma_flags(size_t count, return flags; } +/* + * Test whether a specific VMA flag is set, e.g.: + * + * if (vma_flags_test(flags, VMA_READ_BIT)) { ... } + */ +static __always_inline bool vma_flags_test(const vma_flags_t *flags, + vma_flag_t bit) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return test_bit((__force int)bit, bitmap); +} + /* * Helper macro which bitwise-or combines the specified input flags into a * vma_flags_t bitmap value. 
E.g.: @@ -1956,8 +1969,8 @@ static inline bool vma_desc_is_cow_mapping(struct vm_area_desc *desc) { const vma_flags_t *flags = &desc->vma_flags; - return vma_flags_test_any(flags, VMA_MAYWRITE_BIT) && - !vma_flags_test_any(flags, VMA_SHARED_BIT); + return vma_flags_test(flags, VMA_MAYWRITE_BIT) && + !vma_flags_test(flags, VMA_SHARED_BIT); } #ifndef CONFIG_MMU diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fbbe74f94426..9363b6072c0a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6593,7 +6593,7 @@ long hugetlb_reserve_pages(struct inode *inode, * attempt will be made for VM_NORESERVE to allocate a page * without using reserves */ - if (vma_flags_test_any(&vma_flags, VMA_NORESERVE_BIT)) + if (vma_flags_test(&vma_flags, VMA_NORESERVE_BIT)) return 0; /* diff --git a/mm/shmem.c b/mm/shmem.c index 965a8908200b..5e7dcf5bc5d3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3086,7 +3086,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, spin_lock_init(&info->lock); atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; - info->flags = vma_flags_test_any(&flags, VMA_NORESERVE_BIT) + info->flags = vma_flags_test(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; info->i_crtime = inode_get_mtime(inode); info->fsflags = (dir == NULL) ? 0 : @@ -5827,7 +5827,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, unsigned int i_flags) { const unsigned long shmem_flags = - vma_flags_test_any(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; + vma_flags_test(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; struct inode *inode; struct file *res; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index ef6b9d963acc..630478f0d583 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -844,6 +844,14 @@ static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits); #define mk_vma_flags(...) 
__mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ (const vma_flag_t []){__VA_ARGS__}) +static __always_inline bool vma_flags_test(const vma_flags_t *flags, + vma_flag_t bit) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return test_bit((__force int)bit, bitmap); +} + static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags, vma_flags_t to_test) { From 0c2aa6635716a5aa19576deef062efab5322072f Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:18 +0000 Subject: [PATCH 178/369] mm: reintroduce vma_desc_test() as a singular flag test Similar to vma_flags_test(), we have previously renamed vma_desc_test() to vma_desc_test_any(). Now that is in place, we can reintroduce vma_desc_test() to explicitly check for a single VMA flag. As with vma_flags_test(), this is useful as often flag tests are against a single flag, and vma_desc_test_any(flags, VMA_READ_BIT) reads oddly and potentially causes confusion. As with vma_flags_test() a combination of sparse and vma_flags_t being a struct means that users cannot misuse this function without it getting flagged. Also update the VMA tests to reflect this change. 
Link: https://lkml.kernel.org/r/3a65ca23defb05060333f0586428fe279a484564.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Vlastimil Babka Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/char/mem.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/ntfs3/file.c | 2 +- fs/resctrl/pseudo_lock.c | 2 +- include/linux/dax.h | 4 ++-- include/linux/mm.h | 11 +++++++++++ mm/hugetlb.c | 12 ++++++------ tools/testing/vma/include/dup.h | 6 ++++++ 8 files changed, 29 insertions(+), 12 deletions(-) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 5118787d0954..5fd421e48c04 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -520,7 +520,7 @@ static int mmap_zero_prepare(struct vm_area_desc *desc) #ifndef CONFIG_MMU return -ENOSYS; #endif - if (vma_desc_test_any(desc, VMA_SHARED_BIT)) + if (vma_desc_test(desc, VMA_SHARED_BIT)) return shmem_zero_setup_desc(desc); desc->action.success_hook = mmap_zero_private_success; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 079ffaaf1f6c..cd6b22f6e2b1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -164,7 +164,7 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) goto out; ret = 0; - if (vma_desc_test_any(desc, VMA_WRITE_BIT) && inode->i_size < len) + if (vma_desc_test(desc, VMA_WRITE_BIT) && inode->i_size < len) i_size_write(inode, len); out: 
inode_unlock(inode); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index c5e2181f9f02..fbdfaf989a31 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -276,7 +276,7 @@ static int ntfs_file_mmap_prepare(struct vm_area_desc *desc) struct file *file = desc->file; struct inode *inode = file_inode(file); struct ntfs_inode *ni = ntfs_i(inode); - const bool rw = vma_desc_test_any(desc, VMA_WRITE_BIT); + const bool rw = vma_desc_test(desc, VMA_WRITE_BIT); int err; /* Avoid any operation if inode is bad. */ diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c index 79a006c6f26c..d1cb0986006e 100644 --- a/fs/resctrl/pseudo_lock.c +++ b/fs/resctrl/pseudo_lock.c @@ -1044,7 +1044,7 @@ static int pseudo_lock_dev_mmap_prepare(struct vm_area_desc *desc) * Ensure changes are carried directly to the memory being mapped, * do not allow copy-on-write mapping. */ - if (!vma_desc_test_any(desc, VMA_SHARED_BIT)) { + if (!vma_desc_test(desc, VMA_SHARED_BIT)) { mutex_unlock(&rdtgroup_mutex); return -EINVAL; } diff --git a/include/linux/dax.h b/include/linux/dax.h index 535019001577..10a7cc79aea5 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -69,7 +69,7 @@ static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - if (!vma_desc_test_any(desc, VMA_SYNC_BIT)) + if (!vma_desc_test(desc, VMA_SYNC_BIT)) return true; if (!IS_DAX(inode)) return false; @@ -115,7 +115,7 @@ static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc, const struct inode *inode, struct dax_device *dax_dev) { - return !vma_desc_test_any(desc, VMA_SYNC_BIT); + return !vma_desc_test(desc, VMA_SYNC_BIT); } static inline size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9392723a5c50..63d1f619260e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1192,6 +1192,17 @@ 
static inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) \ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +/* + * Test whether a specific VMA flag is set in a VMA descriptor, e.g.: + * + * if (vma_desc_test(desc, VMA_READ_BIT)) { ... } + */ +static __always_inline bool vma_desc_test(const struct vm_area_desc *desc, + vma_flag_t bit) +{ + return vma_flags_test(&desc->vma_flags, bit); +} + /* Helper to test any VMA flags in a VMA descriptor. */ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_flags_t flags) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9363b6072c0a..992c1632d26a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1194,7 +1194,7 @@ static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) { VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - VM_WARN_ON_ONCE(vma_desc_test_any(desc, VMA_MAYSHARE_BIT)); + VM_WARN_ON_ONCE(vma_desc_test(desc, VMA_MAYSHARE_BIT)); desc->private_data = map; } @@ -1202,7 +1202,7 @@ static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *ma static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) { VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - VM_WARN_ON_ONCE(vma_desc_test_any(desc, VMA_MAYSHARE_BIT)); + VM_WARN_ON_ONCE(vma_desc_test(desc, VMA_MAYSHARE_BIT)); desc->private_data = (void *)((unsigned long)desc->private_data | flags); } @@ -6602,7 +6602,7 @@ long hugetlb_reserve_pages(struct inode *inode, * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. 
Assume !desc is a shm mapping */ - if (!desc || vma_desc_test_any(desc, VMA_MAYSHARE_BIT)) { + if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see @@ -6636,7 +6636,7 @@ long hugetlb_reserve_pages(struct inode *inode, if (err < 0) goto out_err; - if (desc && !vma_desc_test_any(desc, VMA_MAYSHARE_BIT) && h_cg) { + if (desc && !vma_desc_test(desc, VMA_MAYSHARE_BIT) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ @@ -6673,7 +6673,7 @@ long hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!desc || vma_desc_test_any(desc, VMA_MAYSHARE_BIT)) { + if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { @@ -6737,7 +6737,7 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: - if (!desc || vma_desc_test_any(desc, VMA_MAYSHARE_BIT)) + if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. */ diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 630478f0d583..5eb313beb43d 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -922,6 +922,12 @@ static inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) 
\ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +static __always_inline bool vma_desc_test(const struct vm_area_desc *desc, + vma_flag_t bit) +{ + return vma_flags_test(&desc->vma_flags, bit); +} + static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_flags_t flags) { From 5cfb95f38a684d0e24eb9e4d9b6f7a34328ed837 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Thu, 5 Mar 2026 10:50:19 +0000 Subject: [PATCH 179/369] tools/testing/vma: add test for vma_flags_test(), vma_desc_test() Now we have helpers which test singular VMA flags - vma_flags_test() and vma_desc_test() - add a test to explicitly assert that these behave as expected. [ljs@kernel.org: test_vma_flags_test(): use struct initializer, per David] Link: https://lkml.kernel.org/r/f6f396d2-1ba2-426f-b756-d8cc5985cc7c@lucifer.local Link: https://lkml.kernel.org/r/376a39eb9e134d2c8ab10e32720dd292970b080a.1772704455.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Cc: Arnd Bergmann Cc: Babu Moger Cc: Baolin Wang Cc: Chao Yu Cc: Chatre, Reinette Cc: Chunhai Guo Cc: Damien Le Maol Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Hongbo Li Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jeffle Xu Cc: Johannes Thumshirn Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naohiro Aota Cc: Oscar Salvador Cc: Sandeep Dhavale Cc: Suren Baghdasaryan Cc: Vishal Verma Cc: Vlastimil Babka Cc: Yue Hu Signed-off-by: Andrew Morton --- tools/testing/vma/tests/vma.c | 36 +++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tools/testing/vma/tests/vma.c b/tools/testing/vma/tests/vma.c index f031e6dfb474..f6edd44f4e9e 100644 --- a/tools/testing/vma/tests/vma.c +++ b/tools/testing/vma/tests/vma.c @@ -159,6 +159,41 @@ static bool 
test_vma_flags_word(void) return true; } +/* Ensure that vma_flags_test() and friends works correctly. */ +static bool test_vma_flags_test(void) +{ + const vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_EXEC_BIT, 64, 65); + struct vm_area_desc desc = { + .vma_flags = flags, + }; + +#define do_test(_flag) \ + ASSERT_TRUE(vma_flags_test(&flags, _flag)); \ + ASSERT_TRUE(vma_desc_test(&desc, _flag)) + +#define do_test_false(_flag) \ + ASSERT_FALSE(vma_flags_test(&flags, _flag)); \ + ASSERT_FALSE(vma_desc_test(&desc, _flag)) + + do_test(VMA_READ_BIT); + do_test(VMA_WRITE_BIT); + do_test(VMA_EXEC_BIT); +#if NUM_VMA_FLAG_BITS > 64 + do_test(64); + do_test(65); +#endif + do_test_false(VMA_MAYWRITE_BIT); +#if NUM_VMA_FLAG_BITS > 64 + do_test_false(66); +#endif + +#undef do_test +#undef do_test_false + + return true; +} + /* Ensure that vma_flags_test_any() and friends works correctly. */ static bool test_vma_flags_test_any(void) { @@ -334,6 +369,7 @@ static void run_vma_tests(int *num_tests, int *num_fail) TEST(vma_flags_unchanged); TEST(vma_flags_cleared); TEST(vma_flags_word); + TEST(vma_flags_test); TEST(vma_flags_test_any); TEST(vma_flags_clear); } From bc7a2d1b4f0499ceabf9cd4813f118fdbca813f8 Mon Sep 17 00:00:00 2001 From: "Vlastimil Babka (SUSE)" Date: Thu, 5 Mar 2026 09:26:29 +0100 Subject: [PATCH 180/369] MAINTAINERS: add mm-related procfs files to MM sections Some procfs files are very much related to memory management so let's have MAINTAINERS reflect that. Add fs/proc/meminfo.c to MEMORY MANAGEMENT - CORE. Add fs/proc/task_[no]mmu.c to MEMORY MAPPING. 
Link: https://lkml.kernel.org/r/20260305-maintainers-proc-v1-1-d6d09b3db3b6@kernel.org Signed-off-by: Vlastimil Babka (SUSE) Acked-by: Lorenzo Stoakes (Oracle) Acked-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand (Arm) Acked-by: SeongJae Park Signed-off-by: Andrew Morton --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 7a1b94a4aea2..cf654eba46ee 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16670,6 +16670,7 @@ F: include/linux/pgtable.h F: include/linux/ptdump.h F: include/linux/vmpressure.h F: include/linux/vmstat.h +F: fs/proc/meminfo.c F: kernel/fork.c F: mm/Kconfig F: mm/debug.c @@ -16985,6 +16986,8 @@ S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm F: include/trace/events/mmap.h +F: fs/proc/task_mmu.c +F: fs/proc/task_nommu.c F: mm/interval_tree.c F: mm/mincore.c F: mm/mlock.c From 92a9cf97a46b806e7f4e4780724203753093b5b1 Mon Sep 17 00:00:00 2001 From: Chengkaitao Date: Sun, 1 Feb 2026 14:35:31 +0800 Subject: [PATCH 181/369] sparc: use vmemmap_populate_hugepages for vmemmap_populate Change sparc's implementation of vmemmap_populate() using vmemmap_populate_hugepages() to streamline the code. Another benefit is that it allows us to eliminate the external declarations of vmemmap_p?d_populate functions and convert them to static functions. Since vmemmap_populate_hugepages may fallback to vmemmap_populate- _basepages, which differs from sparc's original implementation. During the v1 discussion with Mike Rapoport, sparc uses base pages in the kernel page tables, so it should be able to use them in vmemmap as well. Consequently, no additional special handling is required. 1. In the SPARC architecture, reimplement vmemmap_populate using vmemmap_populate_hugepages. 2. Allow the SPARC arch to fallback to vmemmap_populate_basepages(), when vmemmap_alloc_block returns NULL. 
Link: https://lkml.kernel.org/r/20260201063532.44807-2-pilgrimtao@gmail.com Signed-off-by: Chengkaitao Tested-by: Andreas Larsson Acked-by: Andreas Larsson Cc: David Hildenbrand Cc: David S. Miller Cc: Kevin Brodsky Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Qi Zheng Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/sparc/mm/init_64.c | 47 ++++++++++++++--------------------------- 1 file changed, 16 insertions(+), 31 deletions(-) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 3aa47f2b6c6e..367c269305e5 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2562,8 +2562,8 @@ unsigned long _PAGE_CACHE __read_mostly; EXPORT_SYMBOL(_PAGE_CACHE); #ifdef CONFIG_SPARSEMEM_VMEMMAP -int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend, - int node, struct vmem_altmap *altmap) +void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next) { unsigned long pte_base; @@ -2576,39 +2576,24 @@ int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend, pte_base |= _PAGE_PMD_HUGE; - vstart = vstart & PMD_MASK; - vend = ALIGN(vend, PMD_SIZE); - for (; vstart < vend; vstart += PMD_SIZE) { - pgd_t *pgd = vmemmap_pgd_populate(vstart, node); - unsigned long pte; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; + pmd_val(*pmd) = pte_base | __pa(p); +} - if (!pgd) - return -ENOMEM; +int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, + unsigned long addr, unsigned long next) +{ + int large = pmd_leaf(*pmdp); - p4d = vmemmap_p4d_populate(pgd, vstart, node); - if (!p4d) - return -ENOMEM; + if (large) + vmemmap_verify((pte_t *)pmdp, node, addr, next); - pud = vmemmap_pud_populate(p4d, vstart, node); - if (!pud) - return -ENOMEM; + return large; +} - pmd = pmd_offset(pud, vstart); - pte = pmd_val(*pmd); - if (!(pte & _PAGE_VALID)) { - void *block = vmemmap_alloc_block(PMD_SIZE, 
node); - - if (!block) - return -ENOMEM; - - pmd_val(*pmd) = pte_base | __pa(block); - } - } - - return 0; +int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend, + int node, struct vmem_altmap *altmap) +{ + return vmemmap_populate_hugepages(vstart, vend, node, NULL); } #endif /* CONFIG_SPARSEMEM_VMEMMAP */ From db359fccf212e7fa3136e6edbed6228475646fd7 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Tue, 24 Feb 2026 14:13:47 +0900 Subject: [PATCH 182/369] mm: introduce a new page type for page pool in page type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the condition 'page->pp_magic == PP_SIGNATURE' is used to determine if a page belongs to a page pool. However, with the planned removal of @pp_magic, we should instead leverage the page_type in struct page, such as PGTY_netpp, for this purpose. Introduce and use the page type APIs e.g. PageNetpp(), __SetPageNetpp(), and __ClearPageNetpp() instead, and remove the existing APIs accessing @pp_magic e.g. page_pool_page_is_pp(), netmem_or_pp_magic(), and netmem_clear_pp_magic(). Plus, add @page_type to struct net_iov at the same offset as struct page so as to use the page_type APIs for struct net_iov as well. While at it, reorder @type and @owner in struct net_iov to avoid a hole and increasing the struct size. This work was inspired by the following link: https://lore.kernel.org/all/582f41c0-2742-4400-9c81-0d46bf4e8314@gmail.com/ While at it, move the sanity check for page pool to on the free path. 
[byungchul@sk.com: gate the sanity check, per Johannes] Link: https://lkml.kernel.org/r/20260316223113.20097-1-byungchul@sk.com Link: https://lkml.kernel.org/r/20260224051347.19621-1-byungchul@sk.com Co-developed-by: Pavel Begunkov Signed-off-by: Pavel Begunkov Signed-off-by: Byungchul Park Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Acked-by: Zi Yan Acked-by: Vlastimil Babka Reviewed-by: Toke Høiland-Jørgensen Acked-by: Mike Rapoport (Microsoft) Acked-by: Johannes Weiner Acked-by: Jakub Kicinski Acked-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Cc: Alexei Starovoitov Cc: Andrew Lunn Cc: Baolin Wang Cc: Brendan Jackman Cc: Christian Brauner Cc: Daniel Borkmann Cc: David S. Miller Cc: David Wei Cc: Dragos Tatulea Cc: Eric Dumazet Cc: John Fastabend Cc: Leon Romanovsky Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Mark Bloch Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mina Almasry Cc: Paolo Abeni Cc: Saeed Mahameed Cc: Simon Horman Cc: Stanislav Fomichev Cc: Stehen Rothwell Cc: Suren Baghdasaryan Cc: Taehee Yoo Cc: Tariq Toukan Cc: Usama Arif Cc: Yu Zhao Signed-off-by: Andrew Morton --- .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 2 +- include/linux/mm.h | 27 +++---------------- include/linux/page-flags.h | 6 +++++ include/net/netmem.h | 15 +++++++++-- mm/page_alloc.c | 13 ++++++--- net/core/netmem_priv.h | 23 +++++++--------- net/core/page_pool.c | 24 +++++++++++++++-- 7 files changed, 64 insertions(+), 46 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 80f9fc10877a..7d90d2485c78 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -707,7 +707,7 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo); page = xdpi.page.page; - /* No need to check page_pool_page_is_pp() as we + /* No need to check PageNetpp() as we * know this is a page_pool 
page. */ page_pool_recycle_direct(pp_page_to_nmdesc(page)->pp, diff --git a/include/linux/mm.h b/include/linux/mm.h index 63d1f619260e..c758f4e68727 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4840,10 +4840,9 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); * DMA mapping IDs for page_pool * * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and - * stashes it in the upper bits of page->pp_magic. We always want to be able to - * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP - * pages can have arbitrary kernel pointers stored in the same field as pp_magic - * (since it overlaps with page->lru.next), so we must ensure that we cannot + * stashes it in the upper bits of page->pp_magic. Non-PP pages can have + * arbitrary kernel pointers stored in the same field as pp_magic (since + * it overlaps with page->lru.next), so we must ensure that we cannot * mistake a valid kernel pointer with any of the values we write into this * field. * @@ -4878,26 +4877,6 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); #define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ PP_DMA_INDEX_SHIFT) -/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is - * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for - * the head page of compound page and bit 1 for pfmemalloc page, as well as the - * bits used for the DMA index. page_is_pfmemalloc() is checked in - * __page_pool_put_page() to avoid recycling the pfmemalloc page. 
- */ -#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) - -#ifdef CONFIG_PAGE_POOL -static inline bool page_pool_page_is_pp(const struct page *page) -{ - return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; -} -#else -static inline bool page_pool_page_is_pp(const struct page *page) -{ - return false; -} -#endif - #define PAGE_SNAPSHOT_FAITHFUL (1 << 0) #define PAGE_SNAPSHOT_PG_BUDDY (1 << 1) #define PAGE_SNAPSHOT_PG_IDLE (1 << 2) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7223f6f4e2b4..0e03d816e8b9 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -923,6 +923,7 @@ enum pagetype { PGTY_zsmalloc = 0xf6, PGTY_unaccepted = 0xf7, PGTY_large_kmalloc = 0xf8, + PGTY_netpp = 0xf9, PGTY_mapcount_underflow = 0xff }; @@ -1055,6 +1056,11 @@ PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc) +/* + * Marks page_pool allocated pages. + */ +PAGE_TYPE_OPS(Netpp, netpp, netpp) + /** * PageHuge - Determine if the page belongs to hugetlbfs * @page: The page to test. diff --git a/include/net/netmem.h b/include/net/netmem.h index a96b3e5e5574..85e3b26ec547 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -110,10 +110,21 @@ struct net_iov { atomic_long_t pp_ref_count; }; }; - struct net_iov_area *owner; + + unsigned int page_type; enum net_iov_type type; + struct net_iov_area *owner; }; +/* Make sure 'the offset of page_type in struct page == the offset of + * type in struct net_iov'. + */ +#define NET_IOV_ASSERT_OFFSET(pg, iov) \ + static_assert(offsetof(struct page, pg) == \ + offsetof(struct net_iov, iov)) +NET_IOV_ASSERT_OFFSET(page_type, page_type); +#undef NET_IOV_ASSERT_OFFSET + struct net_iov_area { /* Array of net_iovs for this area. 
*/ struct net_iov *niovs; @@ -256,7 +267,7 @@ static inline unsigned long netmem_pfn_trace(netmem_ref netmem) */ #define pp_page_to_nmdesc(p) \ ({ \ - DEBUG_NET_WARN_ON_ONCE(!page_pool_page_is_pp(p)); \ + DEBUG_NET_WARN_ON_ONCE(!PageNetpp(p)); \ __pp_page_to_nmdesc(p); \ }) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f11f38ba2e12..fdcc2fde565b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1043,7 +1043,6 @@ static inline bool page_expected_state(struct page *page, #ifdef CONFIG_MEMCG page->memcg_data | #endif - page_pool_page_is_pp(page) | (page->flags.f & check_flags))) return false; @@ -1070,8 +1069,6 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; #endif - if (unlikely(page_pool_page_is_pp(page))) - bad_reason = "page_pool leak"; return bad_reason; } @@ -1380,9 +1377,17 @@ __always_inline bool __free_pages_prepare(struct page *page, mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); folio->mapping = NULL; } - if (unlikely(page_has_type(page))) + if (unlikely(page_has_type(page))) { + /* networking expects to clear its page type before releasing */ + if (is_check_pages_enabled()) { + if (unlikely(PageNetpp(page))) { + bad_page(page, "page_pool leak"); + return false; + } + } /* Reset the page_type (which overlays _mapcount) */ page->page_type = UINT_MAX; + } if (is_check_pages_enabled()) { if (free_page_is_bad(page)) diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index 23175cb2bd86..3e6fde8f1726 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -8,21 +8,18 @@ static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) return netmem_to_nmdesc(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; } -static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) -{ - netmem_to_nmdesc(netmem)->pp_magic |= pp_magic; -} - -static inline void netmem_clear_pp_magic(netmem_ref netmem) -{ - 
WARN_ON_ONCE(netmem_to_nmdesc(netmem)->pp_magic & PP_DMA_INDEX_MASK); - - netmem_to_nmdesc(netmem)->pp_magic = 0; -} - static inline bool netmem_is_pp(netmem_ref netmem) { - return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE; + struct page *page; + + /* XXX: Now that the offset of page_type is shared between + * struct page and net_iov, just cast the netmem to struct page + * unconditionally by clearing NET_IOV if any, no matter whether + * it comes from struct net_iov or struct page. This should be + * adjusted once the offset is no longer shared. + */ + page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); + return PageNetpp(page); } static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 265a729431bb..877bbf7a1938 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -702,8 +702,18 @@ s32 page_pool_inflight(const struct page_pool *pool, bool strict) void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) { + struct page *page; + netmem_set_pp(netmem, pool); - netmem_or_pp_magic(netmem, PP_SIGNATURE); + + /* XXX: Now that the offset of page_type is shared between + * struct page and net_iov, just cast the netmem to struct page + * unconditionally by clearing NET_IOV if any, no matter whether + * it comes from struct net_iov or struct page. This should be + * adjusted once the offset is no longer shared. 
+ */ + page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); + __SetPageNetpp(page); /* Ensuring all pages have been split into one fragment initially: * page_pool_set_pp_info() is only called once for every page when it @@ -718,7 +728,17 @@ void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) void page_pool_clear_pp_info(netmem_ref netmem) { - netmem_clear_pp_magic(netmem); + struct page *page; + + /* XXX: Now that the offset of page_type is shared between + * struct page and net_iov, just cast the netmem to struct page + * unconditionally by clearing NET_IOV if any, no matter whether + * it comes from struct net_iov or struct page. This should be + * adjusted once the offset is no longer shared. + */ + page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); + __ClearPageNetpp(page); + netmem_set_pp(netmem, NULL); } From d7651089939bf00fe033845e662905ce750b1f18 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 6 Mar 2026 16:05:49 +0100 Subject: [PATCH 183/369] ubsan: turn off kmsan inside of ubsan instrumentation The structure initialization in the two type mismatch handling functions causes a call to __msan_memset() to be generated inside of a UACCESS block, which in turn leads to an objtool warning about possibly leaking uaccess-enabled state: lib/ubsan.o: warning: objtool: __ubsan_handle_type_mismatch+0xda: call to __msan_memset() with UACCESS enabled lib/ubsan.o: warning: objtool: __ubsan_handle_type_mismatch_v1+0xf4: call to __msan_memset() with UACCESS enabled Most likely __msan_memset() is safe to be called here and could be added to the uaccess_safe_builtin[] list of safe functions, but seeing that the ubsan file itself already has kasan, ubsan and kcsan disabled itself, it is probably a good idea to also turn off kmsan here, in particular this also avoids the risk of recursing between ubsan and kcsan checks in other functions of this file. 
I saw this happen while testing randconfig builds with clang-22, but did not try older versions, or attempt to see which kernel change introduced the warning. Link: https://lkml.kernel.org/r/20260306150613.350029-1-arnd@kernel.org Signed-off-by: Arnd Bergmann Acked-by: Andrey Ryabinin Cc: Kees Cook Cc: Marco Elver Cc: Andrey Konovalov Cc: Bill Wendling Cc: Justin Stitt Cc: Nathan Chancellor Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- lib/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Makefile b/lib/Makefile index 1b9ee167517f..ce3ae929ae8c 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -307,6 +307,7 @@ obj-$(CONFIG_UBSAN) += ubsan.o UBSAN_SANITIZE_ubsan.o := n KASAN_SANITIZE_ubsan.o := n KCSAN_SANITIZE_ubsan.o := n +KMSAN_SANITIZE_ubsan.o := n CFLAGS_ubsan.o := -fno-stack-protector $(DISABLE_KSTACK_ERASE) obj-$(CONFIG_SBITMAP) += sbitmap.o From 5a14198ec63c130b7377911d3cf99c93547d5fb8 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Mon, 9 Mar 2026 14:25:02 -0700 Subject: [PATCH 184/369] mm/migrate_device: document folio_get requirement before frozen PMD split split_huge_pmd_address() with freeze=true splits a PMD migration entry into PTE migration entries, consuming one folio reference in the process. The folio_get() before it provides this reference. Add a comment explaining this relationship. The expected folio refcount at the start of migrate_vma_split_unmapped_folio() is 1. 
Link: https://lkml.kernel.org/r/20260309212502.3922825-1-usama.arif@linux.dev Signed-off-by: Usama Arif Suggested-by: Zi Yan Reviewed-by: Zi Yan Reviewed-by: Nico Pache Acked-by: David Hildenbrand (Arm) Reviewed-by: Wei Yang Cc: Alistair Popple Cc: Byungchul Park Cc: Gregory Price Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Joshua Hahn Cc: Matthew Brost Cc: Rakie Kim Cc: Ying Huang Signed-off-by: Andrew Morton --- mm/migrate_device.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 8079676c8f1f..2912eba575d5 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -914,6 +914,10 @@ static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate, unsigned long flags; int ret = 0; + /* + * take a reference, since split_huge_pmd_address() with freeze = true + * drops a reference at the end. + */ folio_get(folio); split_huge_pmd_address(migrate->vma, addr, true); ret = folio_split_unmapped(folio, 0); From 62f058287558af11f6d1af3d7597ff8599998b43 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 6 Mar 2026 07:29:04 -0800 Subject: [PATCH 185/369] mm/damon: add CONFIG_DAMON_DEBUG_SANITY Patch series "mm/damon: add optional debugging-purpose sanity checks". DAMON code has a few assumptions that can be critical if violated. Validating the assumptions in code can be useful at finding such critical bugs. I was actually adding some such additional sanity checks in my personal tree, and those were useful at finding bugs that I made during the development of new patches. We also found [1] sometimes the assumptions are misunderstood. The validation can work as good documentation for such cases. Add some of such debugging purpose sanity checks. Because those additional checks can impose more overhead, make those only optional via new config, CONFIG_DAMON_DEBUG_SANITY, that is recommended for only development and test setups. And as recommended, enable it for DAMON kunit tests and selftests. 
Note that the verification only WARN_ON() for each of the insanity. The developer or tester may better to set panic_on_oops together, like damon-tests/corr did [2]. This patch (of 10): Add a new build config that will enable additional DAMON sanity checks. It is recommended to be enabled on only development and test setups, since it can impose additional overhead. Link: https://lkml.kernel.org/r/20260306152914.86303-1-sj@kernel.org Link: https://lkml.kernel.org/r/20260306152914.86303-2-sj@kernel.org Link: https://lore.kernel.org/20251231070029.79682-1-sj@kernel.org [1] Link: https://github.com/damonitor/damon-tests/commit/a80fbee55e272f151b4e5809ee85898aea33e6ff [2] Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/Kconfig | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 8c868f7035fc..34631a44cdec 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -12,6 +12,17 @@ config DAMON See https://www.kernel.org/doc/html/latest/mm/damon/index.html for more information. +config DAMON_DEBUG_SANITY + bool "Check sanity of DAMON code" + depends on DAMON + help + This enables additional DAMON debugging-purpose sanity checks in + DAMON code. This can be useful for finding bugs, but impose + additional overhead. This is therefore recommended to be enabled on + only development and test setups. + + If unsure, say N. + config DAMON_KUNIT_TEST bool "Test for damon" if !KUNIT_ALL_TESTS depends on DAMON && KUNIT=y From b0264a951ced68313cbbd9f679c907a741004b58 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 6 Mar 2026 07:29:05 -0800 Subject: [PATCH 186/369] mm/damon/core: add damon_new_region() debug_sanity check damon_new_region() is supposed to be called with only valid address range arguments. Do the check under DAMON_DEBUG_SANITY. 
Link: https://lkml.kernel.org/r/20260306152914.86303-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/core.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 71ccea40368d..53b573b915c7 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -109,6 +109,17 @@ int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id) return err; } +#ifdef CONFIG_DAMON_DEBUG_SANITY +static void damon_verify_new_region(unsigned long start, unsigned long end) +{ + WARN_ONCE(start >= end, "start %lu >= end %lu\n", start, end); +} +#else +static void damon_verify_new_region(unsigned long start, unsigned long end) +{ +} +#endif + /* * Construct a damon_region struct * @@ -118,6 +129,7 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) { struct damon_region *region; + damon_verify_new_region(start, end); region = kmem_cache_alloc(damon_region_cache, GFP_KERNEL); if (!region) return NULL; From 9a647920d03d9da5ca4868d853a8f9267de1e070 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 6 Mar 2026 07:29:06 -0800 Subject: [PATCH 187/369] mm/damon/core: add damon_del_region() debug_sanity check damon_del_region() should be called for targets that have one or more regions. Add a sanity check for that under CONFIG_DAMON_DEBUG_SANITY. 
Link: https://lkml.kernel.org/r/20260306152914.86303-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/core.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 53b573b915c7..c499a02ac44e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -152,8 +152,21 @@ void damon_add_region(struct damon_region *r, struct damon_target *t) t->nr_regions++; } +#ifdef CONFIG_DAMON_DEBUG_SANITY +static void damon_verify_del_region(struct damon_target *t) +{ + WARN_ONCE(t->nr_regions == 0, "t->nr_regions == 0\n"); +} +#else +static void damon_verify_del_region(struct damon_target *t) +{ +} +#endif + static void damon_del_region(struct damon_region *r, struct damon_target *t) { + damon_verify_del_region(t); + list_del(&r->list); t->nr_regions--; } From 242a764abe149f297273be18fdb66cfeaf27dec2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 6 Mar 2026 07:29:07 -0800 Subject: [PATCH 188/369] mm/damon/core: add damon_nr_regions() debug_sanity check damon_target->nr_regions is introduced to get the number quickly without having to iterate regions always. Add a sanity check for that under CONFIG_DAMON_DEBUG_SANITY. 
Link: https://lkml.kernel.org/r/20260306152914.86303-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index c499a02ac44e..16bedde920f0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -545,8 +545,27 @@ void damon_destroy_target(struct damon_target *t, struct damon_ctx *ctx) damon_free_target(t); } +#ifdef CONFIG_DAMON_DEBUG_SANITY +static void damon_verify_nr_regions(struct damon_target *t) +{ + struct damon_region *r; + unsigned int count = 0; + + damon_for_each_region(r, t) + count++; + WARN_ONCE(count != t->nr_regions, "t->nr_regions (%u) != count (%u)\n", + t->nr_regions, count); +} +#else +static void damon_verify_nr_regions(struct damon_target *t) +{ +} +#endif + unsigned int damon_nr_regions(struct damon_target *t) { + damon_verify_nr_regions(t); + return t->nr_regions; } From 0bb7682fdb533be8412e5678eab246144d855138 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 6 Mar 2026 07:29:08 -0800 Subject: [PATCH 189/369] mm/damon/core: add damon_merge_two_regions() debug_sanity check A data corruption could cause damon_merge_two_regions() to create zero-length DAMON regions. Add a sanity check for that under CONFIG_DAMON_DEBUG_SANITY.
Link: https://lkml.kernel.org/r/20260306152914.86303-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/core.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 16bedde920f0..0fa694aff617 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2475,6 +2475,21 @@ static void kdamond_apply_schemes(struct damon_ctx *c) mutex_unlock(&c->walk_control_lock); } +#ifdef CONFIG_DAMON_DEBUG_SANITY +static void damon_verify_merge_two_regions( + struct damon_region *l, struct damon_region *r) +{ + /* damon_merge_two_regions() may created incorrect left region */ + WARN_ONCE(l->ar.start >= l->ar.end, "l: %lu-%lu, r: %lu-%lu\n", + l->ar.start, l->ar.end, r->ar.start, r->ar.end); +} +#else +static void damon_verify_merge_two_regions( + struct damon_region *l, struct damon_region *r) +{ +} +#endif + /* * Merge two adjacent regions into one region */ @@ -2488,6 +2503,7 @@ static void damon_merge_two_regions(struct damon_target *t, l->nr_accesses_bp = l->nr_accesses * 10000; l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r); l->ar.end = r->ar.end; + damon_verify_merge_two_regions(l, r); damon_destroy_region(r, t); } From c070da23913f66b7c08340a137622ba2f2203616 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 6 Mar 2026 07:29:09 -0800 Subject: [PATCH 190/369] mm/damon/core: add damon_merge_regions_of() debug_sanity check damon_merge_regions_of() should be called only after aggregation is finished and therefore each region's nr_accesses and nr_accesses_bp match. There were bugs that broke the assumption, during development of online DAMON parameter updates and monitoring results handling changes. Add a sanity check for that under CONFIG_DAMON_DEBUG_SANITY. 
Link: https://lkml.kernel.org/r/20260306152914.86303-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 0fa694aff617..3f55dfcb54e4 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2507,6 +2507,20 @@ static void damon_merge_two_regions(struct damon_target *t, damon_destroy_region(r, t); } +#ifdef CONFIG_DAMON_DEBUG_SANITY +static void damon_verify_merge_regions_of(struct damon_region *r) +{ + WARN_ONCE(r->nr_accesses != r->nr_accesses_bp / 10000, + "nr_accesses (%u) != nr_accesses_bp (%u)\n", + r->nr_accesses, r->nr_accesses_bp); +} +#else +static void damon_verify_merge_regions_of(struct damon_region *r) +{ +} +#endif + + /* * Merge adjacent regions having similar access frequencies * @@ -2520,6 +2534,7 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, struct damon_region *r, *prev = NULL, *next; damon_for_each_region_safe(r, next, t) { + damon_verify_merge_regions_of(r); if (abs(r->nr_accesses - r->last_nr_accesses) > thres) r->age = 0; else if ((r->nr_accesses == 0) != (r->last_nr_accesses == 0)) From 6aa1f783547896df3db1cf3dc5198f03df5ce775 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 6 Mar 2026 07:29:10 -0800 Subject: [PATCH 191/369] mm/damon/core: add damon_split_region_at() debug_sanity check damon_split_region_at() should be called with the correct address to split on. Add a sanity check for that under CONFIG_DAMON_DEBUG_SANITY. 
Link: https://lkml.kernel.org/r/20260306152914.86303-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/core.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 3f55dfcb54e4..f3e5400914cb 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2588,6 +2588,21 @@ static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold, threshold / 2 < max_thres); } +#ifdef CONFIG_DAMON_DEBUG_SANITY +static void damon_verify_split_region_at(struct damon_region *r, + unsigned long sz_r) +{ + WARN_ONCE(sz_r == 0 || sz_r >= damon_sz_region(r), + "sz_r: %lu r: %lu-%lu (%lu)\n", + sz_r, r->ar.start, r->ar.end, damon_sz_region(r)); +} +#else +static void damon_verify_split_region_at(struct damon_region *r, + unsigned long sz_r) +{ +} +#endif + /* * Split a region in two * @@ -2599,6 +2614,7 @@ static void damon_split_region_at(struct damon_target *t, { struct damon_region *new; + damon_verify_split_region_at(r, sz_r); new = damon_new_region(r->ar.start + sz_r, r->ar.end); if (!new) return; From c556187b6e247b5d817b4392fe2ba6d299515795 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 6 Mar 2026 07:29:11 -0800 Subject: [PATCH 192/369] mm/damon/core: add damon_reset_aggregated() debug_sanity check At time of damon_reset_aggregated(), aggregation of the interval should be completed, and hence nr_accesses and nr_accesses_bp should match. I found a few bugs that caused it to be broken in the past, from online parameters update and complicated nr_accesses handling changes. Add a sanity check for that under CONFIG_DAMON_DEBUG_SANITY.
Link: https://lkml.kernel.org/r/20260306152914.86303-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/core.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index f3e5400914cb..66d8e9b1adcf 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1668,6 +1668,23 @@ static void damon_warn_fix_nr_accesses_corruption(struct damon_region *r) r->nr_accesses_bp = r->nr_accesses * 10000; } +#ifdef CONFIG_DAMON_DEBUG_SANITY +static void damon_verify_reset_aggregated(struct damon_region *r, + struct damon_ctx *c) +{ + WARN_ONCE(r->nr_accesses_bp != r->last_nr_accesses * 10000, + "nr_accesses_bp %u last_nr_accesses %u sis %lu %lu\n", + r->nr_accesses_bp, r->last_nr_accesses, + c->passed_sample_intervals, c->next_aggregation_sis); +} +#else +static void damon_verify_reset_aggregated(struct damon_region *r, + struct damon_ctx *c) +{ +} +#endif + + /* * Reset the aggregated monitoring results ('nr_accesses' of each region). */ @@ -1684,6 +1701,7 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) damon_warn_fix_nr_accesses_corruption(r); r->last_nr_accesses = r->nr_accesses; r->nr_accesses = 0; + damon_verify_reset_aggregated(r, c); } ti++; } From 09cbdf7dbe2334d32853ad3ba3b54df017d7a37b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 6 Mar 2026 07:29:12 -0800 Subject: [PATCH 193/369] mm/damon/tests/.kunitconifg: enable DAMON_DEBUG_SANITY CONFIG_DAMON_DEBUG_SANITY is recommended for DAMON development and test setups. Enable it on the default configurations for DAMON kunit test run. 
Link: https://lkml.kernel.org/r/20260306152914.86303-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/tests/.kunitconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/tests/.kunitconfig b/mm/damon/tests/.kunitconfig index 36a450f57b58..144d27e6ecc5 100644 --- a/mm/damon/tests/.kunitconfig +++ b/mm/damon/tests/.kunitconfig @@ -13,3 +13,6 @@ CONFIG_DAMON_VADDR_KUNIT_TEST=y CONFIG_SYSFS=y CONFIG_DAMON_SYSFS=y CONFIG_DAMON_SYSFS_KUNIT_TEST=y + +# enable DAMON_DEBUG_SANITY to catch any bug +CONFIG_DAMON_DEBUG_SANITY=y From 300252ebb116e8f9908a5f52203984b1bc15cc78 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 6 Mar 2026 07:29:13 -0800 Subject: [PATCH 194/369] selftests/damon/config: enable DAMON_DEBUG_SANITY CONFIG_DAMON_DEBUG_SANITY is recommended for DAMON development and test setups. Enable it on the build config for DAMON selftests. Link: https://lkml.kernel.org/r/20260306152914.86303-11-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/config | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/damon/config b/tools/testing/selftests/damon/config index a68a9fead5dc..6304adacb741 100644 --- a/tools/testing/selftests/damon/config +++ b/tools/testing/selftests/damon/config @@ -4,3 +4,4 @@ CONFIG_DAMON_PADDR=y CONFIG_DAMON_VADDR=y CONFIG_DAMON_RECLAIM=y CONFIG_DAMON_LRU_SORT=y +CONFIG_DAMON_DEBUG_SANITY=y From a260de7d45ea9144645748b1c54896dea6f79655 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 7 Mar 2026 11:42:20 -0800 Subject: [PATCH 195/369] mm/damon/tests/core-kunit: add a test for damon_commit_ctx() Patch series "mm/damon: test and document power-of-2 min_region_sz requirement". Since commit c80f46ac228b ("mm/damon/core: disallow non-power of two min_region_sz"), min_region_sz is always restricted to be a power of two. 
Add a kunit test to confirm the functionality. Also, the change adds a restriction to addr_unit parameter. Clarify it on the document. This patch (of 2): Add a kunit test for confirming the change that is made on commit c80f46ac228b ("mm/damon/core: disallow non-power of two min_region_sz") functions as expected. Link: https://lkml.kernel.org/r/20260307194222.202075-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: SeongJae Park Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index fcc1336b234c..2289f9e4610c 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -1057,6 +1057,27 @@ static void damon_test_commit_target_regions(struct kunit *test) (unsigned long[][2]) {{3, 8}, {8, 10}}, 2); } +static void damon_test_commit_ctx(struct kunit *test) +{ + struct damon_ctx *src, *dst; + + src = damon_new_ctx(); + if (!src) + kunit_skip(test, "src alloc fail"); + dst = damon_new_ctx(); + if (!dst) { + damon_destroy_ctx(src); + kunit_skip(test, "dst alloc fail"); + } + /* Only power of two min_region_sz is allowed. 
*/ + src->min_region_sz = 4096; + KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), 0); + src->min_region_sz = 4095; + KUNIT_EXPECT_EQ(test, damon_commit_ctx(dst, src), -EINVAL); + damon_destroy_ctx(src); + damon_destroy_ctx(dst); +} + static void damos_test_filter_out(struct kunit *test) { struct damon_target *t; @@ -1313,6 +1334,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damos_test_commit_pageout), KUNIT_CASE(damos_test_commit_migrate_hot), KUNIT_CASE(damon_test_commit_target_regions), + KUNIT_CASE(damon_test_commit_ctx), KUNIT_CASE(damos_test_filter_out), KUNIT_CASE(damon_test_feed_loop_next_input), KUNIT_CASE(damon_test_set_filters_default_reject), From bfb1523cde1bf925822ec8783e45055a0c599860 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 7 Mar 2026 11:42:21 -0800 Subject: [PATCH 196/369] Docs/mm/damon/design: document the power-of-two limitation for addr_unit The min_region_sz is set as max(DAMON_MIN_REGION_SZ / addr_unit, 1). DAMON_MIN_REGION_SZ is the same to PAGE_SIZE, and addr_unit is what the user can arbitrarily set. Commit c80f46ac228b ("mm/damon/core: disallow non-power of two min_region_sz") made min_region_sz to always be a power of two. Hence, addr_unit should be a power of two when it is smaller than PAGE_SIZE. While 'addr_unit' is a user-exposed parameter, the rule is not documented. This can confuse users. Specifically, if the user sets addr_unit as a value that is smaller than PAGE_SIZE and not a power of two, the setup will explicitly fail. Document the rule on the design document. Usage documents reference the design document for detail, so updating only the design document should suffice. 
Link: https://lkml.kernel.org/r/20260307194222.202075-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index dd64f5d7f319..ac795f30519c 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -150,6 +150,8 @@ address on the given address space. Support of ``address unit`` parameter is up to each operations set implementation. ``paddr`` is the only operations set implementation that supports the parameter. +If the value is smaller than ``PAGE_SIZE``, only a power of two should be used. + .. _damon_core_logic: Core Logics From 7e6c650fdbdb76bb6db74d3af025b7327707e6a0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 7 Mar 2026 11:49:12 -0800 Subject: [PATCH 197/369] mm/damon/core: remove damos_set_next_apply_sis() duplicates Patch series "mm/damon/core: make passed_sample_intervals comparisons overflow-safe". DAMON accounts time using its own jiffies-like time counter, namely damon_ctx->passed_sample_intervals. The counter is incremented on each iteration of kdamond_fn() main loop, which sleeps at least one sample interval. Hence the name is like that. DAMON has time-periodic operations including monitoring results aggregation and DAMOS action application. DAMON sets the next time to do each of such operations in the passed_sample_intervals unit. And it does the operation when the counter becomes the same to or larger than the pre-set values, and update the next time for the operation. Note that the operation is done not only when the values exactly match but also when the time is passed, because the values can be updated for online-committed DAMON parameters. 
The counter is 'unsigned long' type, and the comparison is done using normal comparison operators. It is not safe from overflows. This can cause rare and limited but odd situations. Let's suppose there is an operation that should be executed every 20 sampling intervals, and the passed_sample_intervals value for next execution of the operation is ULONG_MAX - 3. Once the passed_sample_intervals reaches ULONG_MAX - 3, the operation will be executed, and the next time value for doing the operation becomes 17 (ULONG_MAX - 3 + 20), since overflow happens. In the next iteration of the kdamond_fn() main loop, passed_sample_intervals is larger than the next operation time value, so the operation will be executed again. It will continue executing the operation for each iteration, until the passed_sample_intervals also overflows. Note that this will not be common and problematic in the real world. The sampling interval, which takes for each passed_sample_intervals increment, is 5 ms by default. And it is usually [auto-]tuned for hundreds of milliseconds. That means it takes about 248 days or 4,971 days to have the overflow on 32 bit machines when the sampling interval is 5 ms and 100 ms, respectively (1<<32 * sampling_interval_in_seconds / 3600 / 24). On 64 bit machines, the numbers become 2924712086.77536 and 58494241735.5072 years. So the real user impact is negligible. But still this is better to be fixed as long as the fix is simple and efficient. Fix this by simply replacing the overflow-unsafe native comparison operators with the existing overflow-safe time comparison helpers. The first patch only cleans up the next DAMOS action application time setup for consistency and reduced code. The second and the third patches update DAMOS action application time setup and rest, respectively. This patch (of 3): There is a function for damos->next_apply_sis setup. But some places are open-coding it. Consistently use the helper. 
Link: https://lkml.kernel.org/r/20260307194915.203169-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 66d8e9b1adcf..c8c2e4660b98 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2452,8 +2452,6 @@ static void kdamond_apply_schemes(struct damon_ctx *c) struct damon_target *t; struct damon_region *r; struct damos *s; - unsigned long sample_interval = c->attrs.sample_interval ? - c->attrs.sample_interval : 1; bool has_schemes_to_apply = false; damon_for_each_scheme(s, c) { @@ -2484,9 +2482,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) if (c->passed_sample_intervals < s->next_apply_sis) continue; damos_walk_complete(c, s); - s->next_apply_sis = c->passed_sample_intervals + - (s->apply_interval_us ? s->apply_interval_us : - c->attrs.aggr_interval) / sample_interval; + damos_set_next_apply_sis(s, c); s->last_applied = NULL; damos_trace_stat(c, s); } @@ -2864,7 +2860,6 @@ static void kdamond_init_ctx(struct damon_ctx *ctx) { unsigned long sample_interval = ctx->attrs.sample_interval ? ctx->attrs.sample_interval : 1; - unsigned long apply_interval; struct damos *scheme; ctx->passed_sample_intervals = 0; @@ -2875,9 +2870,7 @@ static void kdamond_init_ctx(struct damon_ctx *ctx) ctx->attrs.intervals_goal.aggrs; damon_for_each_scheme(scheme, ctx) { - apply_interval = scheme->apply_interval_us ? 
- scheme->apply_interval_us : ctx->attrs.aggr_interval; - scheme->next_apply_sis = apply_interval / sample_interval; + damos_set_next_apply_sis(scheme, ctx); damos_set_filters_default_reject(scheme); } } From f05e253637837b603682173ed6bd0bc873dc0496 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 7 Mar 2026 11:49:13 -0800 Subject: [PATCH 198/369] mm/damon/core: use time_before() for next_apply_sis damon_ctx->passed_sample_intervals and damos->next_apply_sis are unsigned long, and compared via normal comparison operators. It is unsafe from overflow. Use time_before(), which is safe from overflow when correctly used, instead. Link: https://lkml.kernel.org/r/20260307194915.203169-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index c8c2e4660b98..ac06465cd9eb 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2097,7 +2097,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - if (c->passed_sample_intervals < s->next_apply_sis) + if (time_before(c->passed_sample_intervals, s->next_apply_sis)) continue; if (!s->wmarks.activated) @@ -2455,7 +2455,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) bool has_schemes_to_apply = false; damon_for_each_scheme(s, c) { - if (c->passed_sample_intervals < s->next_apply_sis) + if (time_before(c->passed_sample_intervals, s->next_apply_sis)) continue; if (!s->wmarks.activated) @@ -2479,7 +2479,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) } damon_for_each_scheme(s, c) { - if (c->passed_sample_intervals < s->next_apply_sis) + if (time_before(c->passed_sample_intervals, s->next_apply_sis)) continue; damos_walk_complete(c, s); damos_set_next_apply_sis(s, c); From 23754a36cd1ce888a136c762080c89e4d7b364d9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 7 Mar 2026 11:49:14 
-0800 Subject: [PATCH 199/369] mm/damon/core: use time_after_eq() in kdamond_fn() damon_ctx->passed_sample_intervals and damon_ctx->next_*_sis are unsigned long. Those are compared in kdamond_fn() using normal comparison operators. It is unsafe from overflow. Use time_after_eq(), which is safe from overflows when correctly used, instead. Link: https://lkml.kernel.org/r/20260307194915.203169-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index ac06465cd9eb..0c167bbc9c1c 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2921,7 +2921,8 @@ static int kdamond_fn(void *data) if (ctx->ops.check_accesses) max_nr_accesses = ctx->ops.check_accesses(ctx); - if (ctx->passed_sample_intervals >= next_aggregation_sis) { + if (time_after_eq(ctx->passed_sample_intervals, + next_aggregation_sis)) { kdamond_merge_regions(ctx, max_nr_accesses / 10, sz_limit); @@ -2943,10 +2944,12 @@ static int kdamond_fn(void *data) sample_interval = ctx->attrs.sample_interval ? ctx->attrs.sample_interval : 1; - if (ctx->passed_sample_intervals >= next_aggregation_sis) { + if (time_after_eq(ctx->passed_sample_intervals, + next_aggregation_sis)) { if (ctx->attrs.intervals_goal.aggrs && - ctx->passed_sample_intervals >= - ctx->next_intervals_tune_sis) { + time_after_eq( + ctx->passed_sample_intervals, + ctx->next_intervals_tune_sis)) { /* * ctx->next_aggregation_sis might be updated * from kdamond_call(). 
In the case, @@ -2980,7 +2983,8 @@ static int kdamond_fn(void *data) kdamond_split_regions(ctx); } - if (ctx->passed_sample_intervals >= next_ops_update_sis) { + if (time_after_eq(ctx->passed_sample_intervals, + next_ops_update_sis)) { ctx->next_ops_update_sis = next_ops_update_sis + ctx->attrs.ops_update_interval / sample_interval; From 5d6a520aff230c78df202a90429bc6fce2a11791 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 7 Mar 2026 11:53:49 -0800 Subject: [PATCH 200/369] mm/damon/core: use mult_frac() Patch series "mm/damon: improve/fixup/update ratio calculation, test and documentation". Yet another batch of misc/minor improvements and fixups. Use mult_frac() instead of the worse open-coding for rate calculations (patch 1). Add a test for a previously found and fixed bug (patch 2). Improve and update comments and documentations for easier code review and up-to-date information (patches 3-6). Finally, fix an obvious typo (patch 7). This patch (of 7): There are multiple places in core code that do open-code rate calculations. Use mult_frac(), which is developed for doing that in a way more safe from overflow and precision loss. 
Link: https://lkml.kernel.org/r/20260307195356.203753-1-sj@kernel.org Link: https://lkml.kernel.org/r/20260307195356.203753-2-sj@kernel.org Signed-off-by: SeongJae Park Acked-by: wang lian Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/core.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 0c167bbc9c1c..db44294745e6 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -665,7 +665,7 @@ static unsigned int damon_accesses_bp_to_nr_accesses( static unsigned int damon_nr_accesses_to_accesses_bp( unsigned int nr_accesses, struct damon_attrs *attrs) { - return nr_accesses * 10000 / damon_max_nr_accesses(attrs); + return mult_frac(nr_accesses, 10000, damon_max_nr_accesses(attrs)); } static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses, @@ -1724,7 +1724,7 @@ static unsigned long damon_get_intervals_score(struct damon_ctx *c) } target_access_events = max_access_events * goal_bp / 10000; target_access_events = target_access_events ? 
: 1; - return access_events * 10000 / target_access_events; + return mult_frac(access_events, 10000, target_access_events); } static unsigned long damon_feed_loop_next_input(unsigned long last_input, @@ -2204,7 +2204,7 @@ static __kernel_ulong_t damos_get_node_mem_bp( numerator = i.totalram - i.freeram; else /* DAMOS_QUOTA_NODE_MEM_FREE_BP */ numerator = i.freeram; - return numerator * 10000 / i.totalram; + return mult_frac(numerator, 10000, i.totalram); } static unsigned long damos_get_node_memcg_used_bp( @@ -2237,7 +2237,7 @@ static unsigned long damos_get_node_memcg_used_bp( numerator = used_pages; else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */ numerator = i.totalram - used_pages; - return numerator * 10000 / i.totalram; + return mult_frac(numerator, 10000, i.totalram); } #else static __kernel_ulong_t damos_get_node_mem_bp( @@ -2267,8 +2267,8 @@ static unsigned int damos_get_in_active_mem_bp(bool active_ratio) global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); total = active + inactive; if (active_ratio) - return active * 10000 / total; - return inactive * 10000 / total; + return mult_frac(active, 10000, total); + return mult_frac(inactive, 10000, total); } static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) @@ -2311,8 +2311,8 @@ static unsigned long damos_quota_score(struct damos_quota *quota) damos_for_each_quota_goal(goal, quota) { damos_set_quota_goal_current_value(goal); highest_score = max(highest_score, - goal->current_value * 10000 / - goal->target_value); + mult_frac(goal->current_value, 10000, + goal->target_value)); } return highest_score; @@ -2342,8 +2342,8 @@ static void damos_set_effective_quota(struct damos_quota *quota) if (quota->ms) { if (quota->total_charged_ns) - throughput = mult_frac(quota->total_charged_sz, 1000000, - quota->total_charged_ns); + throughput = mult_frac(quota->total_charged_sz, + 1000000, quota->total_charged_ns); else throughput = PAGE_SIZE * 1024; esz = min(throughput * quota->ms, esz); From 
fd83b0d1c49b7e52bdcd8704df04fdbae0c24194 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 7 Mar 2026 11:53:50 -0800 Subject: [PATCH 201/369] mm/damon/tests/core-kunit: add a test for damon_is_last_region() There was a bug [1] in damon_is_last_region(). Add a kunit test to not reintroduce the bug. Link: https://lkml.kernel.org/r/20260307195356.203753-3-sj@kernel.org Link: https://lore.kernel.org/20260114152049.99727-1-sj@kernel.org/ [1] Signed-off-by: SeongJae Park Tested-by: wang lian Reviewed-by: wang lian Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 2289f9e4610c..e86d4f4fe261 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -1311,6 +1311,28 @@ static void damon_test_apply_min_nr_regions(struct kunit *test) damon_test_apply_min_nr_regions_for(test, 10, 2, 10, 2, 5); } +static void damon_test_is_last_region(struct kunit *test) +{ + struct damon_region *r; + struct damon_target *t; + int i; + + t = damon_new_target(); + if (!t) + kunit_skip(test, "target alloc fail\n"); + + for (i = 0; i < 4; i++) { + r = damon_new_region(i * 2, (i + 1) * 2); + if (!r) { + damon_free_target(t); + kunit_skip(test, "region alloc %d fail\n", i); + } + damon_add_region(r, t); + KUNIT_EXPECT_TRUE(test, damon_is_last_region(r, t)); + } + damon_free_target(t); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -1339,6 +1361,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_feed_loop_next_input), KUNIT_CASE(damon_test_set_filters_default_reject), KUNIT_CASE(damon_test_apply_min_nr_regions), + 
KUNIT_CASE(damon_test_is_last_region), {}, }; From 2a5f4454e00e630a3535b851fbe4f6e5d228cc5a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 7 Mar 2026 11:53:51 -0800 Subject: [PATCH 202/369] mm/damon/core: clarify damon_set_attrs() usages damon_set_attrs() is called for multiple purposes from multiple places. Calling it in an unsafe context can make DAMON internal state polluted and results in unexpected behaviors. Clarify when it is safe, and where it is being called. Link: https://lkml.kernel.org/r/20260307195356.203753-4-sj@kernel.org Signed-off-by: SeongJae Park Acked-by: wang lian Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/core.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index db44294745e6..2d2332f3d377 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -751,8 +751,16 @@ static bool damon_valid_intervals_goal(struct damon_attrs *attrs) * @ctx: monitoring context * @attrs: monitoring attributes * - * This function should be called while the kdamond is not running, an access - * check results aggregation is not ongoing (e.g., from damon_call(). + * This function updates monitoring results and next monitoring/damos operation + * schedules. Because those are periodically updated by kdamond, this should + * be called from a safe contexts. Such contexts include damon_ctx setup time + * while the kdamond is not yet started, and inside of kdamond_fn(). + * + * In detail, all DAMON API callers directly call this function for initial + * setup of damon_ctx before calling damon_start(). Some of the API callers + * also indirectly call this function via damon_call() -> damon_commit() for + * online parameters updates. 
Finally, kdamond_fn() itself uses this for + applying auto-tuned monitoring intervals. * * Every time interval is in micro-seconds. * From 3802e1d98e92ca6abdd25446b802f405fef83da0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 7 Mar 2026 11:53:52 -0800 Subject: [PATCH 203/369] mm/damon: document non-zero length damon_region assumption DAMON regions are assumed to always be non-zero length. There was a confusion [1] about it, probably due to lack of the documentation. Document it. Link: https://lkml.kernel.org/r/20260307195356.203753-5-sj@kernel.org Link: https://lore.kernel.org/20251231070029.79682-1-sj@kernel.org/ [1] Signed-off-by: SeongJae Park Acked-by: wang lian Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 60e6da3012fa..7d0265d02954 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -55,6 +55,8 @@ struct damon_size_range { * @list: List head for siblings. * @age: Age of this region. * + * For any use case, @ar should be non-zero positive size. + * * @nr_accesses is reset to zero for every &damon_attrs->aggr_interval and be * increased for every &damon_attrs->sample_interval if an access to the region * during the last sampling interval is found. The update of this field should From d7f00084f6863a243b396200c81e83ae302c5a76 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 7 Mar 2026 11:53:53 -0800 Subject: [PATCH 204/369] Docs/admin-guide/mm/damon/lru_sort: fix intervals autotune parameter name The section name should be the same as the parameter name. Fix it. 
Link: https://lkml.kernel.org/r/20260307195356.203753-6-sj@kernel.org Fixes: ed581147a417 ("Docs/admin-guide/mm/damon/lru_sort: document intervals autotuning") Signed-off-by: SeongJae Park Acked-by: wang lian Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/lru_sort.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/lru_sort.rst b/Documentation/admin-guide/mm/damon/lru_sort.rst index 20a8378d5a94..73980bacc3a0 100644 --- a/Documentation/admin-guide/mm/damon/lru_sort.rst +++ b/Documentation/admin-guide/mm/damon/lru_sort.rst @@ -91,8 +91,8 @@ increases and decreases the effective level of the quota aiming the LRU Disabled by default. -Auto-tune monitoring intervals ------------------------------- +autotune_monitoring_intervals +----------------------------- If this parameter is set as ``Y``, DAMON_LRU_SORT automatically tunes DAMON's sampling and aggregation intervals. The auto-tuning aims to capture meaningful From 20675fc8c02217c8bea5c7a0aedec29e5c4fb426 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 7 Mar 2026 11:53:54 -0800 Subject: [PATCH 205/369] Docs/mm/damon/maintainer-profile: use flexible review cadence The document mentions the maintainer is working in the usual 9-5 fashion. The maintainer nowadays prefers working in a more flexible way. Update the document to avoid contributors having a wrong time expectation. 
Link: https://lkml.kernel.org/r/20260307195356.203753-7-sj@kernel.org Signed-off-by: SeongJae Park Acked-by: wang lian Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/mm/damon/maintainer-profile.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/mm/damon/maintainer-profile.rst b/Documentation/mm/damon/maintainer-profile.rst index 41b1d73b9bd7..bcb9798a27a8 100644 --- a/Documentation/mm/damon/maintainer-profile.rst +++ b/Documentation/mm/damon/maintainer-profile.rst @@ -63,10 +63,10 @@ management subsystem maintainer. Review cadence -------------- -The DAMON maintainer does the work on the usual work hour (09:00 to 17:00, -Mon-Fri) in PT (Pacific Time). The response to patches will occasionally be -slow. Do not hesitate to send a ping if you have not heard back within a week -of sending a patch. +The DAMON maintainer usually work in a flexible way, except early morning in PT +(Pacific Time). The response to patches will occasionally be slow. Do not +hesitate to send a ping if you have not heard back within a week of sending a +patch. Mailing tool ------------ From a4e82de81fe59d5bfcc9450145e8e108561f2e07 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 7 Mar 2026 11:53:55 -0800 Subject: [PATCH 206/369] Docs/mm/damon/index: fix typo: autoamted -> automated There is an obvious typo. Fix it (s/autoamted/automated/). 
Link: https://lkml.kernel.org/r/20260307195356.203753-8-sj@kernel.org Fixes: 32d11b320897 ("Docs/mm/damon/index: simplify the intro") Signed-off-by: SeongJae Park Acked-by: wang lian Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/mm/damon/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/mm/damon/index.rst b/Documentation/mm/damon/index.rst index 82f6c5eea49a..318f6a7bfea4 100644 --- a/Documentation/mm/damon/index.rst +++ b/Documentation/mm/damon/index.rst @@ -12,7 +12,7 @@ DAMON is a Linux kernel subsystem for efficient :ref:`data access monitoring - *light-weight* (for production online usages), - *scalable* (in terms of memory size), - *tunable* (for flexible usages), and - - *autoamted* (for production operation without manual tunings). + - *automated* (for production operation without manual tunings). .. toctree:: :maxdepth: 2 From 1eba4c9599fa1de9308d45d289fcb944b4746526 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 11 Mar 2026 00:18:37 +0900 Subject: [PATCH 207/369] docs: mm: fix typo in numa_memory_policy.rst Fix a typo: MPOL_INTERLEAVED -> MPOL_INTERLEAVE. 
Link: https://lkml.kernel.org/r/20260310151837.5888-1-akinobu.mita@gmail.com Signed-off-by: Akinobu Mita Reviewed-by: Andrew Morton Acked-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/numa_memory_policy.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst index a70f20ce1ffb..90ab26e805a9 100644 --- a/Documentation/admin-guide/mm/numa_memory_policy.rst +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst @@ -217,7 +217,7 @@ MPOL_PREFERRED the MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES flags described below. -MPOL_INTERLEAVED +MPOL_INTERLEAVE This mode specifies that page allocations be interleaved, on a page granularity, across the nodes specified in the policy. This mode also behaves slightly differently, based on the From 341ffe82a7a3a1e0756b58999405b6df0c2b3e8d Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 9 Mar 2026 16:18:58 +0100 Subject: [PATCH 208/369] mm: move vma_kernel_pagesize() from hugetlb to mm.h Patch series "mm: move vma_(kernel|mmu)_pagesize() out of hugetlb.c", v2. Looking into vma_(kernel|mmu)_pagesize(), I realized that there is one scenario where DAX would not do the right thing when the kernel is not compiled with hugetlb support. Without hugetlb support, vma_(kernel|mmu)_pagesize() will always return PAGE_SIZE instead of using the ->pagesize() result provided by dax-device code. Fix that by moving vma_kernel_pagesize() to core MM code, where it belongs. I don't think this is stable material, but am not 100% sure. Also, move vma_mmu_pagesize() while at it. Remove the unnecessary hugetlb.h inclusion from KVM code. This patch (of 4): In the past, only hugetlb had special "vma_kernel_pagesize()" requirements, so it provided its own implementation. 
In commit 05ea88608d4e ("mm, hugetlbfs: introduce ->pagesize() to vm_operations_struct") we generalized that approach by providing a vm_ops->pagesize() callback to be used by device-dax. Once device-dax started using that callback in commit c1d53b92b95c ("device-dax: implement ->pagesize() for smaps to report MMUPageSize") it was missed that CONFIG_DEV_DAX does not depend on hugetlb support. So building a kernel with CONFIG_DEV_DAX but without CONFIG_HUGETLBFS would not pick up that value. Fix it by moving vma_kernel_pagesize() to mm.h, providing only a single implementation. While at it, improve the kerneldoc a bit. Ideally, we'd move vma_mmu_pagesize() as well to the header. However, its __weak symbol might be overwritten by a PPC variant in hugetlb code. So let's leave it in there for now, as it really only matters for some hugetlb oddities. This was found by code inspection. Link: https://lkml.kernel.org/r/20260309151901.123947-1-david@kernel.org Link: https://lkml.kernel.org/r/20260309151901.123947-2-david@kernel.org Fixes: c1d53b92b95c ("device-dax: implement ->pagesize() for smaps to report MMUPageSize") Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Mike Rapoport (Microsoft) Cc: Dan Williams Cc: "Christophe Leroy (CS GROUP)" Cc: Jann Horn Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Pedro Falcato Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 7 ------- include/linux/mm.h | 20 ++++++++++++++++++++ mm/hugetlb.c | 17 ----------------- 3 files changed, 20 insertions(+), 24 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 65910437be1c..44c1848a2c21 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -777,8 +777,6 @@ static inline unsigned long huge_page_size(const struct hstate *h) return (unsigned long)PAGE_SIZE << h->order; } 
-extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma); - extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); static inline unsigned long huge_page_mask(struct hstate *h) @@ -1177,11 +1175,6 @@ static inline unsigned long huge_page_mask(struct hstate *h) return PAGE_MASK; } -static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) -{ - return PAGE_SIZE; -} - static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { return PAGE_SIZE; diff --git a/include/linux/mm.h b/include/linux/mm.h index c758f4e68727..e62cea754b0e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1351,6 +1351,26 @@ static inline bool vma_is_shared_maywrite(const struct vm_area_struct *vma) return is_shared_maywrite(&vma->flags); } +/** + * vma_kernel_pagesize - Default page size granularity for this VMA. + * @vma: The user mapping. + * + * The kernel page size specifies in which granularity VMA modifications + * can be performed. Folios in this VMA will be aligned to, and at least + * the size of the number of bytes returned by this function. + * + * The default kernel page size is not affected by Transparent Huge Pages + * being in effect. + * + * Return: The default page size granularity for this VMA. + */ +static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) +{ + if (unlikely(vma->vm_ops && vma->vm_ops->pagesize)) + return vma->vm_ops->pagesize(vma); + return PAGE_SIZE; +} + static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 992c1632d26a..66761ae5ce71 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1017,23 +1017,6 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } -/** - * vma_kernel_pagesize - Page size granularity for this VMA. - * @vma: The user mapping. 
- * - * Folios in this VMA will be aligned to, and at least the size of the - * number of bytes returned by this function. - * - * Return: The default size of the folios allocated when backing a VMA. - */ -unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) -{ - if (vma->vm_ops && vma->vm_ops->pagesize) - return vma->vm_ops->pagesize(vma); - return PAGE_SIZE; -} -EXPORT_SYMBOL_GPL(vma_kernel_pagesize); - /* * Return the page size being used by the MMU to back a VMA. In the majority * of cases, the page size used by the kernel matches the MMU size. On From a9496e9e4b7c5785e82000a26b1118b4a1fd85c7 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 9 Mar 2026 16:18:59 +0100 Subject: [PATCH 209/369] mm: move vma_mmu_pagesize() from hugetlb to vma.c vma_mmu_pagesize() is also queried on non-hugetlb VMAs and does not really belong into hugetlb.c. PPC64 provides a custom overwrite with CONFIG_HUGETLB_PAGE, see arch/powerpc/mm/book3s64/slice.c, so we cannot easily make this a static inline function. So let's move it to vma.c and add some proper kerneldoc. To make vma tests happy, add a simple vma_kernel_pagesize() stub in tools/testing/vma/include/custom.h. 
Link: https://lkml.kernel.org/r/20260309151901.123947-3-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Mike Rapoport (Microsoft) Cc: "Christophe Leroy (CS GROUP)" Cc: Dan Williams Cc: Jann Horn Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Pedro Falcato Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 7 ------- include/linux/mm.h | 2 ++ mm/hugetlb.c | 11 ----------- mm/vma.c | 21 +++++++++++++++++++++ tools/testing/vma/include/custom.h | 5 +++++ 5 files changed, 28 insertions(+), 18 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 44c1848a2c21..aaf3d472e6b5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -777,8 +777,6 @@ static inline unsigned long huge_page_size(const struct hstate *h) return (unsigned long)PAGE_SIZE << h->order; } -extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); - static inline unsigned long huge_page_mask(struct hstate *h) { return h->mask; @@ -1175,11 +1173,6 @@ static inline unsigned long huge_page_mask(struct hstate *h) return PAGE_MASK; } -static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) -{ - return PAGE_SIZE; -} - static inline unsigned int huge_page_order(struct hstate *h) { return 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index e62cea754b0e..efb8be5d259c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1371,6 +1371,8 @@ static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) return PAGE_SIZE; } +unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); + static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 66761ae5ce71..a786034ac95c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1017,17 +1017,6 @@ static pgoff_t 
vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } -/* - * Return the page size being used by the MMU to back a VMA. In the majority - * of cases, the page size used by the kernel matches the MMU size. On - * architectures where it differs, an architecture-specific 'strong' - * version of this symbol is required. - */ -__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) -{ - return vma_kernel_pagesize(vma); -} - /* * Flags for MAP_PRIVATE reservations. These are stored in the bottom * bits of the reservation map pointer, which are always clear due to diff --git a/mm/vma.c b/mm/vma.c index be64f781a3aa..e95fd5a5fe5c 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -3300,3 +3300,24 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) return 0; } + +/** + * vma_mmu_pagesize - Default MMU page size granularity for this VMA. + * @vma: The user mapping. + * + * In the common case, the default page size used by the MMU matches the + * default page size used by the kernel (see vma_kernel_pagesize()). On + * architectures where it differs, an architecture-specific 'strong' version + * of this symbol is required. + * + * The default MMU page size is not affected by Transparent Huge Pages + * being in effect, or any usage of larger MMU page sizes (either through + * architectural huge-page mappings or other explicit/implicit coalescing of + * virtual ranges performed by the MMU). + * + * Return: The default MMU page size granularity for this VMA. 
+ */ +__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +{ + return vma_kernel_pagesize(vma); +} diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 833ff4d7f799..7150e09122b2 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -118,3 +118,8 @@ static __always_inline vma_flags_t __mk_vma_flags(size_t count, vma_flags_set_flag(&flags, bits[i]); return flags; } + +static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) +{ + return PAGE_SIZE; +} From e8301b6adcc42132225b9f8a58f587c0f6d962cf Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 9 Mar 2026 16:19:00 +0100 Subject: [PATCH 210/369] KVM: remove hugetlb.h inclusion hugetlb.h is no longer required now that we moved vma_kernel_pagesize() to mm.h. Link: https://lkml.kernel.org/r/20260309151901.123947-4-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Mike Rapoport (Microsoft) Cc: "Christophe Leroy (CS GROUP)" Cc: Dan Williams Cc: Jann Horn Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Pedro Falcato Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- virt/kvm/kvm_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9093251beb39..d0ab29672c71 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include #include From 396042fb2b834a8fbcea9c850dbbd4ae7c7b75a9 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 9 Mar 2026 16:19:01 +0100 Subject: [PATCH 211/369] KVM: PPC: remove hugetlb.h inclusion hugetlb.h is no longer required now that we moved vma_kernel_pagesize() to mm.h. 
Link: https://lkml.kernel.org/r/20260309151901.123947-5-david@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Ritesh Harjani Cc: "Christophe Leroy (CS GROUP)" Cc: Dan Williams Cc: Jann Horn Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Pedro Falcato Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_hv.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 08e5816fdd61..61dbeea317f3 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include From 91e74fa8b1bc1e44612cb677a710edce2061b6a7 Mon Sep 17 00:00:00 2001 From: "Pratyush Yadav (Google)" Date: Mon, 9 Mar 2026 12:34:06 +0000 Subject: [PATCH 212/369] kho: make sure preservations do not span multiple NUMA nodes The KHO restoration machinery is not capable of dealing with preservations that span multiple NUMA nodes. kho_preserve_folio() guarantees the preservation will only span one NUMA node since folios can't span multiple nodes. This leaves kho_preserve_pages(). While semantically kho_preserve_pages() only deals with 0-order pages, so all preservations should be single page only, in practice it combines preservations to higher orders for efficiency. This can result in a preservation spanning multiple nodes. Break up the preservations into a smaller order if that happens. 
Link: https://lkml.kernel.org/r/20260309123410.382308-1-pratyush@kernel.org Signed-off-by: Pratyush Yadav (Google) Suggested-by: Pasha Tatashin Reviewed-by: Samiullah Khawaja Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 747a35107c84..3586490f7487 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -870,9 +870,17 @@ int kho_preserve_pages(struct page *page, unsigned long nr_pages) } while (pfn < end_pfn) { - const unsigned int order = + unsigned int order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + /* + * Make sure all the pages in a single preservation are in the + * same NUMA node. The restore machinery can not cope with a + * preservation spanning multiple NUMA nodes. + */ + while (pfn_to_nid(pfn) != pfn_to_nid(pfn + (1UL << order) - 1)) + order--; + err = kho_radix_add_page(tree, pfn, order); if (err) { failed_pfn = pfn; From 22bdab8e98b7039e4cebd954bcf0c809422ebc0d Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Mon, 9 Mar 2026 12:34:07 +0000 Subject: [PATCH 213/369] kho: drop restriction on maximum page order KHO currently restricts the maximum order of a restored page to the maximum order supported by the buddy allocator. While this works fine for much of the data passed across kexec, it is possible to have pages larger than MAX_PAGE_ORDER. For one, it is possible to get a larger order when using kho_preserve_pages() if the number of pages is large enough, since it tries to combine multiple aligned 0-order preservations into one higher order preservation. For another, upcoming support for hugepages can have gigantic hugepages being preserved over KHO. There is no real reason for this limit. The KHO preservation machinery can handle any page order. 
Remove this artificial restriction on max page order. Link: https://lkml.kernel.org/r/20260309123410.382308-2-pratyush@kernel.org Signed-off-by: Pratyush Yadav Signed-off-by: Pratyush Yadav (Google) Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Pasha Tatashin Cc: Samiullah Khawaja Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 3586490f7487..532f455c5d4f 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -411,7 +411,7 @@ static struct page *kho_restore_page(phys_addr_t phys, bool is_folio) * check also implicitly makes sure phys is order-aligned since for * non-order-aligned phys addresses, magic will never be set. */ - if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC || info.order > MAX_PAGE_ORDER)) + if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC)) return NULL; nr_pages = (1 << info.order); From 241f9005b1c81c2637eef2c836a03c83b4f3eeb9 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 11 Mar 2026 17:42:44 +0900 Subject: [PATCH 214/369] zram: do not permit params change after init Patch series "zram: recompression cleanups and tweaks", v2. This series is a somewhat random mix of fixups, recompression cleanups and improvements partly based on internal conversations. A few patches in the series remove unexpected or confusing behaviour, e.g. auto correction of bad priority= param for recompression, which should have always been just an error. Then it also removes "chain recompression" which has a tricky, unexpected and confusing behaviour at times. We also unify and harden the handling of algo/priority params. There is also an addition of missing device lock in algorithm_params_store() which previously permitted modification of algo params while the device is active. 
This patch (of 6): First, algorithm_params_store(), like any sysfs handler, should grab device lock. Second, like any write() sysfs handler, it should grab device lock in exclusive mode. Third, it should not permit change of algos' parameters after device init, as this doesn't make sense - we cannot compress with one C/D dict and then just change C/D dict to a different one, for example. Another thing to notice is that algorithm_params_store() accesses device's ->comp_algs for algo priority lookup, which should be protected by device lock in exclusive mode in general. Link: https://lkml.kernel.org/r/20260311084312.1766036-1-senozhatsky@chromium.org Link: https://lkml.kernel.org/r/20260311084312.1766036-2-senozhatsky@chromium.org Fixes: 4eac932103a5 ("zram: introduce algorithm_params device attribute") Signed-off-by: Sergey Senozhatsky Acked-by: Brian Geffon Cc: gao xu Cc: Jens Axboe Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 990d391847f4..bc5777b245d4 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1734,6 +1734,10 @@ static ssize_t algorithm_params_store(struct device *dev, } } + guard(rwsem_write)(&zram->dev_lock); + if (init_done(zram)) + return -EBUSY; + /* Lookup priority by algorithm name */ if (algo) { s32 p; From ed19b9d5504f3f7adb68ad8d8db96c390c8570e5 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 11 Mar 2026 17:42:45 +0900 Subject: [PATCH 215/369] zram: do not autocorrect bad recompression parameters Do not silently autocorrect bad recompression priority parameter value and just error out. 
Link: https://lkml.kernel.org/r/20260311084312.1766036-3-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Suggested-by: Minchan Kim Cc: Brian Geffon Cc: gao xu Cc: Jens Axboe Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index bc5777b245d4..71c4e2d350ce 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2517,19 +2517,16 @@ static ssize_t recompress_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { + u32 prio = ZRAM_SECONDARY_COMP, prio_max = ZRAM_MAX_COMPS; struct zram *zram = dev_to_zram(dev); char *args, *param, *val, *algo = NULL; u64 num_recomp_pages = ULLONG_MAX; struct zram_pp_ctl *ctl = NULL; struct zram_pp_slot *pps; u32 mode = 0, threshold = 0; - u32 prio, prio_max; struct page *page = NULL; ssize_t ret; - prio = ZRAM_SECONDARY_COMP; - prio_max = zram->num_active_comps; - args = skip_spaces(buf); while (*args) { args = next_arg(args, ¶m, &val); @@ -2579,10 +2576,7 @@ static ssize_t recompress_store(struct device *dev, if (ret) return ret; - if (prio == ZRAM_PRIMARY_COMP) - prio = ZRAM_SECONDARY_COMP; - - prio_max = prio + 1; + prio_max = min(prio + 1, ZRAM_MAX_COMPS); continue; } } @@ -2602,7 +2596,7 @@ static ssize_t recompress_store(struct device *dev, continue; if (!strcmp(zram->comp_algs[prio], algo)) { - prio_max = prio + 1; + prio_max = min(prio + 1, ZRAM_MAX_COMPS); found = true; break; } @@ -2620,6 +2614,11 @@ static ssize_t recompress_store(struct device *dev, goto out; } + if (prio < ZRAM_SECONDARY_COMP || prio >= ZRAM_MAX_COMPS) { + ret = -EINVAL; + goto out; + } + page = alloc_page(GFP_KERNEL); if (!page) { ret = -ENOMEM; From 5004a27edba5987bd75fe84c40b1b486ffae8f99 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 11 Mar 2026 17:42:46 +0900 Subject: [PATCH 216/369] zram: drop 
->num_active_comps It's not entirely correct to use ->num_active_comps for max-prio limit, as ->num_active_comps just tells the number of configured algorithms, not the max configured priority. For instance, in the following theoretical example: [lz4] [nil] [nil] [deflate] ->num_active_comps is 2, while the actual max-prio is 3. Drop ->num_active_comps and use ZRAM_MAX_COMPS instead. Link: https://lkml.kernel.org/r/20260311084312.1766036-4-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Suggested-by: Minchan Kim Cc: Brian Geffon Cc: gao xu Cc: Jens Axboe Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 29 ++++++++++++++++------------- drivers/block/zram/zram_drv.h | 1 - 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 71c4e2d350ce..74c522c14c78 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2324,6 +2324,18 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, #define RECOMPRESS_IDLE (1 << 0) #define RECOMPRESS_HUGE (1 << 1) +static bool highest_priority_algorithm(struct zram *zram, u32 prio) +{ + u32 p; + + for (p = prio + 1; p < ZRAM_MAX_COMPS; p++) { + if (zram->comp_algs[p]) + return false; + } + + return true; +} + static int scan_slots_for_recompress(struct zram *zram, u32 mode, u32 prio_max, struct zram_pp_ctl *ctl) { @@ -2471,12 +2483,11 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, * Secondary algorithms failed to re-compress the page * in a way that would save memory. * - * Mark the object incompressible if the max-priority - * algorithm couldn't re-compress it. + * Mark the object incompressible if the max-priority (the + * last configured one) algorithm couldn't re-compress it. 
*/ - if (prio < zram->num_active_comps) - return 0; - set_slot_flag(zram, index, ZRAM_INCOMPRESSIBLE); + if (highest_priority_algorithm(zram, prio)) + set_slot_flag(zram, index, ZRAM_INCOMPRESSIBLE); return 0; } @@ -2608,12 +2619,6 @@ static ssize_t recompress_store(struct device *dev, } } - prio_max = min(prio_max, (u32)zram->num_active_comps); - if (prio >= prio_max) { - ret = -EINVAL; - goto out; - } - if (prio < ZRAM_SECONDARY_COMP || prio >= ZRAM_MAX_COMPS) { ret = -EINVAL; goto out; @@ -2826,7 +2831,6 @@ static void zram_destroy_comps(struct zram *zram) if (!comp) continue; zcomp_destroy(comp); - zram->num_active_comps--; } for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) @@ -2891,7 +2895,6 @@ static ssize_t disksize_store(struct device *dev, struct device_attribute *attr, } zram->comps[prio] = comp; - zram->num_active_comps++; } zram->disksize = disksize; set_capacity_and_notify(zram->disk, zram->disksize >> SECTOR_SHIFT); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index f0de8f8218f5..08d1774c15db 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -125,7 +125,6 @@ struct zram { */ u64 disksize; /* bytes */ const char *comp_algs[ZRAM_MAX_COMPS]; - s8 num_active_comps; /* * zram is claimed so open request will be failed */ From be5f13d94846e8207c9aac0180b13ce1f762e1b1 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 11 Mar 2026 17:42:47 +0900 Subject: [PATCH 217/369] zram: update recompression documentation Emphasize usage of the `priority` parameter for recompression and explain why `algo` parameter can lead to unexpected behavior and thus is not recommended. 
Link: https://lkml.kernel.org/r/20260311084312.1766036-5-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Brian Geffon Cc: gao xu Cc: Jens Axboe Cc: Minchan Kim Signed-off-by: Andrew Morton --- Documentation/admin-guide/blockdev/zram.rst | 40 ++++++++++----------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst index 451fa00d3004..967b58c3aad2 100644 --- a/Documentation/admin-guide/blockdev/zram.rst +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -462,7 +462,7 @@ know it via /sys/block/zram0/bd_stat's 3rd column. recompression ------------- -With CONFIG_ZRAM_MULTI_COMP, zram can recompress pages using alternative +With `CONFIG_ZRAM_MULTI_COMP`, zram can recompress pages using alternative (secondary) compression algorithms. The basic idea is that alternative compression algorithm can provide better compression ratio at a price of (potentially) slower compression/decompression speeds. Alternative compression @@ -471,7 +471,7 @@ that default algorithm failed to compress). Another application is idle pages recompression - pages that are cold and sit in the memory can be recompressed using more effective algorithm and, hence, reduce zsmalloc memory usage. -With CONFIG_ZRAM_MULTI_COMP, zram supports up to 4 compression algorithms: +With `CONFIG_ZRAM_MULTI_COMP`, zram supports up to 4 compression algorithms: one primary and up to 3 secondary ones. Primary zram compressor is explained in "3) Select compression algorithm", secondary algorithms are configured using recomp_algorithm device attribute. @@ -495,34 +495,43 @@ configuration::: #select deflate recompression algorithm, priority 2 echo "algo=deflate priority=2" > /sys/block/zramX/recomp_algorithm -Another device attribute that CONFIG_ZRAM_MULTI_COMP enables is recompress, +Another device attribute that `CONFIG_ZRAM_MULTI_COMP` enables is `recompress`, which controls recompression. 
Examples::: #IDLE pages recompression is activated by `idle` mode - echo "type=idle" > /sys/block/zramX/recompress + echo "type=idle priority=1" > /sys/block/zramX/recompress #HUGE pages recompression is activated by `huge` mode - echo "type=huge" > /sys/block/zram0/recompress + echo "type=huge priority=2" > /sys/block/zram0/recompress #HUGE_IDLE pages recompression is activated by `huge_idle` mode - echo "type=huge_idle" > /sys/block/zramX/recompress + echo "type=huge_idle priority=1" > /sys/block/zramX/recompress The number of idle pages can be significant, so user-space can pass a size threshold (in bytes) to the recompress knob: zram will recompress only pages of equal or greater size::: #recompress all pages larger than 3000 bytes - echo "threshold=3000" > /sys/block/zramX/recompress + echo "threshold=3000 priority=1" > /sys/block/zramX/recompress #recompress idle pages larger than 2000 bytes - echo "type=idle threshold=2000" > /sys/block/zramX/recompress + echo "type=idle threshold=2000 priority=1" > \ + /sys/block/zramX/recompress It is also possible to limit the number of pages zram re-compression will attempt to recompress::: - echo "type=huge_idle max_pages=42" > /sys/block/zramX/recompress + echo "type=huge_idle priority=1 max_pages=42" > \ + /sys/block/zramX/recompress + +It is advised to always specify `priority` parameter. While it is also +possible to specify `algo` parameter, so that `zram` will use algorithm's +name to determine the priority, it is not recommended, since it can lead to +unexpected results when the same algorithm is configured with different +priorities (e.g. different parameters). `priority` is the only way to +guarantee that the expected algorithm will be used. During re-compression for every page, that matches re-compression criteria, ZRAM iterates the list of registered alternative compression algorithms in @@ -533,19 +542,6 @@ no secondary algorithms left to try. 
If none of the secondary algorithms can successfully re-compressed the page such a page is marked as incompressible, so ZRAM will not attempt to re-compress it in the future. -This re-compression behaviour, when it iterates through the list of -registered compression algorithms, increases our chances of finding the -algorithm that successfully compresses a particular page. Sometimes, however, -it is convenient (and sometimes even necessary) to limit recompression to -only one particular algorithm so that it will not try any other algorithms. -This can be achieved by providing a `algo` or `priority` parameter::: - - #use zstd algorithm only (if registered) - echo "type=huge algo=zstd" > /sys/block/zramX/recompress - - #use zstd algorithm only (if zstd was registered under priority 1) - echo "type=huge priority=1" > /sys/block/zramX/recompress - memory tracking =============== From cedfa028b54e584532026888dec94039b62b3d1f Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 11 Mar 2026 17:42:48 +0900 Subject: [PATCH 218/369] zram: remove chained recompression Chained recompression has unpredictable behavior and is not useful in practice. First, systems usually configure just one alternative recompression algorithm, which has slower compression/decompression but better compression ratio. A single alternative algorithm doesn't need chaining. Second, even with multiple recompression algorithms, chained recompression is suboptimal. If a lower priority algorithm succeeds, the page is never attempted with a higher priority algorithm, leading to worse memory savings. If a lower priority algorithm fails, the page is still attempted with a higher priority algorithm, wasting resources on the failed lower priority attempt. In either case, the system would be better off targeting a specific priority directly. Chained recompression also significantly complicates the code. Remove it. 
Link: https://lkml.kernel.org/r/20260311084312.1766036-6-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Brian Geffon Cc: gao xu Cc: Jens Axboe Cc: Minchan Kim Signed-off-by: Andrew Morton --- Documentation/admin-guide/blockdev/zram.rst | 9 --- drivers/block/zram/zram_drv.c | 84 ++++++--------------- 2 files changed, 24 insertions(+), 69 deletions(-) diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst index 967b58c3aad2..60b07a7e30cd 100644 --- a/Documentation/admin-guide/blockdev/zram.rst +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -533,15 +533,6 @@ unexpected results when the same algorithm is configured with different priorities (e.g. different parameters). `priority` is the only way to guarantee that the expected algorithm will be used. -During re-compression for every page, that matches re-compression criteria, -ZRAM iterates the list of registered alternative compression algorithms in -order of their priorities. ZRAM stops either when re-compression was -successful (re-compressed object is smaller in size than the original one) -and matches re-compression criteria (e.g. size threshold) or when there are -no secondary algorithms left to try. If none of the secondary algorithms can -successfully re-compressed the page such a page is marked as incompressible, -so ZRAM will not attempt to re-compress it in the future. 
- memory tracking =============== diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 74c522c14c78..48fb6e52de42 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2336,7 +2336,7 @@ static bool highest_priority_algorithm(struct zram *zram, u32 prio) return true; } -static int scan_slots_for_recompress(struct zram *zram, u32 mode, u32 prio_max, +static int scan_slots_for_recompress(struct zram *zram, u32 mode, u32 prio, struct zram_pp_ctl *ctl) { unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; @@ -2362,8 +2362,8 @@ static int scan_slots_for_recompress(struct zram *zram, u32 mode, u32 prio_max, test_slot_flag(zram, index, ZRAM_INCOMPRESSIBLE)) goto next; - /* Already compressed with same of higher priority */ - if (get_slot_comp_priority(zram, index) + 1 >= prio_max) + /* Already compressed with same or higher priority */ + if (get_slot_comp_priority(zram, index) >= prio) goto next; ok = place_pp_slot(zram, ctl, index); @@ -2384,8 +2384,7 @@ next: * Corresponding ZRAM slot should be locked. 
*/ static int recompress_slot(struct zram *zram, u32 index, struct page *page, - u64 *num_recomp_pages, u32 threshold, u32 prio, - u32 prio_max) + u64 *num_recomp_pages, u32 threshold, u32 prio) { struct zcomp_strm *zstrm = NULL; unsigned long handle_old; @@ -2397,6 +2396,9 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, void *src; int ret = 0; + if (!zram->comps[prio]) + return -EINVAL; + handle_old = get_slot_handle(zram, index); if (!handle_old) return -EINVAL; @@ -2419,51 +2421,10 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, */ clear_slot_flag(zram, index, ZRAM_IDLE); - class_index_old = zs_lookup_class_index(zram->mem_pool, comp_len_old); - - prio = max(prio, get_slot_comp_priority(zram, index) + 1); - /* - * Recompression slots scan should not select slots that are - * already compressed with a higher priority algorithm, but - * just in case - */ - if (prio >= prio_max) - return 0; - - /* - * Iterate the secondary comp algorithms list (in order of priority) - * and try to recompress the page. 
- */ - for (; prio < prio_max; prio++) { - if (!zram->comps[prio]) - continue; - - zstrm = zcomp_stream_get(zram->comps[prio]); - src = kmap_local_page(page); - ret = zcomp_compress(zram->comps[prio], zstrm, - src, &comp_len_new); - kunmap_local(src); - - if (ret) { - zcomp_stream_put(zstrm); - zstrm = NULL; - break; - } - - class_index_new = zs_lookup_class_index(zram->mem_pool, - comp_len_new); - - /* Continue until we make progress */ - if (class_index_new >= class_index_old || - (threshold && comp_len_new >= threshold)) { - zcomp_stream_put(zstrm); - zstrm = NULL; - continue; - } - - /* Recompression was successful so break out */ - break; - } + zstrm = zcomp_stream_get(zram->comps[prio]); + src = kmap_local_page(page); + ret = zcomp_compress(zram->comps[prio], zstrm, src, &comp_len_new); + kunmap_local(src); /* * Decrement the limit (if set) on pages we can recompress, even @@ -2474,11 +2435,18 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, if (*num_recomp_pages) *num_recomp_pages -= 1; - /* Compression error */ - if (ret) + if (ret) { + zcomp_stream_put(zstrm); return ret; + } + + class_index_old = zs_lookup_class_index(zram->mem_pool, comp_len_old); + class_index_new = zs_lookup_class_index(zram->mem_pool, comp_len_new); + + if (class_index_new >= class_index_old || + (threshold && comp_len_new >= threshold)) { + zcomp_stream_put(zstrm); - if (!zstrm) { /* * Secondary algorithms failed to re-compress the page * in a way that would save memory. 
@@ -2528,11 +2496,11 @@ static ssize_t recompress_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - u32 prio = ZRAM_SECONDARY_COMP, prio_max = ZRAM_MAX_COMPS; struct zram *zram = dev_to_zram(dev); char *args, *param, *val, *algo = NULL; u64 num_recomp_pages = ULLONG_MAX; struct zram_pp_ctl *ctl = NULL; + u32 prio = ZRAM_SECONDARY_COMP; struct zram_pp_slot *pps; u32 mode = 0, threshold = 0; struct page *page = NULL; @@ -2586,8 +2554,6 @@ static ssize_t recompress_store(struct device *dev, ret = kstrtouint(val, 10, &prio); if (ret) return ret; - - prio_max = min(prio + 1, ZRAM_MAX_COMPS); continue; } } @@ -2607,7 +2573,6 @@ static ssize_t recompress_store(struct device *dev, continue; if (!strcmp(zram->comp_algs[prio], algo)) { - prio_max = min(prio + 1, ZRAM_MAX_COMPS); found = true; break; } @@ -2636,7 +2601,7 @@ static ssize_t recompress_store(struct device *dev, goto out; } - scan_slots_for_recompress(zram, mode, prio_max, ctl); + scan_slots_for_recompress(zram, mode, prio, ctl); ret = len; while ((pps = select_pp_slot(ctl))) { @@ -2650,8 +2615,7 @@ static ssize_t recompress_store(struct device *dev, goto next; err = recompress_slot(zram, pps->index, page, - &num_recomp_pages, threshold, - prio, prio_max); + &num_recomp_pages, threshold, prio); next: slot_unlock(zram, pps->index); release_pp_slot(zram, pps); From 301f3922009658ee353b3177bc186a12d36b8dd3 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 11 Mar 2026 17:42:49 +0900 Subject: [PATCH 219/369] zram: unify and harden algo/priority params handling We have two functions that accept algo= and priority= params - algorithm_params_store() and recompress_store(). This patch unifies and hardens handling of those parameters. There are 4 possible cases: - only priority= provided [recommended] We need to verify that provided priority value is within permitted range for each particular function. 
- both algo= and priority= provided We cannot prioritize one over another. All we should do is to verify that zram is configured in the way that user-space expects it to be. Namely that zram indeed has compressor algo= setup at given priority=. - only algo= provided [not recommended] We should lookup priority in compressors list. - none provided [not recommended] Just use function's defaults. Link: https://lkml.kernel.org/r/20260311084312.1766036-7-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Suggested-by: Minchan Kim Cc: Brian Geffon Cc: gao xu Cc: Jens Axboe Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 110 +++++++++++++++++++++------------- 1 file changed, 68 insertions(+), 42 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 48fb6e52de42..bab21b44bdcb 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1619,6 +1619,37 @@ static void zram_debugfs_register(struct zram *zram) {}; static void zram_debugfs_unregister(struct zram *zram) {}; #endif +/* Only algo parameter given, lookup by algo name */ +static int lookup_algo_priority(struct zram *zram, const char *algo, + u32 min_prio) +{ + s32 prio; + + for (prio = min_prio; prio < ZRAM_MAX_COMPS; prio++) { + if (!zram->comp_algs[prio]) + continue; + + if (!strcmp(zram->comp_algs[prio], algo)) + return prio; + } + + return -EINVAL; +} + +/* Both algo and priority parameters given, validate them */ +static int validate_algo_priority(struct zram *zram, const char *algo, u32 prio) +{ + if (prio >= ZRAM_MAX_COMPS) + return -EINVAL; + /* No algo at given priority */ + if (!zram->comp_algs[prio]) + return -EINVAL; + /* A different algo at given priority */ + if (strcmp(zram->comp_algs[prio], algo)) + return -EINVAL; + return 0; +} + static void comp_algorithm_set(struct zram *zram, u32 prio, const char *alg) { zram->comp_algs[prio] = alg; @@ -1691,6 +1722,7 @@ static ssize_t algorithm_params_store(struct device *dev, 
char *args, *param, *val, *algo = NULL, *dict_path = NULL; struct deflate_params deflate_params; struct zram *zram = dev_to_zram(dev); + bool prio_param = false; int ret; deflate_params.winbits = ZCOMP_PARAM_NOT_SET; @@ -1703,6 +1735,7 @@ static ssize_t algorithm_params_store(struct device *dev, return -EINVAL; if (!strcmp(param, "priority")) { + prio_param = true; ret = kstrtoint(val, 10, &prio); if (ret) return ret; @@ -1738,24 +1771,22 @@ static ssize_t algorithm_params_store(struct device *dev, if (init_done(zram)) return -EBUSY; - /* Lookup priority by algorithm name */ - if (algo) { - s32 p; - - prio = -EINVAL; - for (p = ZRAM_PRIMARY_COMP; p < ZRAM_MAX_COMPS; p++) { - if (!zram->comp_algs[p]) - continue; - - if (!strcmp(zram->comp_algs[p], algo)) { - prio = p; - break; - } - } + if (prio_param) { + if (prio < ZRAM_PRIMARY_COMP || prio >= ZRAM_MAX_COMPS) + return -EINVAL; } - if (prio < ZRAM_PRIMARY_COMP || prio >= ZRAM_MAX_COMPS) - return -EINVAL; + if (algo && prio_param) { + ret = validate_algo_priority(zram, algo, prio); + if (ret) + return ret; + } + + if (algo && !prio_param) { + prio = lookup_algo_priority(zram, algo, ZRAM_PRIMARY_COMP); + if (prio < 0) + return -EINVAL; + } ret = comp_params_store(zram, prio, level, dict_path, &deflate_params); return ret ? 
ret : len; @@ -2396,9 +2427,6 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, void *src; int ret = 0; - if (!zram->comps[prio]) - return -EINVAL; - handle_old = get_slot_handle(zram, index); if (!handle_old) return -EINVAL; @@ -2500,10 +2528,11 @@ static ssize_t recompress_store(struct device *dev, char *args, *param, *val, *algo = NULL; u64 num_recomp_pages = ULLONG_MAX; struct zram_pp_ctl *ctl = NULL; - u32 prio = ZRAM_SECONDARY_COMP; - struct zram_pp_slot *pps; + s32 prio = ZRAM_SECONDARY_COMP; u32 mode = 0, threshold = 0; + struct zram_pp_slot *pps; struct page *page = NULL; + bool prio_param = false; ssize_t ret; args = skip_spaces(buf); @@ -2551,7 +2580,8 @@ static ssize_t recompress_store(struct device *dev, } if (!strcmp(param, "priority")) { - ret = kstrtouint(val, 10, &prio); + prio_param = true; + ret = kstrtoint(val, 10, &prio); if (ret) return ret; continue; @@ -2565,30 +2595,26 @@ static ssize_t recompress_store(struct device *dev, if (!init_done(zram)) return -EINVAL; - if (algo) { - bool found = false; - - for (; prio < ZRAM_MAX_COMPS; prio++) { - if (!zram->comp_algs[prio]) - continue; - - if (!strcmp(zram->comp_algs[prio], algo)) { - found = true; - break; - } - } - - if (!found) { - ret = -EINVAL; - goto out; - } + if (prio_param) { + if (prio < ZRAM_SECONDARY_COMP || prio >= ZRAM_MAX_COMPS) + return -EINVAL; } - if (prio < ZRAM_SECONDARY_COMP || prio >= ZRAM_MAX_COMPS) { - ret = -EINVAL; - goto out; + if (algo && prio_param) { + ret = validate_algo_priority(zram, algo, prio); + if (ret) + return ret; } + if (algo && !prio_param) { + prio = lookup_algo_priority(zram, algo, ZRAM_SECONDARY_COMP); + if (prio < 0) + return -EINVAL; + } + + if (!zram->comps[prio]) + return -EINVAL; + page = alloc_page(GFP_KERNEL); if (!page) { ret = -ENOMEM; From d239462787b072c78eb19fc1f155c3d411256282 Mon Sep 17 00:00:00 2001 From: Anthony Yznaga Date: Tue, 10 Mar 2026 08:58:20 -0700 Subject: [PATCH 220/369] mm: prevent droppable 
mappings from being locked Droppable mappings must not be lockable. There is a check for VMAs with VM_DROPPABLE set in mlock_fixup() along with checks for other types of unlockable VMAs which ensures this when calling mlock()/mlock2(). For mlockall(MCL_FUTURE), the check for unlockable VMAs is different. In apply_mlockall_flags(), if the flags parameter has MCL_FUTURE set, the current task's mm's default VMA flag field mm->def_flags has VM_LOCKED applied to it. VM_LOCKONFAULT is also applied if MCL_ONFAULT is also set. When these flags are set as default in this manner they are cleared in __mmap_complete() for new mappings that do not support mlock. A check for VM_DROPPABLE in __mmap_complete() is missing, resulting in droppable mappings created with VM_LOCKED set. To fix this and reduce the chance of similar bugs in the future, introduce and use vma_supports_mlock(). Link: https://lkml.kernel.org/r/20260310155821.17869-1-anthony.yznaga@oracle.com Fixes: 9651fcedf7b9 ("mm: add MAP_DROPPABLE for designating always lazily freeable mappings") Signed-off-by: Anthony Yznaga Suggested-by: David Hildenbrand Acked-by: David Hildenbrand (Arm) Reviewed-by: Pedro Falcato Reviewed-by: Lorenzo Stoakes (Oracle) Tested-by: Lorenzo Stoakes (Oracle) Cc: Jann Horn Cc: Jason A.
Donenfeld Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb_inline.h | 2 +- mm/internal.h | 10 ++++++++++ mm/mlock.c | 10 ++++++---- mm/vma.c | 4 +--- tools/testing/vma/include/stubs.h | 5 +++++ 5 files changed, 23 insertions(+), 8 deletions(-) diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 84afc3c3e2e4..565b473fd135 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -30,7 +30,7 @@ static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags) #endif -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +static inline bool is_vm_hugetlb_page(const struct vm_area_struct *vma) { return is_vm_hugetlb_flags(vma->vm_flags); } diff --git a/mm/internal.h b/mm/internal.h index 4ab833b8bcdf..ebb68ad10d5c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1243,6 +1243,16 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, } return fpin; } + +static inline bool vma_supports_mlock(const struct vm_area_struct *vma) +{ + if (vma->vm_flags & (VM_SPECIAL | VM_DROPPABLE)) + return false; + if (vma_is_dax(vma) || is_vm_hugetlb_page(vma)) + return false; + return vma != get_gate_vma(current->mm); +} + #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } static inline void mlock_new_folio(struct folio *folio) { } diff --git a/mm/mlock.c b/mm/mlock.c index 1a92d16f3684..fd648138bc72 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -472,10 +472,12 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, int ret = 0; vm_flags_t oldflags = vma->vm_flags; - if (newflags == oldflags || (oldflags & VM_SPECIAL) || - is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || - vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE)) - /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ + if 
(newflags == oldflags || vma_is_secretmem(vma) || + !vma_supports_mlock(vma)) + /* + * Don't set VM_LOCKED or VM_LOCKONFAULT and don't count. + * For secretmem, don't allow the memory to be unlocked. + */ goto out; vma = vma_modify_flags(vmi, *prev, vma, start, end, &newflags); diff --git a/mm/vma.c b/mm/vma.c index e95fd5a5fe5c..b7055c264b5d 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2589,9 +2589,7 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vm_stat_account(mm, vma->vm_flags, map->pglen); if (vm_flags & VM_LOCKED) { - if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || - is_vm_hugetlb_page(vma) || - vma == get_gate_vma(mm)) + if (!vma_supports_mlock(vma)) vm_flags_clear(vma, VM_LOCKED_MASK); else mm->locked_vm += map->pglen; diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h index 947a3a0c2566..416bb93f5005 100644 --- a/tools/testing/vma/include/stubs.h +++ b/tools/testing/vma/include/stubs.h @@ -426,3 +426,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, } static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {} + +static inline bool vma_supports_mlock(const struct vm_area_struct *vma) +{ + return false; +} From 86e69c020b62ee109e10db0ae53ba97f3465df8e Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Tue, 10 Mar 2026 09:56:57 +0800 Subject: [PATCH 221/369] mm/swap: strengthen locking assertions and invariants in cluster allocation swap_cluster_alloc_table() requires several locks to be held by its callers: ci->lock, the per-CPU swap_cluster lock, and, for non-solid-state devices (non-SWP_SOLIDSTATE), the si->global_cluster_lock. While most call paths (e.g., via cluster_alloc_swap_entry() or alloc_swap_scan_list()) correctly acquire these locks before invocation, the path through swap_reclaim_work() -> swap_reclaim_full_clusters() -> isolate_lock_cluster() is distinct. 
This path operates exclusively on si->full_clusters, where the swap allocation tables are guaranteed to be already allocated. Consequently, isolate_lock_cluster() should never trigger a call to swap_cluster_alloc_table() for these clusters. Strengthen the locking and state assertions to formalize these invariants: 1. Add a lockdep_assert_held() for si->global_cluster_lock in swap_cluster_alloc_table() for non-SWP_SOLIDSTATE devices. 2. Reorder existing lockdep assertions in swap_cluster_alloc_table() to match the actual lock acquisition order (per-CPU lock, then global lock, then cluster lock). 3. Add a VM_WARN_ON_ONCE() in isolate_lock_cluster() to ensure that table allocations are only attempted for clusters being isolated from the free list. Attempting to allocate a table for a cluster from other lists (like the full list during reclaim) indicates a violation of subsystem invariants. These changes ensure locking consistency and help catch potential synchronization or logic issues during development. [zhuhui@kylinos.cn: remove redundant comment, per Barry] Link: https://lkml.kernel.org/r/20260311022241.177801-1-hui.zhu@linux.dev [zhuhui@kylinos.cn: initialize `flags', per Chris] Link: https://lkml.kernel.org/r/20260312023024.903143-1-hui.zhu@linux.dev Link: https://lkml.kernel.org/r/20260310015657.42395-1-hui.zhu@linux.dev Signed-off-by: Hui Zhu Reviewed-by: Youngjun Park Reviewed-by: Barry Song Acked-by: Chris Li Cc: Baoquan He Cc: Kairui Song Cc: Kemeng Shi Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swapfile.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 915bc93964db..71a7d6959f3e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -498,8 +498,10 @@ swap_cluster_alloc_table(struct swap_info_struct *si, * Only cluster isolation from the allocator does table allocation. * Swap allocator uses percpu clusters and holds the local lock. 
*/ - lockdep_assert_held(&ci->lock); lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock); + if (!(si->flags & SWP_SOLIDSTATE)) + lockdep_assert_held(&si->global_cluster_lock); + lockdep_assert_held(&ci->lock); /* The cluster must be free and was just isolated from the free list. */ VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci)); @@ -600,6 +602,7 @@ static struct swap_cluster_info *isolate_lock_cluster( struct swap_info_struct *si, struct list_head *list) { struct swap_cluster_info *ci, *found = NULL; + u8 flags = CLUSTER_FLAG_NONE; spin_lock(&si->lock); list_for_each_entry(ci, list, list) { @@ -612,6 +615,7 @@ static struct swap_cluster_info *isolate_lock_cluster( ci->flags != CLUSTER_FLAG_FULL); list_del(&ci->list); + flags = ci->flags; ci->flags = CLUSTER_FLAG_NONE; found = ci; break; @@ -620,6 +624,7 @@ static struct swap_cluster_info *isolate_lock_cluster( if (found && !cluster_table_is_alloced(found)) { /* Only an empty free cluster's swap table can be freed. */ + VM_WARN_ON_ONCE(flags != CLUSTER_FLAG_FREE); VM_WARN_ON_ONCE(list != &si->free_clusters); VM_WARN_ON_ONCE(!cluster_is_empty(found)); return swap_cluster_alloc_table(si, found); From 8719c59c4b928fc9ad8d8f45ecbdf859660c904c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Mar 2026 18:05:17 -0700 Subject: [PATCH 222/369] mm/damon/core: introduce damos_quota_goal_tuner Patch series "mm/damon: support multiple goal-based quota tuning algorithms". Aim-oriented DAMOS quota auto-tuning uses a single tuning algorithm. The algorithm is designed to find a quota value that should be consistently kept for achieving the aimed goal for long term. It is useful and reliable at automatically operating systems that have dynamic environments in the long term. As always, however, no single algorithm fits all. When the environment has static characteristics or there are control towers in not only the kernel space but also the user space, the algorithm shows some limitations. 
In such environments, users want kernel work in a more short term deterministic way. Actually there were at least two reports [1,2] of such cases. Extend DAMOS quotas goal to support multiple quota tuning algorithms that users can select. Keep the current algorithm as the default one, to not break the old users. Also give it a name, "consist", as it is designed to "consistently" apply the DAMOS action. And introduce a new tuning algorithm, namely "temporal". It is designed to apply the DAMOS action only temporally, in a deterministic way. In more detail, as long as the goal is under-achieved, it uses the maximum quota available. Once the goal is over-achieved, it sets the quota zero. Tests ===== I confirmed the feature is working as expected using the latest version of DAMON user-space tool, like below. $ # start DAMOS for reclaiming memory aiming 30% free memory $ sudo ./damo/damo start --damos_action pageout \ --damos_quota_goal_tuner temporal \ --damos_quota_goal node_mem_free_bp 30% 0 \ --damos_quota_interval 1s \ --damos_quota_space 100M Note that >=3.1.8 version of DAMON user-space tool supports this feature (--damos_quota_goal_tuner). As expected, DAMOS stops reclaiming memory as soon as the goal amount of free memory is made. When 'consist' tuner is used, the reclamation was continued even after the goal amount of free memory is made, resulting in more than goal amount of free memory, as expected. Patch Sequence ============== First four patches implement the features. Patch 1 extends core API to allow multiple tuners and make the current tuner as the default and only available tuner, namely 'consist'. Patch 2 allows future tuners setting zero effective quota. Patch 3 introduces the second tuner, namely 'temporal'. Patch 4 further extends DAMON sysfs API to let users use that. Three following patches (patches 5-7) update design, usage, and ABI documents, respectively. Final four patches (patches 8-11) are for adding tests. 
The eighth patch (patch 8) extends the kunit test for online parameters commit for validating the goal_tuner. The ninth and the tenth patches (patches 9-10) extend the testing-purpose DAMON sysfs control helper and DAMON status dumping tool to support the newly added feature. The final eleventh one (patch 11) extends the existing online commit selftest to cover the new feature. This patch (of 11): DAMOS quota goal feature utilizes a single feedback loop based algorithm for automatic tuning of the effective quota. It is useful in dynamic environments that operate systems with only kernels in the long term. But no single algorithm fits all. It is not very easy to control in environments having more controlled characteristics and user-space control towers. We actually got multiple reports [1,2] of use cases that the algorithm is not optimal. Introduce a new field of 'struct damos_quota', namely 'goal_tuner'. It specifies what tuning algorithm the given scheme should use, and allows DAMON API callers to set it as they want. Nonetheless, this commit introduces no new tuning algorithm but only the interface. This commit hence makes no behavioral change. A new algorithm will be added by the following commit.
Link: https://lkml.kernel.org/r/20260310010529.91162-2-sj@kernel.org Link: https://lore.kernel.org/CALa+Y17__d=ZsM1yX+MXx0ozVdsXnFqF4p0g+kATEitrWyZFfg@mail.gmail.com [1] Link: https://lore.kernel.org/20260204022537.814-1-yunjeong.mun@sk.com [2] Signed-off-by: SeongJae Park Cc: Shuah Khan Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/damon.h | 10 ++++++++++ mm/damon/core.c | 1 + 2 files changed, 11 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 7d0265d02954..24de35a8395a 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -215,12 +215,21 @@ struct damos_quota_goal { struct list_head list; }; +/** + * enum damos_quota_goal_tuner - Goal-based quota tuning logic. + * @DAMOS_QUOTA_GOAL_TUNER_CONSIST: Aim long term consistent quota. + */ +enum damos_quota_goal_tuner { + DAMOS_QUOTA_GOAL_TUNER_CONSIST, +}; + /** * struct damos_quota - Controls the aggressiveness of the given scheme. * @reset_interval: Charge reset interval in milliseconds. * @ms: Maximum milliseconds that the scheme can use. * @sz: Maximum bytes of memory that the action can be applied. * @goals: Head of quota tuning goals (&damos_quota_goal) list. + * @goal_tuner: Goal-based @esz tuning algorithm to use. * @esz: Effective size quota in bytes. * * @weight_sz: Weight of the region's size for prioritization. 
@@ -262,6 +271,7 @@ struct damos_quota { unsigned long ms; unsigned long sz; struct list_head goals; + enum damos_quota_goal_tuner goal_tuner; unsigned long esz; unsigned int weight_sz; diff --git a/mm/damon/core.c b/mm/damon/core.c index 2d2332f3d377..16905bf35c40 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -912,6 +912,7 @@ static int damos_commit_quota(struct damos_quota *dst, struct damos_quota *src) err = damos_commit_quota_goals(dst, src); if (err) return err; + dst->goal_tuner = src->goal_tuner; dst->weight_sz = src->weight_sz; dst->weight_nr_accesses = src->weight_nr_accesses; dst->weight_age = src->weight_age; From 54419bbd0ee3c7fb7f3c1e3e117f0b8d15d7a896 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Mar 2026 18:05:18 -0700 Subject: [PATCH 223/369] mm/damon/core: allow quota goals set zero effective size quota User-explicit quotas (size and time quotas) having zero value means the quotas are unset. And, effective size quota is set as the minimum value of the explicit quotas. When quota goals are set, the goal-based quota tuner can make it lower. But the existing only single tuner never sets the effective size quota zero. Because of the fact, DAMON core assumes zero effective quota means the user has set no quota. Multiple tuners are now allowed, though. In the future, some tuners might want to set a zero effective size quota. There is no reason to restrict that. Meanwhile, because of the current implementation, it will only deactivate all quotas and make the scheme work at its full speed. Introduce a dedicated function for checking if no quota is set. The function checks the fact by showing if the user-set explicit quotas are zero and no goal is installed. It is decoupled from zero effective quota, and hence allows future tuners set zero effective quota for intentionally deactivating the scheme by a purpose. 
Link: https://lkml.kernel.org/r/20260310010529.91162-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 16905bf35c40..db3c59b70e49 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -387,6 +387,11 @@ void damos_destroy_quota_goal(struct damos_quota_goal *g) damos_free_quota_goal(g); } +static bool damos_quota_goals_empty(struct damos_quota *q) +{ + return list_empty(&q->goals); +} + /* initialize fields of @quota that normally API users wouldn't set */ static struct damos_quota *damos_quota_init(struct damos_quota *quota) { @@ -1791,12 +1796,24 @@ static bool __damos_valid_target(struct damon_region *r, struct damos *s) r->age <= s->pattern.max_age_region; } +/* + * damos_quota_is_set() - Return if the given quota is actually set. + * @quota: The quota to check. + * + * Returns true if the quota is set, false otherwise. 
+ */ +static bool damos_quota_is_set(struct damos_quota *quota) +{ + return quota->esz || quota->sz || quota->ms || + !damos_quota_goals_empty(quota); +} + static bool damos_valid_target(struct damon_ctx *c, struct damon_region *r, struct damos *s) { bool ret = __damos_valid_target(r, s); - if (!ret || !s->quota.esz || !c->ops.get_scheme_score) + if (!ret || !damos_quota_is_set(&s->quota) || !c->ops.get_scheme_score) return ret; return c->ops.get_scheme_score(c, r, s) >= s->quota.min_score; @@ -2066,7 +2083,8 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, } if (c->ops.apply_scheme) { - if (quota->esz && quota->charged_sz + sz > quota->esz) { + if (damos_quota_is_set(quota) && + quota->charged_sz + sz > quota->esz) { sz = ALIGN_DOWN(quota->esz - quota->charged_sz, c->min_region_sz); if (!sz) @@ -2085,7 +2103,8 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); quota->charged_sz += sz; - if (quota->esz && quota->charged_sz >= quota->esz) { + if (damos_quota_is_set(quota) && + quota->charged_sz >= quota->esz) { quota->charge_target_from = t; quota->charge_addr_from = r->ar.end + 1; } @@ -2113,7 +2132,8 @@ static void damon_do_apply_schemes(struct damon_ctx *c, continue; /* Check the quota */ - if (quota->esz && quota->charged_sz >= quota->esz) + if (damos_quota_is_set(quota) && + quota->charged_sz >= quota->esz) continue; if (damos_skip_charged_region(t, r, s, c->min_region_sz)) @@ -2398,7 +2418,8 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) /* New charge window starts */ if (time_after_eq(jiffies, quota->charged_from + msecs_to_jiffies(quota->reset_interval))) { - if (quota->esz && quota->charged_sz >= quota->esz) + if (damos_quota_is_set(quota) && + quota->charged_sz >= quota->esz) s->stat.qt_exceeds++; quota->total_charged_sz += quota->charged_sz; quota->charged_from = jiffies; From 
af738a6a00c1febb0d543ba6a1400413f824ecf1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Mar 2026 18:05:19 -0700 Subject: [PATCH 224/369] mm/damon/core: introduce DAMOS_QUOTA_GOAL_TUNER_TEMPORAL Introduce a new goal-based DAMOS quota auto-tuning algorithm, namely DAMOS_QUOTA_GOAL_TUNER_TEMPORAL (temporal in short). The algorithm aims to trigger the DAMOS action only for a temporal time, to achieve the goal as soon as possible. For the temporal period, it uses as much quota as allowed. Once the goal is achieved, it sets the quota zero, so effectively makes the scheme be deactivated. Link: https://lkml.kernel.org/r/20260310010529.91162-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/core.c | 29 ++++++++++++++++++++++++----- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 24de35a8395a..e44e2132ccaf 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -218,9 +218,11 @@ struct damos_quota_goal { /** * enum damos_quota_goal_tuner - Goal-based quota tuning logic. * @DAMOS_QUOTA_GOAL_TUNER_CONSIST: Aim long term consistent quota. + * @DAMOS_QUOTA_GOAL_TUNER_TEMPORAL: Aim zero quota asap. 
*/ enum damos_quota_goal_tuner { DAMOS_QUOTA_GOAL_TUNER_CONSIST, + DAMOS_QUOTA_GOAL_TUNER_TEMPORAL, }; /** diff --git a/mm/damon/core.c b/mm/damon/core.c index db3c59b70e49..b543d1202c9d 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2347,6 +2347,26 @@ static unsigned long damos_quota_score(struct damos_quota *quota) return highest_score; } +static void damos_goal_tune_esz_bp_consist(struct damos_quota *quota) +{ + unsigned long score = damos_quota_score(quota); + + quota->esz_bp = damon_feed_loop_next_input( + max(quota->esz_bp, 10000UL), score); +} + +static void damos_goal_tune_esz_bp_temporal(struct damos_quota *quota) +{ + unsigned long score = damos_quota_score(quota); + + if (score >= 10000) + quota->esz_bp = 0; + else if (quota->sz) + quota->esz_bp = quota->sz * 10000; + else + quota->esz_bp = ULONG_MAX; +} + /* * Called only if quota->ms, or quota->sz are set, or quota->goals is not empty */ @@ -2361,11 +2381,10 @@ static void damos_set_effective_quota(struct damos_quota *quota) if (!list_empty(&quota->goals)) { - unsigned long score = damos_quota_score(quota); - - quota->esz_bp = damon_feed_loop_next_input( - max(quota->esz_bp, 10000UL), - score); + if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_CONSIST) + damos_goal_tune_esz_bp_consist(quota); + else if (quota->goal_tuner == DAMOS_QUOTA_GOAL_TUNER_TEMPORAL) + damos_goal_tune_esz_bp_temporal(quota); esz = quota->esz_bp / 10000; } From e9a19cc85d4821a441f4b2d4756ae01e12f17393 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Mar 2026 18:05:20 -0700 Subject: [PATCH 225/369] mm/damon/sysfs-schemes: implement quotas->goal_tuner file Add a new DAMON sysfs interface file, namely 'goal_tuner' under the DAMOS quotas directory. It is connected to the damos_quota->goal_tuner field. Users can therefore select their favorite goal-based quotas tuning algorithm by writing the name of the tuner to the file. Reading the file returns the name of the currently selected tuner. 
Link: https://lkml.kernel.org/r/20260310010529.91162-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 58 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 3a0782e576fa..5186966dafb3 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1488,6 +1488,7 @@ struct damon_sysfs_quotas { unsigned long sz; unsigned long reset_interval_ms; unsigned long effective_sz; /* Effective size quota in bytes */ + enum damos_quota_goal_tuner goal_tuner; }; static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) @@ -1610,6 +1611,58 @@ static ssize_t effective_bytes_show(struct kobject *kobj, return sysfs_emit(buf, "%lu\n", quotas->effective_sz); } +struct damos_sysfs_qgoal_tuner_name { + enum damos_quota_goal_tuner tuner; + char *name; +}; + +static struct damos_sysfs_qgoal_tuner_name damos_sysfs_qgoal_tuner_names[] = { + { + .tuner = DAMOS_QUOTA_GOAL_TUNER_CONSIST, + .name = "consist", + }, + { + .tuner = DAMOS_QUOTA_GOAL_TUNER_TEMPORAL, + .name = "temporal", + }, +}; + +static ssize_t goal_tuner_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int i; + + for (i = 0; i < ARRAY_SIZE(damos_sysfs_qgoal_tuner_names); i++) { + struct damos_sysfs_qgoal_tuner_name *tuner_name; + + tuner_name = &damos_sysfs_qgoal_tuner_names[i]; + if (tuner_name->tuner == quotas->goal_tuner) + return sysfs_emit(buf, "%s\n", tuner_name->name); + } + return -EINVAL; +} + +static ssize_t goal_tuner_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int i; + + for (i = 0; i < ARRAY_SIZE(damos_sysfs_qgoal_tuner_names); i++) { + struct damos_sysfs_qgoal_tuner_name *tuner_name; + + 
tuner_name = &damos_sysfs_qgoal_tuner_names[i]; + if (sysfs_streq(buf, tuner_name->name)) { + quotas->goal_tuner = tuner_name->tuner; + return count; + } + } + return -EINVAL; +} + static void damon_sysfs_quotas_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); @@ -1627,11 +1680,15 @@ static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr = static struct kobj_attribute damon_sysfs_quotas_effective_bytes_attr = __ATTR_RO_MODE(effective_bytes, 0400); +static struct kobj_attribute damon_sysfs_quotas_goal_tuner_attr = + __ATTR_RW_MODE(goal_tuner, 0600); + static struct attribute *damon_sysfs_quotas_attrs[] = { &damon_sysfs_quotas_ms_attr.attr, &damon_sysfs_quotas_sz_attr.attr, &damon_sysfs_quotas_reset_interval_ms_attr.attr, &damon_sysfs_quotas_effective_bytes_attr.attr, + &damon_sysfs_quotas_goal_tuner_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_quotas); @@ -2718,6 +2775,7 @@ static struct damos *damon_sysfs_mk_scheme( .weight_sz = sysfs_weights->sz, .weight_nr_accesses = sysfs_weights->nr_accesses, .weight_age = sysfs_weights->age, + .goal_tuner = sysfs_quotas->goal_tuner, }; struct damos_watermarks wmarks = { .metric = sysfs_wmarks->metric, From 5a242f9daf2931fe124aa5f0f57e4a04bd2e4fa8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Mar 2026 18:05:21 -0700 Subject: [PATCH 226/369] Docs/mm/damon/design: document the goal-based quota tuner selections Update the design document for the newly added goal-based quota tuner selection feature. 
Link: https://lkml.kernel.org/r/20260310010529.91162-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index ac795f30519c..29fff20b3c2a 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -566,6 +566,18 @@ aggressiveness (the quota) of the corresponding scheme. For example, if DAMOS is under achieving the goal, DAMOS automatically increases the quota. If DAMOS is over achieving the goal, it decreases the quota. +There are two such tuning algorithms that users can select as they need. + +- ``consist``: A proportional feedback loop based algorithm. Tries to find an + optimum quota that should be consistently kept, to keep achieving the goal. + Useful for kernel-only operation on dynamic and long-running environments. + This is the default selection. If unsure, use this. +- ``temporal``: More straightforward algorithm. Tries to achieve the goal as + fast as possible, using maximum allowed quota, but only for a temporal short + time. When the quota is under-achieved, this algorithm keeps tuning quota to + a maximum allowed one. Once the quota is [over]-achieved, this sets the + quota zero. Useful for deterministic control required environments. + The goal can be specified with five parameters, namely ``target_metric``, ``target_value``, ``current_value``, ``nid`` and ``path``. The auto-tuning mechanism tries to make ``current_value`` of ``target_metric`` be same to From d9cfe515d36eb4cf065665d494a5a826139922d4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Mar 2026 18:05:22 -0700 Subject: [PATCH 227/369] Docs/admin-guide/mm/damon/usage: document goal_tuner sysfs file Update the DAMON usage document for the new sysfs file for the goal based quota auto-tuning algorithm selection. 
Link: https://lkml.kernel.org/r/20260310010529.91162-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index b0f3969b6b3b..534e1199cf09 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -83,7 +83,7 @@ comma (","). │ │ │ │ │ │ │ │ sz/min,max │ │ │ │ │ │ │ │ nr_accesses/min,max │ │ │ │ │ │ │ │ age/min,max - │ │ │ │ │ │ │ :ref:`quotas `/ms,bytes,reset_interval_ms,effective_bytes + │ │ │ │ │ │ │ :ref:`quotas `/ms,bytes,reset_interval_ms,effective_bytes,goal_tuner │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil │ │ │ │ │ │ │ │ :ref:`goals `/nr_goals │ │ │ │ │ │ │ │ │ 0/target_metric,target_value,current_value,nid,path @@ -377,9 +377,9 @@ schemes//quotas/ The directory for the :ref:`quotas ` of the given DAMON-based operation scheme. -Under ``quotas`` directory, four files (``ms``, ``bytes``, -``reset_interval_ms``, ``effective_bytes``) and two directories (``weights`` and -``goals``) exist. +Under ``quotas`` directory, five files (``ms``, ``bytes``, +``reset_interval_ms``, ``effective_bytes`` and ``goal_tuner``) and two +directories (``weights`` and ``goals``) exist. You can set the ``time quota`` in milliseconds, ``size quota`` in bytes, and ``reset interval`` in milliseconds by writing the values to the three files, @@ -390,6 +390,14 @@ apply the action to only up to ``bytes`` bytes of memory regions within the quota limits unless at least one :ref:`goal ` is set. +You can set the goal-based effective quota auto-tuning algorithm to use, by +writing the algorithm name to ``goal_tuner`` file. Reading the file returns +the currently selected tuner algorithm. 
Refer to the design documentation of +:ref:`automatic quota tuning goals ` for +the background design of the feature and the name of the selectable algorithms. +Refer to :ref:`goals directory ` for the goals +setup. + The time quota is internally transformed to a size quota. Between the transformed size quota and user-specified size quota, smaller one is applied. Based on the user-specified :ref:`goal `, the From 3eda936f2a6fc3cd40ff6943179b63e8d781a9b0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Mar 2026 18:05:23 -0700 Subject: [PATCH 228/369] Docs/ABI/damon: update for goal_tuner Update the ABI document for the newly added goal_tuner sysfs file. Link: https://lkml.kernel.org/r/20260310010529.91162-8-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index f2af2ddedd32..2424237ebb10 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -316,6 +316,12 @@ Contact: SeongJae Park Description: Writing to and reading from this file sets and gets the path parameter of the goal. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/goal_tuner +Date: Mar 2026 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the + goal-based effective quota auto-tuning algorithm to use. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/weights/sz_permil Date: Mar 2022 Contact: SeongJae Park From d972d68d506adf1abf9c52fdb8b19614ec816e70 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Mar 2026 18:05:24 -0700 Subject: [PATCH 229/369] mm/damon/tests/core-kunit: test goal_tuner commit Extend damos_commit_quota() kunit test for the newly added goal_tuner parameter. 
Link: https://lkml.kernel.org/r/20260310010529.91162-9-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index e86d4f4fe261..9e5904c2beeb 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -693,6 +693,7 @@ static void damos_test_commit_quota(struct kunit *test) .reset_interval = 1, .ms = 2, .sz = 3, + .goal_tuner = DAMOS_QUOTA_GOAL_TUNER_CONSIST, .weight_sz = 4, .weight_nr_accesses = 5, .weight_age = 6, @@ -701,6 +702,7 @@ static void damos_test_commit_quota(struct kunit *test) .reset_interval = 7, .ms = 8, .sz = 9, + .goal_tuner = DAMOS_QUOTA_GOAL_TUNER_TEMPORAL, .weight_sz = 10, .weight_nr_accesses = 11, .weight_age = 12, @@ -714,6 +716,7 @@ static void damos_test_commit_quota(struct kunit *test) KUNIT_EXPECT_EQ(test, dst.reset_interval, src.reset_interval); KUNIT_EXPECT_EQ(test, dst.ms, src.ms); KUNIT_EXPECT_EQ(test, dst.sz, src.sz); + KUNIT_EXPECT_EQ(test, dst.goal_tuner, src.goal_tuner); KUNIT_EXPECT_EQ(test, dst.weight_sz, src.weight_sz); KUNIT_EXPECT_EQ(test, dst.weight_nr_accesses, src.weight_nr_accesses); KUNIT_EXPECT_EQ(test, dst.weight_age, src.weight_age); From c00863bc7cc3c30d8beb9d10f90aa19d799ab386 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Mar 2026 18:05:25 -0700 Subject: [PATCH 230/369] selftests/damon/_damon_sysfs: support goal_tuner setup Add support of goal_tuner setup to the test-purpose DAMON sysfs interface control helper, _damon_sysfs.py. 
Link: https://lkml.kernel.org/r/20260310010529.91162-10-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index 748778b563cd..2b4df655d9fd 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -130,15 +130,16 @@ class DamosQuota: sz = None # size quota, in bytes ms = None # time quota goals = None # quota goals + goal_tuner = None # quota goal tuner reset_interval_ms = None # quota reset interval weight_sz_permil = None weight_nr_accesses_permil = None weight_age_permil = None scheme = None # owner scheme - def __init__(self, sz=0, ms=0, goals=None, reset_interval_ms=0, - weight_sz_permil=0, weight_nr_accesses_permil=0, - weight_age_permil=0): + def __init__(self, sz=0, ms=0, goals=None, goal_tuner='consist', + reset_interval_ms=0, weight_sz_permil=0, + weight_nr_accesses_permil=0, weight_age_permil=0): self.sz = sz self.ms = ms self.reset_interval_ms = reset_interval_ms @@ -146,6 +147,7 @@ class DamosQuota: self.weight_nr_accesses_permil = weight_nr_accesses_permil self.weight_age_permil = weight_age_permil self.goals = goals if goals is not None else [] + self.goal_tuner = goal_tuner for idx, goal in enumerate(self.goals): goal.idx = idx goal.quota = self @@ -191,6 +193,10 @@ class DamosQuota: err = goal.stage() if err is not None: return err + err = write_file( + os.path.join(self.sysfs_dir(), 'goal_tuner'), self.goal_tuner) + if err is not None: + return err return None class DamosWatermarks: From c2b0cb96e787a2f053003f4ea966cecdcc41e5d9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Mar 2026 18:05:26 -0700 Subject: [PATCH 231/369] selftests/damon/drgn_dump_damon_status: support quota goal_tuner dumping Update drgn_dump_damon_status.py, which is being 
used to dump the in-kernel DAMON status for tests, to dump goal_tuner setup status. Link: https://lkml.kernel.org/r/20260310010529.91162-11-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/drgn_dump_damon_status.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/damon/drgn_dump_damon_status.py b/tools/testing/selftests/damon/drgn_dump_damon_status.py index 5374d18d1fa8..af99b07a4f56 100755 --- a/tools/testing/selftests/damon/drgn_dump_damon_status.py +++ b/tools/testing/selftests/damon/drgn_dump_damon_status.py @@ -110,6 +110,7 @@ def damos_quota_to_dict(quota): ['reset_interval', int], ['ms', int], ['sz', int], ['goals', damos_quota_goals_to_list], + ['goal_tuner', int], ['esz', int], ['weight_sz', int], ['weight_nr_accesses', int], From ddac713da3bcd117a4ee4d184a34f02582495e7d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 Mar 2026 18:05:27 -0700 Subject: [PATCH 232/369] selftests/damon/sysfs.py: test goal_tuner commit Extend the near-full DAMON parameters commit selftest to commit goal_tuner and confirm the internal status is updated as expected. 
Link: https://lkml.kernel.org/r/20260310010529.91162-12-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py index 9cca71eb0325..3aa5c91548a5 100755 --- a/tools/testing/selftests/damon/sysfs.py +++ b/tools/testing/selftests/damon/sysfs.py @@ -67,6 +67,12 @@ def assert_quota_committed(quota, dump): assert_true(dump['sz'] == quota.sz, 'sz', dump) for idx, qgoal in enumerate(quota.goals): assert_quota_goal_committed(qgoal, dump['goals'][idx]) + tuner_val = { + 'consist': 0, + 'temporal': 1, + } + assert_true(dump['goal_tuner'] == tuner_val[quota.goal_tuner], + 'goal_tuner', dump) assert_true(dump['weight_sz'] == quota.weight_sz_permil, 'weight_sz', dump) assert_true(dump['weight_nr_accesses'] == quota.weight_nr_accesses_permil, 'weight_nr_accesses', dump) @@ -231,6 +237,7 @@ def main(): metric='node_mem_used_bp', target_value=9950, nid=1)], + goal_tuner='temporal', reset_interval_ms=1500, weight_sz_permil=20, weight_nr_accesses_permil=200, From c82aade08c3b4f51029fdcdccb7b479facec0ed3 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 17 Mar 2026 08:33:56 -0700 Subject: [PATCH 233/369] mm: khugepaged: export set_recommended_min_free_kbytes() Patch series "mm: thp: reduce unnecessary start_stop_khugepaged()", v7. Writing to /sys/kernel/mm/transparent_hugepage/enabled causes start_stop_khugepaged() called independent of any change. start_stop_khugepaged() SPAMs the printk ring buffer overflow with the exact same message, even when nothing changes. For instance, if you have a custom vm.min_free_kbytes, just touching /sys/kernel/mm/transparent_hugepage/enabled causes a printk message. 
Example: # sysctl -w vm.min_free_kbytes=112382 # for i in $(seq 100); do echo never > /sys/kernel/mm/transparent_hugepage/enabled ; done and you have 100 WARN messages like the following, which is pretty dull: khugepaged: min_free_kbytes is not updated to 112381 because user defined value 112382 is preferred A similar message shows up when setting thp to "always": # for i in $(seq 100); do # echo 1024 > /proc/sys/vm/min_free_kbytes # echo always > /sys/kernel/mm/transparent_hugepage/enabled # done And then, we have 100 messages like: khugepaged: raising min_free_kbytes from 1024 to 67584 to help transparent hugepage allocations This is more common when you have a configuration management system that writes the THP configuration without an extra read, assuming that nothing will happen if there is no change in the configuration, but it prints these annoying messages. For instance, at Meta's fleet, ~10K servers were producing 3.5M of these messages per day. Fix this by making the sysfs _store helpers easier to digest and ratelimiting the message. This patch (of 4): Make set_recommended_min_free_kbytes() callable from outside khugepaged.c by removing the static qualifier and adding a declaration in mm/internal.h. This allows callers that change THP settings to recalculate watermarks without going through start_stop_khugepaged(). 
Link: https://lkml.kernel.org/r/20260317-thp_logs-v7-0-31eb98fa5a8b@debian.org Link: https://lkml.kernel.org/r/20260317-thp_logs-v7-1-31eb98fa5a8b@debian.org Signed-off-by: Breno Leitao Suggested-by: Lorenzo Stoakes (Oracle) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Reviewed-by: Zi Yan Cc: Baolin Wang Cc: Barry Song Cc: Brendan Jackman Cc: Dev Jain Cc: Johannes Weiner Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton --- mm/internal.h | 5 +++++ mm/khugepaged.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/internal.h b/mm/internal.h index ebb68ad10d5c..f50a0376b87e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -640,6 +640,11 @@ int user_proactive_reclaim(char *buf, */ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); +/* + * in mm/khugepaged.c + */ +void set_recommended_min_free_kbytes(void); + /* * in mm/page_alloc.c */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ab97423fe837..e1eb3c7e59c3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2660,7 +2660,7 @@ static int khugepaged(void *none) return 0; } -static void set_recommended_min_free_kbytes(void) +void set_recommended_min_free_kbytes(void) { struct zone *zone; int nr_zones = 0; From 82d9ff648c6c9fa00b3f31107e63127572cca741 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 17 Mar 2026 08:33:57 -0700 Subject: [PATCH 234/369] mm: huge_memory: refactor anon_enabled_store() with set_anon_enabled_mode() Consolidate the repeated spin_lock/set_bit/clear_bit pattern in anon_enabled_store() into a new set_anon_enabled_mode() helper that loops over an orders[] array, setting the bit for the selected mode and clearing the others. Introduce enum anon_enabled_mode and anon_enabled_mode_strings[] for the per-order anon THP setting. 
Use sysfs_match_string() with the anon_enabled_mode_strings[] table to replace the if/else chain of sysfs_streq() calls. The helper uses __test_and_set_bit()/__test_and_clear_bit() to track whether the state actually changed, so start_stop_khugepaged() is only called when needed. When the mode is unchanged, set_recommended_min_free_kbytes() is called directly to preserve the watermark recalculation behavior of the original code. Link: https://lkml.kernel.org/r/20260317-thp_logs-v7-2-31eb98fa5a8b@debian.org Signed-off-by: Breno Leitao Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Cc: Baolin Wang Cc: Barry Song Cc: Brendan Jackman Cc: Dev Jain Cc: Johannes Weiner Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Yang Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 84 ++++++++++++++++++++++++++++++------------------ 1 file changed, 52 insertions(+), 32 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a132fb98ed5d..211d8c892318 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -316,6 +316,20 @@ static ssize_t enabled_show(struct kobject *kobj, return sysfs_emit(buf, "%s\n", output); } +enum anon_enabled_mode { + ANON_ENABLED_ALWAYS = 0, + ANON_ENABLED_INHERIT = 1, + ANON_ENABLED_MADVISE = 2, + ANON_ENABLED_NEVER = 3, +}; + +static const char * const anon_enabled_mode_strings[] = { + [ANON_ENABLED_ALWAYS] = "always", + [ANON_ENABLED_INHERIT] = "inherit", + [ANON_ENABLED_MADVISE] = "madvise", + [ANON_ENABLED_NEVER] = "never", +}; + static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -515,48 +529,54 @@ static ssize_t anon_enabled_show(struct kobject *kobj, return sysfs_emit(buf, "%s\n", output); } +static bool set_anon_enabled_mode(int order, enum anon_enabled_mode mode) +{ + static unsigned long *enabled_orders[] = { + 
&huge_anon_orders_always, + &huge_anon_orders_inherit, + &huge_anon_orders_madvise, + }; + enum anon_enabled_mode m; + bool changed = false; + + spin_lock(&huge_anon_orders_lock); + for (m = 0; m < ARRAY_SIZE(enabled_orders); m++) { + if (m == mode) + changed |= !__test_and_set_bit(order, enabled_orders[m]); + else + changed |= __test_and_clear_bit(order, enabled_orders[m]); + } + spin_unlock(&huge_anon_orders_lock); + + return changed; +} + static ssize_t anon_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int order = to_thpsize(kobj)->order; - ssize_t ret = count; + int mode; - if (sysfs_streq(buf, "always")) { - spin_lock(&huge_anon_orders_lock); - clear_bit(order, &huge_anon_orders_inherit); - clear_bit(order, &huge_anon_orders_madvise); - set_bit(order, &huge_anon_orders_always); - spin_unlock(&huge_anon_orders_lock); - } else if (sysfs_streq(buf, "inherit")) { - spin_lock(&huge_anon_orders_lock); - clear_bit(order, &huge_anon_orders_always); - clear_bit(order, &huge_anon_orders_madvise); - set_bit(order, &huge_anon_orders_inherit); - spin_unlock(&huge_anon_orders_lock); - } else if (sysfs_streq(buf, "madvise")) { - spin_lock(&huge_anon_orders_lock); - clear_bit(order, &huge_anon_orders_always); - clear_bit(order, &huge_anon_orders_inherit); - set_bit(order, &huge_anon_orders_madvise); - spin_unlock(&huge_anon_orders_lock); - } else if (sysfs_streq(buf, "never")) { - spin_lock(&huge_anon_orders_lock); - clear_bit(order, &huge_anon_orders_always); - clear_bit(order, &huge_anon_orders_inherit); - clear_bit(order, &huge_anon_orders_madvise); - spin_unlock(&huge_anon_orders_lock); - } else - ret = -EINVAL; + mode = sysfs_match_string(anon_enabled_mode_strings, buf); + if (mode < 0) + return -EINVAL; - if (ret > 0) { - int err; + if (set_anon_enabled_mode(order, mode)) { + int err = start_stop_khugepaged(); - err = start_stop_khugepaged(); if (err) - ret = err; + return err; + } else { + /* + * Recalculate 
watermarks even when the mode didn't + * change, as the previous code always called + * start_stop_khugepaged() which does this internally. + */ + set_recommended_min_free_kbytes(); } - return ret; + + return count; } static struct kobj_attribute anon_enabled_attr = From 35a01d94420e1c2b5b1b421d1471ae33f48ae13a Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 17 Mar 2026 08:33:58 -0700 Subject: [PATCH 235/369] mm: huge_memory: refactor enabled_store() with set_global_enabled_mode() Refactor enabled_store() to use a new set_global_enabled_mode() helper. Introduce a separate enum global_enabled_mode and global_enabled_mode_strings[], mirroring the anon_enabled_mode pattern from the previous commit. A separate enum is necessary because the global THP setting does not support "inherit", only "always", "madvise", and "never". Reusing anon_enabled_mode would leave a NULL gap in the string array, causing sysfs_match_string() to stop early and fail to match entries after the gap. The helper uses the same loop pattern as set_anon_enabled_mode(), iterating over an array of flag bit positions and using test_and_set_bit()/test_and_clear_bit() to track whether the state actually changed. 
Link: https://lkml.kernel.org/r/20260317-thp_logs-v7-3-31eb98fa5a8b@debian.org Signed-off-by: Breno Leitao Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Zi Yan Reviewed-by: Baolin Wang Reviewed-by: Wei Yang Acked-by: David Hildenbrand (Arm) Cc: Barry Song Cc: Brendan Jackman Cc: Dev Jain Cc: Johannes Weiner Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/huge_memory.c | 63 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 211d8c892318..9fea52ccad56 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -330,30 +330,63 @@ static const char * const anon_enabled_mode_strings[] = { [ANON_ENABLED_NEVER] = "never", }; +enum global_enabled_mode { + GLOBAL_ENABLED_ALWAYS = 0, + GLOBAL_ENABLED_MADVISE = 1, + GLOBAL_ENABLED_NEVER = 2, +}; + +static const char * const global_enabled_mode_strings[] = { + [GLOBAL_ENABLED_ALWAYS] = "always", + [GLOBAL_ENABLED_MADVISE] = "madvise", + [GLOBAL_ENABLED_NEVER] = "never", +}; + +static bool set_global_enabled_mode(enum global_enabled_mode mode) +{ + static const unsigned long thp_flags[] = { + TRANSPARENT_HUGEPAGE_FLAG, + TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + }; + enum global_enabled_mode m; + bool changed = false; + + for (m = 0; m < ARRAY_SIZE(thp_flags); m++) { + if (m == mode) + changed |= !test_and_set_bit(thp_flags[m], + &transparent_hugepage_flags); + else + changed |= test_and_clear_bit(thp_flags[m], + &transparent_hugepage_flags); + } + + return changed; +} + static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - ssize_t ret = count; + int mode; - if (sysfs_streq(buf, "always")) { - clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); - set_bit(TRANSPARENT_HUGEPAGE_FLAG, 
&transparent_hugepage_flags); - } else if (sysfs_streq(buf, "madvise")) { - clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); - set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); - } else if (sysfs_streq(buf, "never")) { - clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); - } else - ret = -EINVAL; + mode = sysfs_match_string(global_enabled_mode_strings, buf); + if (mode < 0) + return -EINVAL; - if (ret > 0) { + if (set_global_enabled_mode(mode)) { int err = start_stop_khugepaged(); + if (err) - ret = err; + return err; + } else { + /* + * Recalculate watermarks even when the mode didn't + * change, as the previous code always called + * start_stop_khugepaged() which does this internally. + */ + set_recommended_min_free_kbytes(); } - return ret; + return count; } static struct kobj_attribute enabled_attr = __ATTR_RW(enabled); From 3203a8706e81b8d73712d46952b96eb28238c45e Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 17 Mar 2026 08:33:59 -0700 Subject: [PATCH 236/369] mm: ratelimit min_free_kbytes adjustment messages The "raising min_free_kbytes" pr_info message in set_recommended_min_free_kbytes() and the "min_free_kbytes is not updated to" pr_warn in calculate_min_free_kbytes() can spam the kernel log when called repeatedly. Switch the pr_info in set_recommended_min_free_kbytes() and the pr_warn in calculate_min_free_kbytes() to their _ratelimited variants to prevent the log spam for this message. 
Link: https://lkml.kernel.org/r/20260317-thp_logs-v7-4-31eb98fa5a8b@debian.org Signed-off-by: Breno Leitao Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Reviewed-by: Baolin Wang Acked-by: Zi Yan Cc: Barry Song Cc: Brendan Jackman Cc: Dev Jain Cc: Johannes Weiner Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Usama Arif Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton --- mm/khugepaged.c | 4 ++-- mm/page_alloc.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e1eb3c7e59c3..f972a9a65e3a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2701,8 +2701,8 @@ void set_recommended_min_free_kbytes(void) if (recommended_min > min_free_kbytes) { if (user_min_free_kbytes >= 0) - pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", - min_free_kbytes, recommended_min); + pr_info_ratelimited("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", + min_free_kbytes, recommended_min); min_free_kbytes = recommended_min; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fdcc2fde565b..937e9b850709 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6514,8 +6514,8 @@ void calculate_min_free_kbytes(void) if (new_min_free_kbytes > user_min_free_kbytes) min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144); else - pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", - new_min_free_kbytes, user_min_free_kbytes); + pr_warn_ratelimited("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", + new_min_free_kbytes, user_min_free_kbytes); } From f08f610ea00b75d6174cd34950a8338d51a0729f Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 11 Mar 2026 20:07:37 +0200 Subject: [PATCH 237/369] selftests/mm: pagemap_ioctl: remove hungarian notation 
Replace lpBaseAddress with addr and dwRegionSize with size. Link: https://lkml.kernel.org/r/20260311180737.3767545-1-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pagemap_ioctl.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 2ca8a7e3c27e..7f9428d6062c 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -113,13 +113,13 @@ int init_uffd(void) return 0; } -int wp_init(void *lpBaseAddress, long dwRegionSize) +int wp_init(void *addr, long size) { struct uffdio_register uffdio_register; struct uffdio_writeprotect wp; - uffdio_register.range.start = (unsigned long)lpBaseAddress; - uffdio_register.range.len = dwRegionSize; + uffdio_register.range.start = (unsigned long)addr; + uffdio_register.range.len = size; uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) ksft_exit_fail_msg("ioctl(UFFDIO_REGISTER) %d %s\n", errno, strerror(errno)); @@ -127,8 +127,8 @@ int wp_init(void *lpBaseAddress, long dwRegionSize) if (!(uffdio_register.ioctls & UFFDIO_WRITEPROTECT)) ksft_exit_fail_msg("ioctl set is incorrect\n"); - wp.range.start = (unsigned long)lpBaseAddress; - wp.range.len = dwRegionSize; + wp.range.start = (unsigned long)addr; + wp.range.len = size; wp.mode = UFFDIO_WRITEPROTECT_MODE_WP; if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp)) @@ -137,21 +137,21 @@ int wp_init(void *lpBaseAddress, long dwRegionSize) return 0; } -int wp_free(void *lpBaseAddress, long dwRegionSize) +int wp_free(void *addr, long size) { struct uffdio_register uffdio_register; - uffdio_register.range.start = (unsigned long)lpBaseAddress; - 
uffdio_register.range.len = dwRegionSize; + uffdio_register.range.start = (unsigned long)addr; + uffdio_register.range.len = size; uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) ksft_exit_fail_msg("ioctl unregister failure\n"); return 0; } -int wp_addr_range(void *lpBaseAddress, int dwRegionSize) +int wp_addr_range(void *addr, int size) { - if (pagemap_ioctl(lpBaseAddress, dwRegionSize, NULL, 0, + if (pagemap_ioctl(addr, size, NULL, 0, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN) < 0) ksft_exit_fail_msg("error %d %d %s\n", 1, errno, strerror(errno)); From 2d028f3e4bbbfd448928a8d3d2814b0b04c214f4 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 11 Mar 2026 16:05:26 -0400 Subject: [PATCH 238/369] selftest: memcg: skip memcg_sock test if address family not supported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test_memcg_sock test in memcontrol.c sets up an IPv6 socket and send data over it to consume memory and verify that memory.stat.sock and memory.current values are close. On systems where IPv6 isn't enabled or not configured to support SOCK_STREAM, the test_memcg_sock test always fails. When the socket() call fails, there is no way we can test the memory consumption and verify the above claim. I believe it is better to just skip the test in this case instead of reporting a test failure hinting that there may be something wrong with the memcg code. 
Link: https://lkml.kernel.org/r/20260311200526.885899-1-longman@redhat.com Fixes: 5f8f019380b8 ("selftests: cgroup/memcontrol: add basic test for socket accounting") Signed-off-by: Waiman Long Acked-by: Michal Koutný Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Mike Rapoport Cc: Muchun Song Cc: Roman Gushchin Cc: Shuah Khan Cc: Tejun Heo Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_memcontrol.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 2fb096a2a9f9..a25eb097b31c 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -1280,8 +1280,11 @@ static int tcp_server(const char *cgroup, void *arg) saddr.sin6_port = htons(srv_args->port); sk = socket(AF_INET6, SOCK_STREAM, 0); - if (sk < 0) + if (sk < 0) { + /* Pass back errno to the ctl_fd */ + write(ctl_fd, &errno, sizeof(errno)); return ret; + } if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0) goto cleanup; @@ -1412,6 +1415,12 @@ static int test_memcg_sock(const char *root) goto cleanup; close(args.ctl[0]); + /* Skip if address family not supported by protocol */ + if (err == EAFNOSUPPORT) { + ret = KSFT_SKIP; + goto cleanup; + } + if (!err) break; if (err != EADDRINUSE) From a2e0c0668a3486f96b86c50e02872c8e94fd4f9c Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Thu, 12 Mar 2026 03:47:23 -0700 Subject: [PATCH 239/369] mm: migrate: requeue destination folio on deferred split queue During folio migration, __folio_migrate_mapping() removes the source folio from the deferred split queue, but the destination folio is never re-queued. This causes underutilized THPs to escape the shrinker after NUMA migration, since they silently drop off the deferred split list. 
Fix this by recording whether the source folio was on the deferred split queue and its partially mapped state before move_to_new_folio() unqueues it, and re-queuing the destination folio after a successful migration if it was. By the time migrate_folio_move() runs, partially mapped folios without a pin have already been split by migrate_pages_batch(). So only two cases remain on the deferred list at this point: 1. Partially mapped folios with a pin (split failed). 2. Fully mapped but potentially underused folios. The recorded partially_mapped state is forwarded to deferred_split_folio() so that the destination folio is correctly re-queued in both cases. Because THPs are removed from the deferred_list, THP shrinker cannot split the underutilized THPs in time. As a result, users will show less free memory than before. Link: https://lkml.kernel.org/r/20260312104723.1351321-1-usama.arif@linux.dev Fixes: dafff3f4c850 ("mm: split underused THPs") Signed-off-by: Usama Arif Reported-by: Johannes Weiner Acked-by: Johannes Weiner Acked-by: Zi Yan Acked-by: David Hildenbrand (Arm) Acked-by: SeongJae Park Reviewed-by: Wei Yang Cc: Alistair Popple Cc: Byungchul Park Cc: Gregory Price Cc: "Huang, Ying" Cc: Joshua Hahn Cc: Matthew Brost Cc: Matthew Wilcox (Oracle) Cc: Nico Pache Cc: Rakie Kim Cc: Ying Huang Cc: Signed-off-by: Andrew Morton --- mm/migrate.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mm/migrate.c b/mm/migrate.c index 6cc654858da6..3323fc96b1cd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1358,6 +1358,8 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, int rc; int old_page_state = 0; struct anon_vma *anon_vma = NULL; + bool src_deferred_split = false; + bool src_partially_mapped = false; struct list_head *prev; __migrate_folio_extract(dst, &old_page_state, &anon_vma); @@ -1371,6 +1373,12 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, goto out_unlock_both; } + if 
(folio_order(src) > 1 && + !data_race(list_empty(&src->_deferred_list))) { + src_deferred_split = true; + src_partially_mapped = folio_test_partially_mapped(src); + } + rc = move_to_new_folio(dst, src, mode); if (rc) goto out; @@ -1391,6 +1399,15 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, if (old_page_state & PAGE_WAS_MAPPED) remove_migration_ptes(src, dst, 0); + /* + * Requeue the destination folio on the deferred split queue if + * the source was on the queue. The source is unqueued in + * __folio_migrate_mapping(), so we recorded the state from + * before move_to_new_folio(). + */ + if (src_deferred_split) + deferred_split_folio(dst, src_partially_mapped); + out_unlock_both: folio_unlock(dst); folio_set_owner_migrate_reason(dst, reason); From d4e981b280454f4368950db6269c6077d66453cf Mon Sep 17 00:00:00 2001 From: Kexin Sun Date: Thu, 12 Mar 2026 13:38:12 +0800 Subject: [PATCH 240/369] kasan: update outdated comment kmalloc_large() was renamed kmalloc_large_noprof() by commit 7bd230a26648 ("mm/slab: enable slab allocation tagging for kmalloc and friends"), and subsequently renamed __kmalloc_large_noprof() by commit a0a44d9175b3 ("mm, slab: don't wrap internal functions with alloc_hooks()"), making it an internal implementation detail. Large kmalloc allocations are now performed through the public kmalloc() interface directly, making the reference to KMALLOC_MAX_SIZE also stale (KMALLOC_MAX_CACHE_SIZE would be more accurate). Remove the references to kmalloc_large() and KMALLOC_MAX_SIZE, and rephrase the description for large kmalloc allocations. 
Link: https://lkml.kernel.org/r/20260312053812.1365-1-kexinsun@smail.nju.edu.cn Signed-off-by: Kexin Sun Suggested-by: Andrey Ryabinin Assisted-by: unnamed:deepseek-v3.2 coccinelle Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Julia Lawall Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/kasan.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 338a1921a50a..bf233bde68c7 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -352,8 +352,8 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip); * kasan_mempool_unpoison_object(). * * This function operates on all slab allocations including large kmalloc - * allocations (the ones returned by kmalloc_large() or by kmalloc() with the - * size > KMALLOC_MAX_SIZE). + * allocations (i.e. the ones backed directly by the buddy allocator rather + * than kmalloc slab caches). * * Return: true if the allocation can be safely reused; false otherwise. */ @@ -381,8 +381,8 @@ void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip); * original tags based on the pointer value. * * This function operates on all slab allocations including large kmalloc - * allocations (the ones returned by kmalloc_large() or by kmalloc() with the - * size > KMALLOC_MAX_SIZE). + * allocations (i.e. the ones backed directly by the buddy allocator rather + * than kmalloc slab caches). */ static __always_inline void kasan_mempool_unpoison_object(void *ptr, size_t size) From 9b9b8d4aebf1eb8fe22293dcfc38c600a7e7859b Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Wed, 11 Mar 2026 17:24:36 +0000 Subject: [PATCH 241/369] mm/mremap: correct invalid map count check Patch series "mm: improve map count checks". 
Firstly, in mremap(), it appears that our map count checks have been overly conservative - there is simply no reason to require that we have headroom of 4 mappings prior to moving the VMA, we only need headroom of 2 VMAs since commit 659ace584e7a ("mmap: don't return ENOMEM when mapcount is temporarily exceeded in munmap()"). Likely the original headroom of 4 mappings was a mistake, and 3 was actually intended. Next, we access sysctl_max_map_count in a number of places without being all that careful about how we do so. We introduce a simple helper that READ_ONCE()'s the field (get_sysctl_max_map_count()) to ensure that the field is accessed correctly. The WRITE_ONCE() side is already handled by the sysctl procfs code in proc_int_conv(). We also move this field to internal.h as there's no reason for anybody else to access it outside of mm. Unfortunately we have to maintain the extern variable, as mmap.c implements the procfs code. Finally, we are accessing current->mm->map_count without holding the mmap write lock, which is also not correct, so this series ensures the lock is head before we access it. We also abstract the check to a helper function, and add ASCII diagrams to explain why we're doing what we're doing. This patch (of 3): We currently check to see, if on moving a VMA when doing mremap(), if it might violate the sys.vm.max_map_count limit. This was introduced in the mists of time prior to 2.6.12. At this point in time, as now, the move_vma() operation would copy the VMA (+1 mapping if not merged), then potentially split the source VMA upon unmap. Prior to commit 659ace584e7a ("mmap: don't return ENOMEM when mapcount is temporarily exceeded in munmap()"), a VMA split would check whether mm->map_count >= sysctl_max_map_count prior to a split before it ran. On unmap of the source VMA, if we are moving a partial VMA, we might split the VMA twice. 
This would mean, on invocation of split_vma() (as was), we'd check whether mm->map_count >= sysctl_max_map_count with a map count elevated by one, then again with a map count elevated by two, ending up with a map count elevated by three. At this point we'd reduce the map count on unmap. At the start of move_vma(), there was a check that has remained throughout mremap()'s history of mm->map_count >= sysctl_max_map_count - 3 (which implies mm->map_count + 4 > sysctl_max_map_count - that is, we must have headroom for 4 additional mappings). After mm->map_count is elevated by 3, it is decremented by one once the unmap completes. The mmap write lock is held, so nothing else will observe mm->map_count > sysctl_max_map_count. It appears this check was always incorrect - it should have been either 'mm->map_count > sysctl_max_map_count - 3' or 'mm->map_count >= sysctl_max_map_count - 2'. After commit 659ace584e7a ("mmap: don't return ENOMEM when mapcount is temporarily exceeded in munmap()"), the map count check on split is eliminated in the newly introduced __split_vma(), which the unmap path uses, and has that path check whether mm->map_count >= sysctl_max_map_count. This is valid since, net, an unmap can only cause an increase in map count of 1 (split both sides, unmap middle). Since we only copy a VMA and (if MREMAP_DONTUNMAP is not set) unmap afterwards, the maximum number of additional mappings that will actually be subject to any check will be 2. Therefore, update the check to assert this corrected value. Additionally, update the check introduced by commit ea2c3f6f5545 ("mm,mremap: bail out earlier in mremap_to under map pressure") to account for this. While we're here, clean up the comment prior to that. 
Link: https://lkml.kernel.org/r/cover.1773249037.git.ljs@kernel.org Link: https://lkml.kernel.org/r/73e218c67dcd197c5331840fb011e2c17155bfb0.1773249037.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Pedro Falcato Cc: Jann Horn Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Jianzhou Zhao Signed-off-by: Andrew Morton --- mm/mremap.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index 2be876a70cc0..e8c3021dd841 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1041,10 +1041,11 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) vm_flags_t dummy = vma->vm_flags; /* - * We'd prefer to avoid failure later on in do_munmap: - * which may split one vma into three before unmapping. + * We'd prefer to avoid failure later on in do_munmap: we copy a VMA, + * which may not merge, then (if MREMAP_DONTUNMAP is not set) unmap the + * source, which may split, causing a net increase of 2 mappings. */ - if (current->mm->map_count >= sysctl_max_map_count - 3) + if (current->mm->map_count + 2 > sysctl_max_map_count) return -ENOMEM; if (vma->vm_ops && vma->vm_ops->may_split) { @@ -1804,20 +1805,15 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) return -EINVAL; /* - * move_vma() need us to stay 4 maps below the threshold, otherwise - * it will bail out at the very beginning. - * That is a problem if we have already unmapped the regions here - * (new_addr, and old_addr), because userspace will not know the - * state of the vma's after it gets -ENOMEM. - * So, to avoid such scenario we can pre-compute if the whole - * operation has high chances to success map-wise. - * Worst-scenario case is when both vma's (new_addr and old_addr) get - * split in 3 before unmapping it. - * That means 2 more maps (1 for each) to the ones we already hold. 
- * Check whether current map count plus 2 still leads us to 4 maps below - * the threshold, otherwise return -ENOMEM here to be more safe. + * We may unmap twice before invoking move_vma(), that is if new_len < + * old_len (shrinking), and in the MREMAP_FIXED case, unmapping part of + * a VMA located at the destination. + * + * In the worst case, both unmappings will cause splits, resulting in a + * net increased map count of 2. In move_vma() we check for headroom of + * 2 additional mappings, so check early to avoid bailing out then. */ - if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3) + if (current->mm->map_count + 4 > sysctl_max_map_count) return -ENOMEM; return 0; From 2d1e54aab6fd01f7502af20e125312e06a15bf9c Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Wed, 11 Mar 2026 17:24:37 +0000 Subject: [PATCH 242/369] mm: abstract reading sysctl_max_map_count, and READ_ONCE() Concurrent reads and writes of sysctl_max_map_count are possible, so we should READ_ONCE() and WRITE_ONCE(). The sysctl procfs logic already enforces WRITE_ONCE(), so abstract the read side with get_sysctl_max_map_count(). While we're here, also move the field to mm/internal.h and add the getter there since only mm interacts with it, there's no need for anybody else to have access. Finally, update the VMA userland tests to reflect the change. 
Link: https://lkml.kernel.org/r/0715259eb37cbdfde4f9e5db92a20ec7110a1ce5.1773249037.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Pedro Falcato Cc: Jann Horn Cc: Jianzhou Zhao Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- mm/internal.h | 6 ++++++ mm/mmap.c | 2 +- mm/mremap.c | 4 ++-- mm/nommu.c | 2 +- mm/vma.c | 6 +++--- tools/testing/vma/include/custom.h | 3 --- tools/testing/vma/include/dup.h | 9 +++++++++ tools/testing/vma/main.c | 2 ++ 9 files changed, 24 insertions(+), 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index efb8be5d259c..25ba5816e02b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -207,8 +207,6 @@ static inline void __mm_zero_struct_page(struct page *page) #define MAPCOUNT_ELF_CORE_MARGIN (5) #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) -extern int sysctl_max_map_count; - extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; diff --git a/mm/internal.h b/mm/internal.h index f50a0376b87e..62d80fd37ae1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1863,4 +1863,10 @@ static inline int pmdp_test_and_clear_young_notify(struct vm_area_struct *vma, #endif /* CONFIG_MMU_NOTIFIER */ +extern int sysctl_max_map_count; +static inline int get_sysctl_max_map_count(void) +{ + return READ_ONCE(sysctl_max_map_count); +} + #endif /* __MM_INTERNAL_H */ diff --git a/mm/mmap.c b/mm/mmap.c index 843160946aa5..79544d893411 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -375,7 +375,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, return -EOVERFLOW; /* Too many mappings? 
*/ - if (mm->map_count > sysctl_max_map_count) + if (mm->map_count > get_sysctl_max_map_count()) return -ENOMEM; /* diff --git a/mm/mremap.c b/mm/mremap.c index e8c3021dd841..ba6c690f6c1b 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1045,7 +1045,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) * which may not merge, then (if MREMAP_DONTUNMAP is not set) unmap the * source, which may split, causing a net increase of 2 mappings. */ - if (current->mm->map_count + 2 > sysctl_max_map_count) + if (current->mm->map_count + 2 > get_sysctl_max_map_count()) return -ENOMEM; if (vma->vm_ops && vma->vm_ops->may_split) { @@ -1813,7 +1813,7 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) * net increased map count of 2. In move_vma() we check for headroom of * 2 additional mappings, so check early to avoid bailing out then. */ - if (current->mm->map_count + 4 > sysctl_max_map_count) + if (current->mm->map_count + 4 > get_sysctl_max_map_count()) return -ENOMEM; return 0; diff --git a/mm/nommu.c b/mm/nommu.c index c3a23b082adb..ed3934bc2de4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1317,7 +1317,7 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return -ENOMEM; mm = vma->vm_mm; - if (mm->map_count >= sysctl_max_map_count) + if (mm->map_count >= get_sysctl_max_map_count()) return -ENOMEM; region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL); diff --git a/mm/vma.c b/mm/vma.c index b7055c264b5d..4d21e7d8e93c 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -590,7 +590,7 @@ out_free_vma: static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { - if (vma->vm_mm->map_count >= sysctl_max_map_count) + if (vma->vm_mm->map_count >= get_sysctl_max_map_count()) return -ENOMEM; return __split_vma(vmi, vma, addr, new_below); @@ -1394,7 +1394,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, * its limit temporarily, to help free resources as expected. 
*/ if (vms->end < vms->vma->vm_end && - vms->vma->vm_mm->map_count >= sysctl_max_map_count) { + vms->vma->vm_mm->map_count >= get_sysctl_max_map_count()) { error = -ENOMEM; goto map_count_exceeded; } @@ -2868,7 +2868,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) return -ENOMEM; - if (mm->map_count > sysctl_max_map_count) + if (mm->map_count > get_sysctl_max_map_count()) return -ENOMEM; if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 7150e09122b2..6c62a38a2f6f 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -21,9 +21,6 @@ extern unsigned long dac_mmap_min_addr; #define VM_BUG_ON(_expr) (BUG_ON(_expr)) #define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr)) -/* We hardcode this for now. */ -#define sysctl_max_map_count 0x1000000UL - #define TASK_SIZE ((1ul << 47)-PAGE_SIZE) /* diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 5eb313beb43d..8865ffe046d8 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -419,6 +419,9 @@ struct vma_iterator { #define EMPTY_VMA_FLAGS ((vma_flags_t){ }) +#define MAPCOUNT_ELF_CORE_MARGIN (5) +#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) + /* What action should be taken after an .mmap_prepare call is complete? */ enum mmap_action_type { MMAP_NOTHING, /* Mapping is complete, no further action. 
*/ @@ -1342,3 +1345,9 @@ static inline void vma_set_file(struct vm_area_struct *vma, struct file *file) swap(vma->vm_file, file); fput(file); } + +extern int sysctl_max_map_count; +static inline int get_sysctl_max_map_count(void) +{ + return READ_ONCE(sysctl_max_map_count); +} diff --git a/tools/testing/vma/main.c b/tools/testing/vma/main.c index 49b09e97a51f..18338f5d29e0 100644 --- a/tools/testing/vma/main.c +++ b/tools/testing/vma/main.c @@ -14,6 +14,8 @@ #include "tests/mmap.c" #include "tests/vma.c" +int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; + /* Helper functions which utilise static kernel functions. */ struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg) From 0289955fc548525aa6c4b12ec36afbb7283725fb Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Wed, 11 Mar 2026 17:24:38 +0000 Subject: [PATCH 243/369] mm/mremap: check map count under mmap write lock and abstract We are checking the mmap count in check_mremap_params(), prior to obtaining an mmap write lock, which means that accesses to current->mm->map_count might race with this field being updated. Resolve this by only checking this field after the mmap write lock is held. Additionally, abstract this check into a helper function with extensive ASCII documentation of what's going on. 
Link: https://lkml.kernel.org/r/18be0b48eaa8e8804eb745974ee729c3ade0c687.1773249037.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reported-by: Jianzhou Zhao Closes: https://lore.kernel.org/all/1a7d4c26.6b46.19cdbe7eaf0.Coremail.luckd0g@163.com/ Reviewed-by: Pedro Falcato Cc: Jann Horn Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mremap.c | 88 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 75 insertions(+), 13 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index ba6c690f6c1b..ee46bbb031e6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1028,6 +1028,75 @@ static void vrm_stat_account(struct vma_remap_struct *vrm, mm->locked_vm += pages; } +static bool __check_map_count_against_split(struct mm_struct *mm, + bool before_unmaps) +{ + const int sys_map_count = get_sysctl_max_map_count(); + int map_count = mm->map_count; + + mmap_assert_write_locked(mm); + + /* + * At the point of shrinking the VMA, if new_len < old_len, we unmap + * thusly in the worst case: + * + * old_addr+old_len old_addr+old_len + * |---------------.----.---------| |---------------| |---------| + * | . . | -> | +1 | -1 | +1 | + * |---------------.----.---------| |---------------| |---------| + * old_addr+new_len old_addr+new_len + * + * At the point of removing the portion of an existing VMA to make space + * for the moved VMA if MREMAP_FIXED, we unmap thusly in the worst case: + * + * new_addr new_addr+new_len new_addr new_addr+new_len + * |----.---------------.---------| |----| |---------| + * | . . | -> | +1 | -1 | +1 | + * |----.---------------.---------| |----| |---------| + * + * Therefore, before we consider the move anything, we have to account + * for 2 additional VMAs possibly being created upon these unmappings. 
+ */ + if (before_unmaps) + map_count += 2; + + /* + * At the point of MOVING the VMA: + * + * We start by copying a VMA, which creates an additional VMA if no + * merge occurs, then if not MREMAP_DONTUNMAP, we unmap the source VMA. + * In the worst case we might then observe: + * + * new_addr new_addr+new_len new_addr new_addr+new_len + * |----| |---------| |----|---------------|---------| + * | | | | -> | | +1 | | + * |----| |---------| |----|---------------|---------| + * + * old_addr old_addr+old_len old_addr old_addr+old_len + * |----.---------------.---------| |----| |---------| + * | . . | -> | +1 | -1 | +1 | + * |----.---------------.---------| |----| |---------| + * + * Therefore we must check to ensure we have headroom of 2 additional + * VMAs. + */ + return map_count + 2 <= sys_map_count; +} + +/* Do we violate the map count limit if we split VMAs when moving the VMA? */ +static bool check_map_count_against_split(void) +{ + return __check_map_count_against_split(current->mm, + /*before_unmaps=*/false); +} + +/* Do we violate the map count limit if we split VMAs prior to early unmaps? */ +static bool check_map_count_against_split_early(void) +{ + return __check_map_count_against_split(current->mm, + /*before_unmaps=*/true); +} + /* * Perform checks before attempting to write a VMA prior to it being * moved. @@ -1045,7 +1114,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) * which may not merge, then (if MREMAP_DONTUNMAP is not set) unmap the * source, which may split, causing a net increase of 2 mappings. 
*/ - if (current->mm->map_count + 2 > get_sysctl_max_map_count()) + if (!check_map_count_against_split()) return -ENOMEM; if (vma->vm_ops && vma->vm_ops->may_split) { @@ -1804,18 +1873,6 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) if (vrm_overlaps(vrm)) return -EINVAL; - /* - * We may unmap twice before invoking move_vma(), that is if new_len < - * old_len (shrinking), and in the MREMAP_FIXED case, unmapping part of - * a VMA located at the destination. - * - * In the worst case, both unmappings will cause splits, resulting in a - * net increased map count of 2. In move_vma() we check for headroom of - * 2 additional mappings, so check early to avoid bailing out then. - */ - if (current->mm->map_count + 4 > get_sysctl_max_map_count()) - return -ENOMEM; - return 0; } @@ -1925,6 +1982,11 @@ static unsigned long do_mremap(struct vma_remap_struct *vrm) return -EINTR; vrm->mmap_locked = true; + if (!check_map_count_against_split_early()) { + mmap_write_unlock(mm); + return -ENOMEM; + } + if (vrm_move_only(vrm)) { res = remap_move(vrm); } else { From c63067e8b08495550be09afa646f9bf101762d0e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Mar 2026 22:29:22 -0700 Subject: [PATCH 244/369] mm/damon/core: fix wrong end address assignment on walk_system_ram() Patch series "mm/damon: support addr_unit on default monitoring targets for modules". DAMON_RECLAIM and DAMON_LRU_SORT support 'addr_unit' parameters only when the monitoring target address range is explicitly set. This was intentional for making the initial 'addr_unit' support change small. Now 'addr_unit' support is being quite stabilized. Having the corner case of the support is only making the code inconsistent with implicit rules. The inconsistency makes it easy to confuse [1] readers. After all, there is no real reason to keep 'addr_unit' support incomplete. Add the support for the case to improve the readability and more completely support 'addr_unit'. 
This series is constructed with five patches. The first one (patch 1) fixes a small bug that mistakenly assigns inclusive end address to open end address, which was found from this work. The second and third ones (patches 2 and 3) extend the default monitoring target setting functions in the core layer one by one, to support the 'addr_unit' while making no visible changes. The final two patches (patches 4 and 5) update DAMON_RECLAIM and DAMON_LRU_SORT to support 'addr_unit' for the default monitoring target address ranges, by passing the user input to the core functions. This patch (of 5): 'struct damon_addr_range' and 'struct resource' represent different types of address ranges. 'damon_addr_range' is for end-open ranges ([start, end)). 'resource' is for fully-closed ranges ([start, end]). But walk_system_ram() is assigning resource->end to damon_addr_range->end without the inclusiveness adjustment. As a result, the function returns an address range that is missing the last one byte. The function is being used to find and set the biggest system ram as the default monitoring target for DAMON_RECLAIM and DAMON_LRU_SORT. Missing the last byte of the big range shouldn't be a real problem for the real use cases. That said, the loss is definitely an unintended behavior. Do the correct adjustment. 
Link: https://lkml.kernel.org/r/20260311052927.93921-1-sj@kernel.org Link: https://lkml.kernel.org/r/20260311052927.93921-2-sj@kernel.org Link: https://lore.kernel.org/20260131015643.79158-1-sj@kernel.org [1] Fixes: 43b0536cb471 ("mm/damon: introduce DAMON-based Reclamation (DAMON_RECLAIM)") Signed-off-by: SeongJae Park Cc: Yang yingliang Signed-off-by: Andrew Morton --- mm/damon/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index b543d1202c9d..ce791b544b5d 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -3068,7 +3068,7 @@ static int walk_system_ram(struct resource *res, void *arg) if (a->end - a->start < resource_size(res)) { a->start = res->start; - a->end = res->end; + a->end = res->end + 1; } return 0; } From b47dcc1a28ccd5a175549055b7bc7a68a444ee92 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Mar 2026 22:29:23 -0700 Subject: [PATCH 245/369] mm/damon/core: support addr_unit on damon_find_biggest_system_ram() damon_find_biggest_system_ram() sets an 'unsigned long' variable with 'resource_size_t' value. This is fundamentally wrong. On environments such as ARM 32 bit machines having LPAE (Large Physical Address Extensions), which DAMON supports, the size of 'unsigned long' may be smaller than that of 'resource_size_t'. It is safe, though, since we restrict the walk to be done only up to ULONG_MAX. DAMON supports the address size gap using 'addr_unit'. We didn't add the support to the function, just to make the initial support change small. Now the support is reasonably settled. This kind of gap is only making the code inconsistent and easy to be confused. Add the support of 'addr_unit' to the function, by letting callers pass the 'addr_unit' and handling it in the function. All callers are passing 'addr_unit' 1, though, to keep the old behavior. 
[sj@kernel.org: verify found biggest system ram] Link: https://lkml.kernel.org/r/20260317144725.88524-1-sj@kernel.org Link: https://lkml.kernel.org/r/20260311052927.93921-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Yang yingliang Signed-off-by: Andrew Morton --- mm/damon/core.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index ce791b544b5d..f5f46ba5d537 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -3064,31 +3064,43 @@ done: static int walk_system_ram(struct resource *res, void *arg) { - struct damon_addr_range *a = arg; + struct resource *a = arg; - if (a->end - a->start < resource_size(res)) { + if (resource_size(a) < resource_size(res)) { a->start = res->start; - a->end = res->end + 1; + a->end = res->end; } return 0; } +static unsigned long damon_res_to_core_addr(resource_size_t ra, + unsigned long addr_unit) +{ + /* + * Use div_u64() for avoiding linking errors related with __udivdi3, + * __aeabi_uldivmod, or similar problems. This should also improve the + * performance optimization (read div_u64() comment for the detail). + */ + if (sizeof(ra) == 8 && sizeof(addr_unit) == 4) + return div_u64(ra, addr_unit); + return ra / addr_unit; +} + /* * Find biggest 'System RAM' resource and store its start and end address in * @start and @end, respectively. If no System RAM is found, returns false. 
*/ static bool damon_find_biggest_system_ram(unsigned long *start, - unsigned long *end) + unsigned long *end, unsigned long addr_unit) { - struct damon_addr_range arg = {}; + struct resource res = {}; - walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); - if (arg.end <= arg.start) + walk_system_ram_res(0, -1, &res, walk_system_ram); + *start = damon_res_to_core_addr(res.start, addr_unit); + *end = damon_res_to_core_addr(res.end + 1, addr_unit); + if (*end <= *start) return false; - - *start = arg.start; - *end = arg.end; return true; } @@ -3118,7 +3130,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, return -EINVAL; if (!*start && !*end && - !damon_find_biggest_system_ram(start, end)) + !damon_find_biggest_system_ram(start, end, 1)) return -EINVAL; addr_range.start = *start; From eabc2eddb2767e0ed90f98a65744bf4c8e287db7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Mar 2026 22:29:24 -0700 Subject: [PATCH 246/369] mm/damon/core: receive addr_unit on damon_set_region_biggest_system_ram_default() damon_find_biggest_system_ram() was not supporting addr_unit in the past. Hence, its caller, damon_set_region_biggest_system_ram_default(), was also not supporting addr_unit. The previous commit has updated the inner function to support addr_unit. There is no more reason to not support addr_unit on damon_set_region_biggest_system_ram_default(). Rather, it makes unnecessary inconsistency on support of addr_unit. Update it to receive addr_unit and handle it inside. 
Link: https://lkml.kernel.org/r/20260311052927.93921-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Yang yingliang Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + mm/damon/core.c | 7 ++++--- mm/damon/lru_sort.c | 1 + mm/damon/reclaim.c | 1 + 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index e44e2132ccaf..d9a3babbafc1 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -994,6 +994,7 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end, + unsigned long addr_unit, unsigned long min_region_sz); #endif /* CONFIG_DAMON */ diff --git a/mm/damon/core.c b/mm/damon/core.c index f5f46ba5d537..01c892a1dcd2 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -3110,6 +3110,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start, * @t: The monitoring target to set the region. * @start: The pointer to the start address of the region. * @end: The pointer to the end address of the region. + * @addr_unit: The address unit for the damon_ctx of @t. * @min_region_sz: Minimum region size. * * This function sets the region of @t as requested by @start and @end. 
If the @@ -3122,7 +3123,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start, */ int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end, - unsigned long min_region_sz) + unsigned long addr_unit, unsigned long min_region_sz) { struct damon_addr_range addr_range; @@ -3130,12 +3131,12 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, return -EINVAL; if (!*start && !*end && - !damon_find_biggest_system_ram(start, end, 1)) + !damon_find_biggest_system_ram(start, end, addr_unit)) return -EINVAL; addr_range.start = *start; addr_range.end = *end; - return damon_set_regions(t, &addr_range, 1, min_region_sz); + return damon_set_regions(t, &addr_range, addr_unit, min_region_sz); } /* diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 7bc5c0b2aea3..133ea17e258d 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -345,6 +345,7 @@ static int damon_lru_sort_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, &monitor_region_end, + param_ctx->addr_unit, param_ctx->min_region_sz); if (err) goto out; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 43d76f5bed44..01f2f6cdbcdf 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -251,6 +251,7 @@ static int damon_reclaim_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, &monitor_region_end, + param_ctx->addr_unit, param_ctx->min_region_sz); if (err) goto out; From 5f9a5926b75c12ac01b9c2eae98c43939272632b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 13 Mar 2026 16:49:08 -0700 Subject: [PATCH 247/369] mm/damon/core: fix wrong damon_set_regions() argument The third argument is the length of the second parameter. But addr_unit is wrongly being passed. Fix it. 
Link: https://lkml.kernel.org/r/20260314001854.79623-1-sj@kernel.org Signed-off-by: SeongJae Park Cc: Yang yingliang Signed-off-by: Andrew Morton --- mm/damon/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 01c892a1dcd2..f342bee002dc 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -3136,7 +3136,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, addr_range.start = *start; addr_range.end = *end; - return damon_set_regions(t, &addr_range, addr_unit, min_region_sz); + return damon_set_regions(t, &addr_range, 1, min_region_sz); } /* From fdfcda8d08df9055bc7e081a919ee0463c3a926f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Mar 2026 22:29:25 -0700 Subject: [PATCH 248/369] mm/damon/reclaim: respect addr_unit on default monitoring region setup In the past, damon_set_region_biggest_system_ram_default(), which is the core function for setting the default monitoring target region of DAMON_RECLAIM, didn't support addr_unit. Hence DAMON_RECLAIM was silently ignoring the user input for addr_unit when the user doesn't explicitly set the monitoring target regions, and therefore the default target region is being used. No real problem from that ignorance was reported so far. But, the implicit rule is only making things confusing. Also, the default target region setup function is updated to support addr_unit. Hence there is no reason to keep ignoring it. Respect the user-passed addr_unit for the default target monitoring region use case. 
Link: https://lkml.kernel.org/r/20260311052927.93921-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Yang yingliang Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 01f2f6cdbcdf..86da14778658 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -201,12 +201,6 @@ static int damon_reclaim_apply_parameters(void) if (err) return err; - /* - * If monitor_region_start/end are unset, always silently - * reset addr_unit to 1. - */ - if (!monitor_region_start && !monitor_region_end) - addr_unit = 1; param_ctx->addr_unit = addr_unit; param_ctx->min_region_sz = max(DAMON_MIN_REGION_SZ / addr_unit, 1); From 8c6a765f4ad7f95bb0288637d9fafbf630871838 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 10 Mar 2026 22:29:26 -0700 Subject: [PATCH 249/369] mm/damon/lru_sort: respect addr_unit on default monitoring region setup In the past, damon_set_region_biggest_system_ram_default(), which is the core function for setting the default monitoring target region of DAMON_LRU_SORT, didn't support addr_unit. Hence DAMON_LRU_SORT was silently ignoring the user input for addr_unit when the user doesn't explicitly set the monitoring target regions, and therefore the default target region is being used. No real problem from that ignorance was reported so far. But, the implicit rule is only making things confusing. Also, the default target region setup function is updated to support addr_unit. Hence there is no reason to keep ignoring it. Respect the user-passed addr_unit for the default target monitoring region use case. 
Link: https://lkml.kernel.org/r/20260311052927.93921-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Yang yingliang Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 133ea17e258d..554559d72976 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -291,12 +291,6 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; - /* - * If monitor_region_start/end are unset, always silently - * reset addr_unit to 1. - */ - if (!monitor_region_start && !monitor_region_end) - addr_unit = 1; param_ctx->addr_unit = addr_unit; param_ctx->min_region_sz = max(DAMON_MIN_REGION_SZ / addr_unit, 1); From 0217c7fb4de4a40cee667eb21901f3204effe5ac Mon Sep 17 00:00:00 2001 From: Jianhui Zhou Date: Tue, 10 Mar 2026 19:05:26 +0800 Subject: [PATCH 250/369] mm/userfaultfd: fix hugetlb fault mutex hash calculation In mfill_atomic_hugetlb(), linear_page_index() is used to calculate the page index for hugetlb_fault_mutex_hash(). However, linear_page_index() returns the index in PAGE_SIZE units, while hugetlb_fault_mutex_hash() expects the index in huge page units. This mismatch means that different addresses within the same huge page can produce different hash values, leading to the use of different mutexes for the same huge page. This can cause races between faulting threads, which can corrupt the reservation map and trigger the BUG_ON in resv_map_release(). Fix this by introducing hugetlb_linear_page_index(), which returns the page index in huge page granularity, and using it in place of linear_page_index(). 
Link: https://lkml.kernel.org/r/20260310110526.335749-1-jianhuizzzzz@gmail.com Fixes: a08c7193e4f1 ("mm/filemap: remove hugetlb special casing in filemap.c") Signed-off-by: Jianhui Zhou Reported-by: syzbot+f525fd79634858f478e7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=f525fd79634858f478e7 Acked-by: SeongJae Park Reviewed-by: David Hildenbrand (Arm) Acked-by: Mike Rapoport (Microsoft) Cc: Jane Chu Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: JonasZhou Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Xu Cc: SeongJae Park Cc: Sidhartha Kumar Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 17 +++++++++++++++++ mm/userfaultfd.c | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index aaf3d472e6b5..9c098a02a09e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -792,6 +792,23 @@ static inline unsigned huge_page_shift(struct hstate *h) return h->order + PAGE_SHIFT; } +/** + * hugetlb_linear_page_index() - linear_page_index() but in hugetlb + * page size granularity. + * @vma: the hugetlb VMA + * @address: the virtual address within the VMA + * + * Return: the page offset within the mapping in huge page units. + */ +static inline pgoff_t hugetlb_linear_page_index(struct vm_area_struct *vma, + unsigned long address) +{ + struct hstate *h = hstate_vma(vma); + + return ((address - vma->vm_start) >> huge_page_shift(h)) + + (vma->vm_pgoff >> huge_page_order(h)); +} + static inline bool order_is_gigantic(unsigned int order) { return order > MAX_PAGE_ORDER; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e19872e51878..2c565c7134b6 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -573,7 +573,7 @@ retry: * in the case of shared pmds. fault mutex prevents * races with other faulting threads. 
*/ - idx = linear_page_index(dst_vma, dst_addr); + idx = hugetlb_linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); From a91fd9f710490a89713823be3e7790ac59a085f8 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Wed, 25 Mar 2026 05:40:18 -0600 Subject: [PATCH 251/369] mm: consolidate anonymous folio PTE mapping into helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: khugepaged cleanups and mTHP prerequisites", v4. The following series contains cleanups and prerequisites for my work on khugepaged mTHP support [1]. These have been separated out to ease review. The first patch in the series refactors the page fault folio to pte mapping and follows a similar convention as defined by map_anon_folio_pmd_(no)pf(). This not only cleans up the current implementation of do_anonymous_page(), but will allow for reuse later in the khugepaged mTHP implementation. The second patch adds a small is_pmd_order() helper to check if an order is the PMD order. This check is open-coded in a number of places. This patch aims to clean this up and will be used more in the khugepaged mTHP work. The third patch also adds a small DEFINE for (HPAGE_PMD_NR - 1) which is used often across the khugepaged code. The fourth and fifth patch come from the khugepaged mTHP patchset [1]. These two patches include the rename of function prefixes, and the unification of khugepaged and madvise_collapse via a new collapse_single_pmd function. Patch 1: refactor do_anonymous_page into map_anon_folio_pte_(no)pf Patch 2: add is_pmd_order helper Patch 3: Add define for (HPAGE_PMD_NR - 1) Patch 4: Refactor/rename hpage_collapse Patch 5: Refactoring to combine madvise_collapse and khugepaged A big thanks to everyone that has reviewed, tested, and participated in the development process. 
This patch (of 5): The anonymous page fault handler in do_anonymous_page() open-codes the sequence to map a newly allocated anonymous folio at the PTE level: - construct the PTE entry - add rmap - add to LRU - set the PTEs - update the MMU cache. Introduce two helpers to consolidate this duplicated logic, mirroring the existing map_anon_folio_pmd_nopf() pattern for PMD-level mappings: map_anon_folio_pte_nopf(): constructs the PTE entry, takes folio references, adds anon rmap and LRU. This function also handles the uffd_wp that can occur in the pf variant. The future khugepaged mTHP code calls this to handle mapping the new collapsed mTHP to its folio. map_anon_folio_pte_pf(): extends the nopf variant to handle MM_ANONPAGES counter updates, and mTHP fault allocation statistics for the page fault path. The zero-page read path in do_anonymous_page() is also untangled from the shared setpte label, since it does not allocate a folio and should not share the same mapping sequence as the write path. We can now leave nr_pages undeclared at the function initialization, and use the single page update_mmu_cache function to handle the zero page update. 
This refactoring will also help reduce code duplication between mm/memory.c and mm/khugepaged.c, and provides a clean API for PTE-level anonymous folio mapping that can be reused by future callers (like khugpeaged mTHP support) Link: https://lkml.kernel.org/r/20260325114022.444081-1-npache@redhat.com Link: https://lkml.kernel.org/r/20260325114022.444081-2-npache@redhat.com Link: https://lore.kernel.org/all/20260122192841.128719-1-npache@redhat.com Signed-off-by: Nico Pache Suggested-by: Lorenzo Stoakes (Oracle) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Dev Jain Reviewed-by: Lance Yang Acked-by: David Hildenbrand (Arm) Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Baolin Wang Cc: Barry Song Cc: Brendan Jackman Cc: Byungchul Park Cc: Catalin Marinas Cc: David Rientjes Cc: Gregory Price Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Joshua Hahn Cc: Kefeng Wang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Matthew Brost Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Nanyong Sun Cc: Pedro Falcato Cc: Peter Xu Cc: Rafael Aquini Cc: Rakie Kim Cc: Randy Dunlap Cc: Ryan Roberts Cc: Shivank Garg Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Takashi Iwai (SUSE) Cc: Thomas Hellström Cc: Usama Arif Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Wei Yang Cc: Will Deacon Cc: Yang Shi Cc: Zach O'Keefe Cc: Zi Yan Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 +++ mm/memory.c | 61 +++++++++++++++++++++++++++++++--------------- 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 25ba5816e02b..16a1ad9a3397 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4916,4 +4916,8 @@ static inline bool snapshot_page_is_faithful(const struct page_snapshot *ps) void snapshot_page(struct page_snapshot *ps, const struct page *page); +void 
map_anon_folio_pte_nopf(struct folio *folio, pte_t *pte, + struct vm_area_struct *vma, unsigned long addr, + bool uffd_wp); + #endif /* _LINUX_MM_H */ diff --git a/mm/memory.c b/mm/memory.c index f21c804b50bf..7c350a38fecf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5197,6 +5197,37 @@ fallback: return folio_prealloc(vma->vm_mm, vma, vmf->address, true); } +void map_anon_folio_pte_nopf(struct folio *folio, pte_t *pte, + struct vm_area_struct *vma, unsigned long addr, + bool uffd_wp) +{ + const unsigned int nr_pages = folio_nr_pages(folio); + pte_t entry = folio_mk_pte(folio, vma->vm_page_prot); + + entry = pte_sw_mkyoung(entry); + + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry), vma); + if (uffd_wp) + entry = pte_mkuffd_wp(entry); + + folio_ref_add(folio, nr_pages - 1); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); + folio_add_lru_vma(folio, vma); + set_ptes(vma->vm_mm, addr, pte, entry, nr_pages); + update_mmu_cache_range(NULL, vma, addr, pte, nr_pages); +} + +static void map_anon_folio_pte_pf(struct folio *folio, pte_t *pte, + struct vm_area_struct *vma, unsigned long addr, bool uffd_wp) +{ + const unsigned int order = folio_order(folio); + + map_anon_folio_pte_nopf(folio, pte, vma, addr, uffd_wp); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1L << order); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_ALLOC); +} + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -5208,7 +5239,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) unsigned long addr = vmf->address; struct folio *folio; vm_fault_t ret = 0; - int nr_pages = 1; + int nr_pages; pte_t entry; /* File mapping without ->vm_ops ? 
*/ @@ -5243,7 +5274,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_MISSING); } - goto setpte; + if (vmf_orig_pte_uffd_wp(vmf)) + entry = pte_mkuffd_wp(entry); + set_pte_at(vma->vm_mm, addr, vmf->pte, entry); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, addr, vmf->pte); + goto unlock; } /* Allocate our own private page. */ @@ -5267,11 +5304,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) */ __folio_mark_uptodate(folio); - entry = folio_mk_pte(folio, vma->vm_page_prot); - entry = pte_sw_mkyoung(entry); - if (vma->vm_flags & VM_WRITE) - entry = pte_mkwrite(pte_mkdirty(entry), vma); - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) goto release; @@ -5293,19 +5325,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) folio_put(folio); return handle_userfault(vmf, VM_UFFD_MISSING); } - - folio_ref_add(folio, nr_pages - 1); - add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); - count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); - folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); - folio_add_lru_vma(folio, vma); -setpte: - if (vmf_orig_pte_uffd_wp(vmf)) - entry = pte_mkuffd_wp(entry); - set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages); - - /* No need to invalidate - it was non-present before */ - update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages); + map_anon_folio_pte_pf(folio, vmf->pte, vma, addr, + vmf_orig_pte_uffd_wp(vmf)); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); From b90c453d2664ba445383956560581f9db708584f Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Wed, 25 Mar 2026 05:40:19 -0600 Subject: [PATCH 252/369] mm: introduce is_pmd_order helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to add mTHP support to khugepaged, we will often be checking if a given order is 
(or is not) a PMD order. Some places in the kernel already use this check, so lets create a simple helper function to keep the code clean and readable. Link: https://lkml.kernel.org/r/20260325114022.444081-3-npache@redhat.com Signed-off-by: Nico Pache Acked-by: David Hildenbrand (Arm) Reviewed-by: Baolin Wang Reviewed-by: Dev Jain Reviewed-by: Wei Yang Reviewed-by: Lance Yang Reviewed-by: Barry Song Reviewed-by: Zi Yan Reviewed-by: Pedro Falcato Reviewed-by: Lorenzo Stoakes Suggested-by: Lorenzo Stoakes Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Brendan Jackman Cc: Byungchul Park Cc: Catalin Marinas Cc: David Rientjes Cc: Gregory Price Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Joshua Hahn Cc: Kefeng Wang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Matthew Brost Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Nanyong Sun Cc: Peter Xu Cc: Rafael Aquini Cc: Rakie Kim Cc: Randy Dunlap Cc: Ryan Roberts Cc: Shivank Garg Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Takashi Iwai (SUSE) Cc: Thomas Hellström Cc: Usama Arif Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 5 +++++ mm/huge_memory.c | 2 +- mm/khugepaged.c | 6 +++--- mm/memory.c | 2 +- mm/mempolicy.c | 2 +- mm/page_alloc.c | 4 ++-- mm/shmem.c | 3 +-- 7 files changed, 14 insertions(+), 10 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a4d9f964dfde..bd7f0e1d8094 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -771,6 +771,11 @@ static inline bool pmd_is_huge(pmd_t pmd) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline bool is_pmd_order(unsigned int order) +{ + return order == HPAGE_PMD_ORDER; +} + static inline int split_folio_to_list_to_order(struct folio *folio, struct list_head *list, 
int new_order) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9fea52ccad56..1c1a7cf7b209 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -4159,7 +4159,7 @@ out_unlock: i_mmap_unlock_read(mapping); out: xas_destroy(&xas); - if (old_order == HPAGE_PMD_ORDER) + if (is_pmd_order(old_order)) count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); count_mthp_stat(old_order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED); return ret; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index f972a9a65e3a..c6a5d9d1f252 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1540,7 +1540,7 @@ static enum scan_result try_collapse_pte_mapped_thp(struct mm_struct *mm, unsign if (IS_ERR(folio)) return SCAN_PAGE_NULL; - if (folio_order(folio) != HPAGE_PMD_ORDER) { + if (!is_pmd_order(folio_order(folio))) { result = SCAN_PAGE_COMPOUND; goto drop_folio; } @@ -2023,7 +2023,7 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr, * we locked the first folio, then a THP might be there already. * This will be discovered on the first iteration. 
*/ - if (folio_order(folio) == HPAGE_PMD_ORDER) { + if (is_pmd_order(folio_order(folio))) { result = SCAN_PTE_MAPPED_HUGEPAGE; goto out_unlock; } @@ -2351,7 +2351,7 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, continue; } - if (folio_order(folio) == HPAGE_PMD_ORDER) { + if (is_pmd_order(folio_order(folio))) { result = SCAN_PTE_MAPPED_HUGEPAGE; /* * PMD-sized THP implies that we can only try diff --git a/mm/memory.c b/mm/memory.c index 7c350a38fecf..6d54e5ec82f2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5435,7 +5435,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return ret; - if (folio_order(folio) != HPAGE_PMD_ORDER) + if (!is_pmd_order(folio_order(folio))) return ret; page = &folio->page; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0e5175f1c767..e5528c35bbb8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2449,7 +2449,7 @@ static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && /* filter "hugepage" allocation, unless from alloc_pages() */ - order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { + is_pmd_order(order) && ilx != NO_INTERLEAVE_INDEX) { /* * For hugepage allocation and non-interleave policy which * allows the current node (or other explicitly preferred diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 937e9b850709..cdde59e56a55 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -651,7 +651,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order) #ifdef CONFIG_TRANSPARENT_HUGEPAGE bool movable; if (order > PAGE_ALLOC_COSTLY_ORDER) { - VM_BUG_ON(order != HPAGE_PMD_ORDER); + VM_BUG_ON(!is_pmd_order(order)); movable = migratetype == MIGRATE_MOVABLE; @@ -683,7 +683,7 @@ static inline bool pcp_allowed_order(unsigned int order) if (order <= PAGE_ALLOC_COSTLY_ORDER) return true; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (order == 
HPAGE_PMD_ORDER) + if (is_pmd_order(order)) return true; #endif return false; diff --git a/mm/shmem.c b/mm/shmem.c index 5e7dcf5bc5d3..6fa1e8340c93 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5558,8 +5558,7 @@ static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "inherit")) { /* Do not override huge allocation policy with non-PMD sized mTHP */ - if (shmem_huge == SHMEM_HUGE_FORCE && - order != HPAGE_PMD_ORDER) + if (shmem_huge == SHMEM_HUGE_FORCE && !is_pmd_order(order)) return -EINVAL; spin_lock(&huge_shmem_orders_lock); From 36da8a88fd3b519fd31b2f31bbea1b189df6cd8c Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Wed, 25 Mar 2026 05:40:20 -0600 Subject: [PATCH 253/369] mm/khugepaged: define KHUGEPAGED_MAX_PTES_LIMIT as HPAGE_PMD_NR - 1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The value (HPAGE_PMD_NR - 1) is used often in the khugepaged code to signify the limit of the max_ptes_* values. Add a define for this to increase code readability and reuse. 
Link: https://lkml.kernel.org/r/20260325114022.444081-4-npache@redhat.com Signed-off-by: Nico Pache Acked-by: Pedro Falcato Acked-by: David Hildenbrand (Arm) Suggested-by: Lorenzo Stoakes (Oracle) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Baolin Wang Reviewed-by: Zi Yan Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Barry Song Cc: Brendan Jackman Cc: Byungchul Park Cc: Catalin Marinas Cc: David Rientjes Cc: Dev Jain Cc: Gregory Price Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Joshua Hahn Cc: Kefeng Wang Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Matthew Brost Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Nanyong Sun Cc: Peter Xu Cc: Rafael Aquini Cc: Rakie Kim Cc: Randy Dunlap Cc: Ryan Roberts Cc: Shivank Garg Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Takashi Iwai (SUSE) Cc: Thomas Hellström Cc: Usama Arif Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Wei Yang Cc: Will Deacon Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/khugepaged.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c6a5d9d1f252..322964d07c1d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -89,6 +89,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); * * Note that these are only respected if collapse was initiated by khugepaged. 
*/ +#define KHUGEPAGED_MAX_PTES_LIMIT (HPAGE_PMD_NR - 1) unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; static unsigned int khugepaged_max_ptes_shared __read_mostly; @@ -259,7 +260,7 @@ static ssize_t max_ptes_none_store(struct kobject *kobj, unsigned long max_ptes_none; err = kstrtoul(buf, 10, &max_ptes_none); - if (err || max_ptes_none > HPAGE_PMD_NR - 1) + if (err || max_ptes_none > KHUGEPAGED_MAX_PTES_LIMIT) return -EINVAL; khugepaged_max_ptes_none = max_ptes_none; @@ -284,7 +285,7 @@ static ssize_t max_ptes_swap_store(struct kobject *kobj, unsigned long max_ptes_swap; err = kstrtoul(buf, 10, &max_ptes_swap); - if (err || max_ptes_swap > HPAGE_PMD_NR - 1) + if (err || max_ptes_swap > KHUGEPAGED_MAX_PTES_LIMIT) return -EINVAL; khugepaged_max_ptes_swap = max_ptes_swap; @@ -310,7 +311,7 @@ static ssize_t max_ptes_shared_store(struct kobject *kobj, unsigned long max_ptes_shared; err = kstrtoul(buf, 10, &max_ptes_shared); - if (err || max_ptes_shared > HPAGE_PMD_NR - 1) + if (err || max_ptes_shared > KHUGEPAGED_MAX_PTES_LIMIT) return -EINVAL; khugepaged_max_ptes_shared = max_ptes_shared; @@ -382,7 +383,7 @@ int __init khugepaged_init(void) return -ENOMEM; khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; - khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; + khugepaged_max_ptes_none = KHUGEPAGED_MAX_PTES_LIMIT; khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; From ff7e03a87169d3c2b05f86e7e96456ab62e6cbb1 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Wed, 25 Mar 2026 05:40:21 -0600 Subject: [PATCH 254/369] mm/khugepaged: rename hpage_collapse_* to collapse_* MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hpage_collapse functions describe functions used by madvise_collapse and khugepaged. remove the unnecessary hpage prefix to shorten the function name. 
Link: https://lkml.kernel.org/r/20260325114022.444081-5-npache@redhat.com Signed-off-by: Nico Pache Reviewed-by: Dev Jain Reviewed-by: Wei Yang Reviewed-by: Lance Yang Reviewed-by: Liam R. Howlett Reviewed-by: Zi Yan Reviewed-by: Baolin Wang Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand (Arm) Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Barry Song Cc: Brendan Jackman Cc: Byungchul Park Cc: Catalin Marinas Cc: David Rientjes Cc: Gregory Price Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Joshua Hahn Cc: Kefeng Wang Cc: Lorenzo Stoakes (Oracle) Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Matthew Brost Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Nanyong Sun Cc: Pedro Falcato Cc: Peter Xu Cc: Rafael Aquini Cc: Rakie Kim Cc: Randy Dunlap Cc: Ryan Roberts Cc: Shivank Garg Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Takashi Iwai (SUSE) Cc: Thomas Hellström Cc: Usama Arif Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/khugepaged.c | 60 ++++++++++++++++++++++++------------------------- mm/mremap.c | 2 +- 2 files changed, 30 insertions(+), 32 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 322964d07c1d..c99e33094963 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -395,14 +395,14 @@ void __init khugepaged_destroy(void) kmem_cache_destroy(mm_slot_cache); } -static inline int hpage_collapse_test_exit(struct mm_struct *mm) +static inline int collapse_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } -static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) +static inline int collapse_test_exit_or_disable(struct mm_struct *mm) { - return hpage_collapse_test_exit(mm) || + return collapse_test_exit(mm) || mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); } @@ -436,7 +436,7 @@ void __khugepaged_enter(struct 
mm_struct *mm) int wakeup; /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); + VM_BUG_ON_MM(collapse_test_exit(mm), mm); if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) return; @@ -490,7 +490,7 @@ void __khugepaged_exit(struct mm_struct *mm) } else if (slot) { /* * This is required to serialize against - * hpage_collapse_test_exit() (which is guaranteed to run + * collapse_test_exit() (which is guaranteed to run * under mmap sem read mode). Stop here (after we return all * pagetables will be destroyed) until khugepaged has finished * working on the pagetables under the mmap_lock. @@ -589,7 +589,7 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } - /* See hpage_collapse_scan_pmd(). */ + /* See collapse_scan_pmd(). */ if (folio_maybe_mapped_shared(folio)) { ++shared; if (cc->is_khugepaged && @@ -840,7 +840,7 @@ static struct collapse_control khugepaged_collapse_control = { .is_khugepaged = true, }; -static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) +static bool collapse_scan_abort(int nid, struct collapse_control *cc) { int i; @@ -875,7 +875,7 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) } #ifdef CONFIG_NUMA -static int hpage_collapse_find_target_node(struct collapse_control *cc) +static int collapse_find_target_node(struct collapse_control *cc) { int nid, target_node = 0, max_value = 0; @@ -894,7 +894,7 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc) return target_node; } #else -static int hpage_collapse_find_target_node(struct collapse_control *cc) +static int collapse_find_target_node(struct collapse_control *cc) { return 0; } @@ -913,7 +913,7 @@ static enum scan_result hugepage_vma_revalidate(struct mm_struct *mm, unsigned l enum tva_type type = cc->is_khugepaged ? 
TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE; - if (unlikely(hpage_collapse_test_exit_or_disable(mm))) + if (unlikely(collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; *vmap = vma = find_vma(mm, address); @@ -984,7 +984,7 @@ static enum scan_result check_pmd_still_valid(struct mm_struct *mm, /* * Bring missing pages in from swap, to complete THP collapse. - * Only done if hpage_collapse_scan_pmd believes it is worthwhile. + * Only done if khugepaged_scan_pmd believes it is worthwhile. * * Called and returns without pte mapped or spinlocks held. * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. @@ -1070,7 +1070,7 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru { gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : GFP_TRANSHUGE); - int node = hpage_collapse_find_target_node(cc); + int node = collapse_find_target_node(cc); struct folio *folio; folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask); @@ -1248,7 +1248,7 @@ out_nolock: return result; } -static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm, +static enum scan_result collapse_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked, struct collapse_control *cc) { @@ -1373,7 +1373,7 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm, * hit record. 
*/ node = folio_nid(folio); - if (hpage_collapse_scan_abort(node, cc)) { + if (collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } @@ -1439,7 +1439,7 @@ static void collect_mm_slot(struct mm_slot *slot) lockdep_assert_held(&khugepaged_mm_lock); - if (hpage_collapse_test_exit(mm)) { + if (collapse_test_exit(mm)) { /* free mm_slot */ hash_del(&slot->hash); list_del(&slot->mm_node); @@ -1794,7 +1794,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) continue; - if (hpage_collapse_test_exit(mm)) + if (collapse_test_exit(mm)) continue; if (!file_backed_vma_is_retractable(vma)) @@ -2310,7 +2310,7 @@ out: return result; } -static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, +static enum scan_result collapse_scan_file(struct mm_struct *mm, unsigned long addr, struct file *file, pgoff_t start, struct collapse_control *cc) { @@ -2363,7 +2363,7 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, } node = folio_nid(folio); - if (hpage_collapse_scan_abort(node, cc)) { + if (collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; folio_put(folio); break; @@ -2417,7 +2417,7 @@ static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, return result; } -static void khugepaged_scan_mm_slot(unsigned int progress_max, +static void collapse_scan_mm_slot(unsigned int progress_max, enum scan_result *result, struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) @@ -2451,7 +2451,7 @@ static void khugepaged_scan_mm_slot(unsigned int progress_max, goto breakouterloop_mmap_lock; cc->progress++; - if (unlikely(hpage_collapse_test_exit_or_disable(mm))) + if (unlikely(collapse_test_exit_or_disable(mm))) goto breakouterloop; vma_iter_init(&vmi, mm, khugepaged_scan.address); @@ -2459,7 +2459,7 @@ static void khugepaged_scan_mm_slot(unsigned int progress_max, unsigned long hstart, hend; 
cond_resched(); - if (unlikely(hpage_collapse_test_exit_or_disable(mm))) { + if (unlikely(collapse_test_exit_or_disable(mm))) { cc->progress++; break; } @@ -2481,7 +2481,7 @@ static void khugepaged_scan_mm_slot(unsigned int progress_max, bool mmap_locked = true; cond_resched(); - if (unlikely(hpage_collapse_test_exit_or_disable(mm))) + if (unlikely(collapse_test_exit_or_disable(mm))) goto breakouterloop; VM_BUG_ON(khugepaged_scan.address < hstart || @@ -2494,12 +2494,12 @@ static void khugepaged_scan_mm_slot(unsigned int progress_max, mmap_read_unlock(mm); mmap_locked = false; - *result = hpage_collapse_scan_file(mm, + *result = collapse_scan_file(mm, khugepaged_scan.address, file, pgoff, cc); fput(file); if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { mmap_read_lock(mm); - if (hpage_collapse_test_exit_or_disable(mm)) + if (collapse_test_exit_or_disable(mm)) goto breakouterloop; *result = try_collapse_pte_mapped_thp(mm, khugepaged_scan.address, false); @@ -2508,7 +2508,7 @@ static void khugepaged_scan_mm_slot(unsigned int progress_max, mmap_read_unlock(mm); } } else { - *result = hpage_collapse_scan_pmd(mm, vma, + *result = collapse_scan_pmd(mm, vma, khugepaged_scan.address, &mmap_locked, cc); } @@ -2540,7 +2540,7 @@ breakouterloop_mmap_lock: * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm, or THP got disabled. 
*/ - if (hpage_collapse_test_exit_or_disable(mm) || !vma) { + if (collapse_test_exit_or_disable(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find @@ -2593,7 +2593,7 @@ static void khugepaged_do_scan(struct collapse_control *cc) pass_through_head++; if (khugepaged_has_work() && pass_through_head < 2) - khugepaged_scan_mm_slot(progress_max, &result, cc); + collapse_scan_mm_slot(progress_max, &result, cc); else cc->progress = progress_max; spin_unlock(&khugepaged_mm_lock); @@ -2838,8 +2838,7 @@ retry: mmap_read_unlock(mm); mmap_locked = false; *lock_dropped = true; - result = hpage_collapse_scan_file(mm, addr, file, pgoff, - cc); + result = collapse_scan_file(mm, addr, file, pgoff, cc); if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb && mapping_can_writeback(file->f_mapping)) { @@ -2853,8 +2852,7 @@ retry: } fput(file); } else { - result = hpage_collapse_scan_pmd(mm, vma, addr, - &mmap_locked, cc); + result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, cc); } if (!mmap_locked) *lock_dropped = true; diff --git a/mm/mremap.c b/mm/mremap.c index ee46bbb031e6..36b3f1caebad 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -244,7 +244,7 @@ static int move_ptes(struct pagetable_move_control *pmc, goto out; } /* - * Now new_pte is none, so hpage_collapse_scan_file() path can not find + * Now new_pte is none, so collapse_scan_file() path can not find * this by traversing file->f_mapping, so there is no concurrency with * retract_page_tables(). 
In addition, we already hold the exclusive * mmap_lock, so this new_pte page is stable, so there is no need to get From a155d945b73c5b0668e898df5495afe45bb261cd Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Wed, 25 Mar 2026 05:40:22 -0600 Subject: [PATCH 255/369] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The khugepaged daemon and madvise_collapse have two different implementations that do almost the same thing. Create collapse_single_pmd to increase code reuse and create an entry point to these two users. Refactor madvise_collapse and collapse_scan_mm_slot to use the new collapse_single_pmd function. To help reduce confusion around the mmap_locked variable, we rename mmap_locked to lock_dropped in the collapse_scan_mm_slot() function, and remove the redundant mmap_locked in madvise_collapse(); this further unifies the code readability. The SCAN_PTE_MAPPED_HUGEPAGE enum is no longer reachable in the madvise_collapse() function, so we drop it from the list of "continuing" enums. This introduces a minor behavioral change that is most likely an undiscovered bug. The current implementation of khugepaged tests collapse_test_exit_or_disable() before calling collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse case. By unifying these two callers madvise_collapse now also performs this check. We also modify the return value to be SCAN_ANY_PROCESS which properly indicates that this process is no longer valid to operate on. By moving the madvise_collapse writeback-retry logic into the helper function we can also avoid having to revalidate the VMA. We guard the khugepaged_pages_collapsed variable to ensure it is only incremented for khugepaged. As requested we also convert a VM_BUG_ON to a VM_WARN_ON. 
Link: https://lkml.kernel.org/r/20260325114022.444081-6-npache@redhat.com Signed-off-by: Nico Pache Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Lance Yang Reviewed-by: Baolin Wang Acked-by: David Hildenbrand (Arm) Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Barry Song Cc: Brendan Jackman Cc: Byungchul Park Cc: Catalin Marinas Cc: David Rientjes Cc: Dev Jain Cc: Gregory Price Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Joshua Hahn Cc: Kefeng Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Matthew Brost Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Nanyong Sun Cc: Pedro Falcato Cc: Peter Xu Cc: Rafael Aquini Cc: Rakie Kim Cc: Randy Dunlap Cc: Ryan Roberts Cc: Shivank Garg Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Takashi Iwai (SUSE) Cc: Thomas Hellström Cc: Usama Arif Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Wei Yang Cc: Will Deacon Cc: Yang Shi Cc: Zach O'Keefe Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 142 ++++++++++++++++++++++++------------------------ 1 file changed, 72 insertions(+), 70 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c99e33094963..d21348b85a59 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1250,7 +1250,7 @@ out_nolock: static enum scan_result collapse_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start_addr, - bool *mmap_locked, struct collapse_control *cc) + bool *lock_dropped, struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; @@ -1425,7 +1425,7 @@ out_unmap: result = collapse_huge_page(mm, start_addr, referenced, unmapped, cc); /* collapse_huge_page will return with the mmap_lock released */ - *mmap_locked = false; + *lock_dropped = true; } out: trace_mm_khugepaged_scan_pmd(mm, folio, referenced, @@ -2417,6 +2417,67 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm, 
return result; } +/* + * Try to collapse a single PMD starting at a PMD aligned addr, and return + * the results. + */ +static enum scan_result collapse_single_pmd(unsigned long addr, + struct vm_area_struct *vma, bool *lock_dropped, + struct collapse_control *cc) +{ + struct mm_struct *mm = vma->vm_mm; + bool triggered_wb = false; + enum scan_result result; + struct file *file; + pgoff_t pgoff; + + mmap_assert_locked(mm); + + if (vma_is_anonymous(vma)) { + result = collapse_scan_pmd(mm, vma, addr, lock_dropped, cc); + goto end; + } + + file = get_file(vma->vm_file); + pgoff = linear_page_index(vma, addr); + + mmap_read_unlock(mm); + *lock_dropped = true; +retry: + result = collapse_scan_file(mm, addr, file, pgoff, cc); + + /* + * For MADV_COLLAPSE, when encountering dirty pages, try to writeback, + * then retry the collapse one time. + */ + if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK && + !triggered_wb && mapping_can_writeback(file->f_mapping)) { + const loff_t lstart = (loff_t)pgoff << PAGE_SHIFT; + const loff_t lend = lstart + HPAGE_PMD_SIZE - 1; + + filemap_write_and_wait_range(file->f_mapping, lstart, lend); + triggered_wb = true; + goto retry; + } + fput(file); + + if (result == SCAN_PTE_MAPPED_HUGEPAGE) { + mmap_read_lock(mm); + if (collapse_test_exit_or_disable(mm)) + result = SCAN_ANY_PROCESS; + else + result = try_collapse_pte_mapped_thp(mm, addr, + !cc->is_khugepaged); + if (result == SCAN_PMD_MAPPED) + result = SCAN_SUCCEED; + mmap_read_unlock(mm); + } +end: + if (cc->is_khugepaged && result == SCAN_SUCCEED) + ++khugepaged_pages_collapsed; + return result; +} + static void collapse_scan_mm_slot(unsigned int progress_max, enum scan_result *result, struct collapse_control *cc) __releases(&khugepaged_mm_lock) @@ -2478,46 +2539,21 @@ static void collapse_scan_mm_slot(unsigned int progress_max, VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); while (khugepaged_scan.address < hend) { - bool mmap_locked = true; + bool lock_dropped = 
false; cond_resched(); if (unlikely(collapse_test_exit_or_disable(mm))) goto breakouterloop; - VM_BUG_ON(khugepaged_scan.address < hstart || + VM_WARN_ON_ONCE(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > hend); - if (!vma_is_anonymous(vma)) { - struct file *file = get_file(vma->vm_file); - pgoff_t pgoff = linear_page_index(vma, - khugepaged_scan.address); - - mmap_read_unlock(mm); - mmap_locked = false; - *result = collapse_scan_file(mm, - khugepaged_scan.address, file, pgoff, cc); - fput(file); - if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { - mmap_read_lock(mm); - if (collapse_test_exit_or_disable(mm)) - goto breakouterloop; - *result = try_collapse_pte_mapped_thp(mm, - khugepaged_scan.address, false); - if (*result == SCAN_PMD_MAPPED) - *result = SCAN_SUCCEED; - mmap_read_unlock(mm); - } - } else { - *result = collapse_scan_pmd(mm, vma, - khugepaged_scan.address, &mmap_locked, cc); - } - - if (*result == SCAN_SUCCEED) - ++khugepaged_pages_collapsed; + *result = collapse_single_pmd(khugepaged_scan.address, + vma, &lock_dropped, cc); /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; - if (!mmap_locked) + if (lock_dropped) /* * We released mmap_lock so break loop. 
Note * that we drop mmap_lock before all hugepage @@ -2792,7 +2828,6 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, unsigned long hstart, hend, addr; enum scan_result last_fail = SCAN_FAIL; int thps = 0; - bool mmap_locked = true; BUG_ON(vma->vm_start > start); BUG_ON(vma->vm_end < end); @@ -2814,13 +2849,11 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { enum scan_result result = SCAN_FAIL; - bool triggered_wb = false; -retry: - if (!mmap_locked) { + if (*lock_dropped) { cond_resched(); mmap_read_lock(mm); - mmap_locked = true; + *lock_dropped = false; result = hugepage_vma_revalidate(mm, addr, false, &vma, cc); if (result != SCAN_SUCCEED) { @@ -2830,45 +2863,14 @@ retry: hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); } - mmap_assert_locked(mm); - if (!vma_is_anonymous(vma)) { - struct file *file = get_file(vma->vm_file); - pgoff_t pgoff = linear_page_index(vma, addr); - mmap_read_unlock(mm); - mmap_locked = false; - *lock_dropped = true; - result = collapse_scan_file(mm, addr, file, pgoff, cc); + result = collapse_single_pmd(addr, vma, lock_dropped, cc); - if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb && - mapping_can_writeback(file->f_mapping)) { - loff_t lstart = (loff_t)pgoff << PAGE_SHIFT; - loff_t lend = lstart + HPAGE_PMD_SIZE - 1; - - filemap_write_and_wait_range(file->f_mapping, lstart, lend); - triggered_wb = true; - fput(file); - goto retry; - } - fput(file); - } else { - result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, cc); - } - if (!mmap_locked) - *lock_dropped = true; - -handle_result: switch (result) { case SCAN_SUCCEED: case SCAN_PMD_MAPPED: ++thps; break; - case SCAN_PTE_MAPPED_HUGEPAGE: - BUG_ON(mmap_locked); - mmap_read_lock(mm); - result = try_collapse_pte_mapped_thp(mm, addr, true); - mmap_read_unlock(mm); - goto handle_result; /* Whitelisted set of results where continuing OK */ case SCAN_NO_PTE_TABLE: case 
SCAN_PTE_NON_PRESENT: @@ -2891,7 +2893,7 @@ handle_result: out_maybelock: /* Caller expects us to hold mmap_lock on return */ - if (!mmap_locked) + if (*lock_dropped) mmap_read_lock(mm); out_nolock: mmap_assert_locked(mm); From f0f6f787143068b23c5808e7a63aef03601f1377 Mon Sep 17 00:00:00 2001 From: gao xu Date: Fri, 13 Mar 2026 02:41:14 +0000 Subject: [PATCH 256/369] zram: optimize LZ4 dictionary compression performance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calling `LZ4_loadDict()` repeatedly in Zram causes significant overhead due to its internal dictionary pre-processing. This commit introduces a template stream mechanism to pre-process the dictionary only once when the dictionary is initially set or modified. It then efficiently copies this state for subsequent compressions. Verification Test Items: Test Platform: android16-6.12 1. Collect Anonymous Page Dataset 1) Apply the following patch: static bool zram_meta_alloc(struct zram *zram, u64 disksize) if (!huge_class_size) - huge_class_size = zs_huge_class_size(zram->mem_pool); + huge_class_size = 0; 2)Install multiple apps and monkey testing until SwapFree is close to 0. 3)Execute the following command to export data: dd if=/dev/block/zram0 of=/data/samples/zram_dump.img bs=4K 2. Train Dictionary Since LZ4 does not have a dedicated dictionary training tool, the zstd tool can be used for training[1]. The command is as follows: zstd --train /data/samples/* --split=4096 --maxdict=64KB -o /vendor/etc/dict_data 3. 
Test Code adb shell "dd if=/data/samples/zram_dump.img of=/dev/test_pattern bs=4096 count=131072 conv=fsync" adb shell "swapoff /dev/block/zram0" adb shell "echo 1 > /sys/block/zram0/reset" adb shell "echo lz4 > /sys/block/zram0/comp_algorithm" adb shell "echo dict=/vendor/etc/dict_data > /sys/block/zram0/algorithm_params" adb shell "echo 6G > /sys/block/zram0/disksize" echo "Start Compression" adb shell "taskset 80 dd if=/dev/test_pattern of=/dev/block/zram0 bs=4096 count=131072 conv=fsync" echo. echo "Start Decompression" adb shell "taskset 80 dd if=/dev/block/zram0 of=/dev/output_result bs=4096 count=131072 conv=fsync" echo "mm_stat:" adb shell "cat /sys/block/zram0/mm_stat" echo. Note: To ensure stable test results, it is best to lock the CPU frequency before executing the test. LZ4 supports dictionaries up to 64KB. Below are the test results for compression rates at various dictionary sizes: dict_size base patch 4 KB 156M/s 219M/s 8 KB 136M/s 217M/s 16KB 98M/s 214M/s 32KB 66M/s 225M/s 64KB 38M/s 224M/s When an LZ4 compression dictionary is enabled, compression speed is negatively impacted by the dictionary's size; larger dictionaries result in slower compression. This patch eliminates the influence of dictionary size on compression speed, ensuring consistent performance regardless of dictionary scale. 
Link: https://lkml.kernel.org/r/698181478c9c4b10aa21b4a847bdc706@honor.com Link: https://github.com/lz4/lz4?tab=readme-ov-file [1] Signed-off-by: gao xu Acked-by: Sergey Senozhatsky Cc: Jens Axboe Cc: Minchan Kim Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- drivers/block/zram/backend_lz4.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/block/zram/backend_lz4.c b/drivers/block/zram/backend_lz4.c index 04e186614760..c449d511ba86 100644 --- a/drivers/block/zram/backend_lz4.c +++ b/drivers/block/zram/backend_lz4.c @@ -14,13 +14,38 @@ struct lz4_ctx { static void lz4_release_params(struct zcomp_params *params) { + LZ4_stream_t *dict_stream = params->drv_data; + + params->drv_data = NULL; + if (!dict_stream) + return; + + kfree(dict_stream); } static int lz4_setup_params(struct zcomp_params *params) { + LZ4_stream_t *dict_stream; + int ret; + if (params->level == ZCOMP_PARAM_NOT_SET) params->level = LZ4_ACCELERATION_DEFAULT; + if (!params->dict || !params->dict_sz) + return 0; + + dict_stream = kzalloc_obj(*dict_stream, GFP_KERNEL); + if (!dict_stream) + return -ENOMEM; + + ret = LZ4_loadDict(dict_stream, + params->dict, params->dict_sz); + if (ret != params->dict_sz) { + kfree(dict_stream); + return -EINVAL; + } + params->drv_data = dict_stream; + return 0; } @@ -79,9 +104,7 @@ static int lz4_compress(struct zcomp_params *params, struct zcomp_ctx *ctx, zctx->mem); } else { /* Cstrm needs to be reset */ - ret = LZ4_loadDict(zctx->cstrm, params->dict, params->dict_sz); - if (ret != params->dict_sz) - return -EINVAL; + memcpy(zctx->cstrm, params->drv_data, sizeof(*zctx->cstrm)); ret = LZ4_compress_fast_continue(zctx->cstrm, req->src, req->dst, req->src_len, req->dst_len, params->level); From bf989ade270d4ca65e73d5fc1ab5e4d2ef472e80 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 16 Mar 2026 10:53:32 +0900 Subject: [PATCH 257/369] zram: propagate read_from_bdev_async() errors When 
read_from_bdev_async() fails to chain bio, for instance fails to allocate request or bio, we need to propagate the error condition so that upper layer is aware of it. zram already does that by setting BLK_STS_IOERR ->bi_status, but only for sync reads. Change async read path to return its error status so that async errors are also handled. Link: https://lkml.kernel.org/r/20260316015354.114465-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Suggested-by: Brian Geffon Acked-by: Brian Geffon Cc: Minchan Kim Cc: Richard Chang Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index bab21b44bdcb..b96e40f9a9dd 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1429,21 +1429,21 @@ static void zram_async_read_endio(struct bio *bio) queue_work(system_highpri_wq, &req->work); } -static void read_from_bdev_async(struct zram *zram, struct page *page, - u32 index, unsigned long blk_idx, - struct bio *parent) +static int read_from_bdev_async(struct zram *zram, struct page *page, + u32 index, unsigned long blk_idx, + struct bio *parent) { struct zram_rb_req *req; struct bio *bio; req = kmalloc_obj(*req, GFP_NOIO); if (!req) - return; + return -ENOMEM; bio = bio_alloc(zram->bdev, 1, parent->bi_opf, GFP_NOIO); if (!bio) { kfree(req); - return; + return -ENOMEM; } req->zram = zram; @@ -1459,6 +1459,8 @@ static void read_from_bdev_async(struct zram *zram, struct page *page, __bio_add_page(bio, page, PAGE_SIZE, 0); bio_inc_remaining(parent); submit_bio(bio); + + return 0; } static void zram_sync_read(struct work_struct *w) @@ -1507,8 +1509,7 @@ static int read_from_bdev(struct zram *zram, struct page *page, u32 index, return -EIO; return read_from_bdev_sync(zram, page, index, blk_idx); } - read_from_bdev_async(zram, page, index, blk_idx, parent); - return 0; + return 
read_from_bdev_async(zram, page, index, blk_idx, parent); } #else static inline void reset_bdev(struct zram *zram) {}; From 4bdbddb4e482f283b60787b97fc06b3da3fb419c Mon Sep 17 00:00:00 2001 From: Liew Rui Yan Date: Sun, 15 Mar 2026 09:29:44 -0700 Subject: [PATCH 258/369] Docs/mm/damon: document exclusivity of special-purpose modules Add a section in design.rst to explain that DAMON special-purpose kernel modules (LRU_SORT, RECLAIM, STAT) run in an exclusive manner and return -EBUSY if another is already running. Update lru_sort.rst, reclaim.rst and stat.rst by adding cross-references to this exclusivity rule at the end of their respective Example sections. This change is motivated from another discussion [1]. Link: https://lkml.kernel.org/r/20260315162945.80994-1-sj@kernel.org Link: https://lore.kernel.org/damon/20260314002119.79742-1-sj@kernel.org/T/#t [1] Signed-off-by: Liew Rui Yan Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/lru_sort.rst | 5 +++++ Documentation/admin-guide/mm/damon/reclaim.rst | 5 +++++ Documentation/admin-guide/mm/damon/stat.rst | 5 +++++ Documentation/mm/damon/design.rst | 4 ++++ 4 files changed, 19 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/lru_sort.rst b/Documentation/admin-guide/mm/damon/lru_sort.rst index 73980bacc3a0..56690646cf39 100644 --- a/Documentation/admin-guide/mm/damon/lru_sort.rst +++ b/Documentation/admin-guide/mm/damon/lru_sort.rst @@ -351,3 +351,8 @@ the LRU-list based page granularity reclamation. :: # echo 400 > wmarks_mid # echo 200 > wmarks_low # echo Y > enabled + +Note that this module (damon_lru_sort) cannot run simultaneously with other +DAMON-based special-purpose modules. 
Refer to :ref:`DAMON design special +purpose modules exclusivity <damon_design_special_purpose_modules_exclusivity>` +for more details. diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index 8eba3da8dcee..442ac5c64795 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -318,6 +318,11 @@ granularity reclamation. :: # echo 200 > wmarks_low # echo Y > enabled +Note that this module (damon_reclaim) cannot run simultaneously with other +DAMON-based special-purpose modules. Refer to :ref:`DAMON design special +purpose modules exclusivity <damon_design_special_purpose_modules_exclusivity>` +for more details. + .. [1] https://research.google/pubs/pub48551/ .. [2] https://lwn.net/Articles/787611/ .. [3] https://www.kernel.org/doc/html/latest/mm/free_page_reporting.html diff --git a/Documentation/admin-guide/mm/damon/stat.rst b/Documentation/admin-guide/mm/damon/stat.rst index e5a5a2c4f803..c4b14daeb2dd 100644 --- a/Documentation/admin-guide/mm/damon/stat.rst +++ b/Documentation/admin-guide/mm/damon/stat.rst @@ -45,6 +45,11 @@ You can enable DAMON_STAT by setting the value of this parameter as ``Y``. Setting it as ``N`` disables DAMON_STAT. The default value is set by ``CONFIG_DAMON_STAT_ENABLED_DEFAULT`` build config option. +Note that this module (damon_stat) cannot run simultaneously with other +DAMON-based special-purpose modules. Refer to :ref:`DAMON design special +purpose modules exclusivity <damon_design_special_purpose_modules_exclusivity>` +for more details. + .. _damon_stat_aggr_interval_us: aggr_interval_us diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 29fff20b3c2a..dc37402c0fee 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -853,6 +853,10 @@ more detail, please read the usage documents for those (:doc:`/admin-guide/mm/damon/stat`, :doc:`/admin-guide/mm/damon/reclaim` and :doc:`/admin-guide/mm/damon/lru_sort`). +.. 
_damon_design_special_purpose_modules_exclusivity: + +Note that these modules currently run in an exclusive manner. If one of those +is already running, others will return ``-EBUSY`` upon start requests. Sample DAMON Modules -------------------- From cba82993308dc66403c5c3dd27712a58e6fe3aa8 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 17 Mar 2026 12:23:19 +0900 Subject: [PATCH 259/369] zram: change scan_slots to return void scan_slots_for_writeback() and scan_slots_for_recompress() work in a "best effort" fashion, if they cannot allocate memory for a new pp-slot candidate they just return and post-processing selects slots that were successfully scanned thus far. scan_slots functions never return errors and their callers never check the return status, so convert them to return void. Link: https://lkml.kernel.org/r/20260317032349.753645-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Reviewed-by: SeongJae Park Cc: Jens Axboe Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index b96e40f9a9dd..c2afd1c34f4a 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1196,9 +1196,9 @@ static int parse_mode(char *val, u32 *mode) return 0; } -static int scan_slots_for_writeback(struct zram *zram, u32 mode, - unsigned long lo, unsigned long hi, - struct zram_pp_ctl *ctl) +static void scan_slots_for_writeback(struct zram *zram, u32 mode, + unsigned long lo, unsigned long hi, + struct zram_pp_ctl *ctl) { u32 index = lo; @@ -1230,8 +1230,6 @@ next: break; index++; } - - return 0; } static ssize_t writeback_store(struct device *dev, @@ -2368,8 +2366,8 @@ static bool highest_priority_algorithm(struct zram *zram, u32 prio) return true; } -static int scan_slots_for_recompress(struct zram *zram, u32 mode, u32 prio, - struct zram_pp_ctl *ctl) +static void 
scan_slots_for_recompress(struct zram *zram, u32 mode, u32 prio, + struct zram_pp_ctl *ctl) { unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; unsigned long index; @@ -2404,8 +2402,6 @@ next: if (!ok) break; } - - return 0; } /* From 01494f713ee315e0aa2ddfe2f2760fd83c417e35 Mon Sep 17 00:00:00 2001 From: Asier Gutierrez Date: Tue, 17 Mar 2026 20:53:47 -0700 Subject: [PATCH 260/369] Docs/mm/damon/design: document DAMON actions when TRANSPARENT_HUGEPAGE is off MADV_HUGEPAGE and MADV_NOHUGEPAGE are guarded and they are not available when compiling the kernel without TRANSPARENT_HUGEPAGE option. The DAMON behaviour is to silently fail [1] in when DAMOS_HUGEPAGE or DAMOS_NOHUGEPAGE are used, but TRANSPARENT_HUGEPAGE is disabled. Update the DAMON documentation to reflect this behaviour. Link: https://lkml.kernel.org/r/20260318035349.88715-1-sj@kernel.org Link: https://lore.kernel.org/66131775-180b-4b9f-b7ce-61a3e077b6e6@huawei-partners.com/ [1] Signed-off-by: Asier Gutierrez Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index dc37402c0fee..838b14d22519 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -460,9 +460,13 @@ that supports each action are as below. - ``pageout``: Reclaim the region. Supported by ``vaddr``, ``fvaddr`` and ``paddr`` operations set. - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. - Supported by ``vaddr`` and ``fvaddr`` operations set. + Supported by ``vaddr`` and ``fvaddr`` operations set. When + TRANSPARENT_HUGEPAGE is disabled, the application of the action will just + fail. - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``. - Supported by ``vaddr`` and ``fvaddr`` operations set. 
+ Supported by ``vaddr`` and ``fvaddr`` operations set. When + TRANSPARENT_HUGEPAGE is disabled, the application of the action will just + fail. - ``lru_prio``: Prioritize the region on its LRU lists. Supported by ``paddr`` operations set. - ``lru_deprio``: Deprioritize the region on its LRU lists. From 42561b341baae400ebd28540cdda7a6295eaf2a3 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Wed, 18 Mar 2026 12:38:49 +0800 Subject: [PATCH 261/369] mm/swapfile: remove duplicate include of swap_table.h Remove duplicate inclusion of swap_table.h in swapfile.c to clean up redundant code. Link: https://lkml.kernel.org/r/20260318043849.399266-1-nichen@iscas.ac.cn Signed-off-by: Chen Ni Reviewed-by: SeongJae Park Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Kairui Song Cc: Kemeng Shi Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swapfile.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 71a7d6959f3e..802332850e24 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -48,7 +48,6 @@ #include #include "swap_table.h" #include "internal.h" -#include "swap_table.h" #include "swap.h" static void swap_range_alloc(struct swap_info_struct *si, From 89e69c7d187eca8a040d4db0e080f79b5b32163d Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:33 +0100 Subject: [PATCH 262/369] mm/memory_hotplug: fix possible race in scan_movable_pages() Patch series "mm: memory hot(un)plug and SPARSEMEM cleanups", v2. Some cleanups around memory hot(un)plug and SPARSEMEM. In essence, we can limit CONFIG_MEMORY_HOTPLUG to CONFIG_SPARSEMEM_VMEMMAP, remove some dead code, and move all the hotplug bits over to mm/sparse-vmemmap.c. Some further/related cleanups around other unnecessary code (memory hole handling and complicated usemap allocation). I have some further sparse.c cleanups lying around, and I'm planning on getting rid of bootmem_info.c entirely. 
This patch (of 15): If a hugetlb folio gets freed while we are in scan_movable_pages(), folio_nr_pages() could return 0, resulting in or'ing "0 - 1 = -1" to the PFN, resulting in PFN = -1. We're not holding any locks or references that would prevent that. for_each_valid_pfn() would then search for the next valid PFN, and could return a PFN that is outside of the range of the original requested range. do_migrate_page() would then try to migrate quite a big range, which is certainly undesirable. To fix it, simply test for valid folio_nr_pages() values. While at it, as PageHuge() really just does a page_folio() internally, we can just use folio_test_hugetlb() on the folio directly. scan_movable_pages() is expected to be fast, and we try to avoid taking locks or grabbing references. We cannot use folio_try_get() as that does not work for free hugetlb folios. We could grab the hugetlb_lock, but that just adds complexity. The race is unlikely to trigger in practice, so we won't be CCing stable. 
Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-0-096addc8800d@kernel.org Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-1-096addc8800d@kernel.org Fixes: 16540dae959d ("mm/hugetlb: mm/memory_hotplug: use a folio in scan_movable_pages()") Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a602310bdf33..c427967c78bb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1746,6 +1746,7 @@ static int scan_movable_pages(unsigned long start, unsigned long end, unsigned long pfn; for_each_valid_pfn(pfn, start, end) { + unsigned long nr_pages; struct page *page; struct folio *folio; @@ -1762,9 +1763,9 @@ static int scan_movable_pages(unsigned long start, unsigned long end, if (PageOffline(page) && page_count(page)) return -EBUSY; - if (!PageHuge(page)) - continue; folio = page_folio(page); + if (!folio_test_hugetlb(folio)) + continue; /* * This test is racy as we hold no reference or lock. 
The * hugetlb page could have been free'ed and head is no longer @@ -1774,7 +1775,11 @@ static int scan_movable_pages(unsigned long start, unsigned long end, */ if (folio_test_hugetlb_migratable(folio)) goto found; - pfn |= folio_nr_pages(folio) - 1; + nr_pages = folio_nr_pages(folio); + if (unlikely(nr_pages < 1 || nr_pages > MAX_FOLIO_NR_PAGES || + !is_power_of_2(nr_pages))) + continue; + pfn |= nr_pages - 1; } return -ENOENT; found: From 9d80de66a04606eef625cb9141b6d1d8c970dbcb Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:34 +0100 Subject: [PATCH 263/369] mm/memory_hotplug: remove for_each_valid_pfn() usage When offlining memory, we know that the memory range has no holes. Checking for valid pfns is not required. Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-2-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c427967c78bb..504aa50e3c33 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1745,7 +1745,7 @@ static int scan_movable_pages(unsigned long start, unsigned long end, { unsigned long pfn; - for_each_valid_pfn(pfn, start, end) { + for (pfn = start; pfn < end; pfn++) { unsigned long nr_pages; struct page *page; struct folio *folio; @@ -1795,7 +1795,7 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - for_each_valid_pfn(pfn, start_pfn, end_pfn) { + for (pfn = start_pfn; pfn < end_pfn; pfn++) { struct page *page; page = pfn_to_page(pfn); From 
e66383b6746d226757f1db94633ca0d6c70d7c58 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:35 +0100 Subject: [PATCH 264/369] mm/sparse: remove WARN_ONs from (online|offline)_mem_sections() We do not allow offlining of memory with memory holes, and always hotplug memory without holes. Consequently, we cannot end up onlining or offlining memory sections that have holes (including invalid sections). That's also why these WARN_ONs never fired. Let's remove the WARN_ONs along with the TODO regarding double-checking. Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-3-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/sparse.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index dfabe554adf8..93252112860e 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -638,13 +638,8 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { unsigned long section_nr = pfn_to_section_nr(pfn); - struct mem_section *ms; + struct mem_section *ms = __nr_to_section(section_nr); - /* onlining code should never touch invalid ranges */ - if (WARN_ON(!valid_section_nr(section_nr))) - continue; - - ms = __nr_to_section(section_nr); ms->section_mem_map |= SECTION_IS_ONLINE; } } @@ -656,16 +651,8 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { unsigned long section_nr = pfn_to_section_nr(pfn); - struct mem_section *ms; + struct mem_section *ms = __nr_to_section(section_nr); - /* - * TODO this needs some double checking. 
Offlining code makes - * sure to check pfn_valid but those checks might be just bogus - */ - if (WARN_ON(!valid_section_nr(section_nr))) - continue; - - ms = __nr_to_section(section_nr); ms->section_mem_map &= ~SECTION_IS_ONLINE; } } From fb3c3f5d27ef7c8845e3d0ac43c692216077602c Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:36 +0100 Subject: [PATCH 265/369] mm/Kconfig: make CONFIG_MEMORY_HOTPLUG depend on CONFIG_SPARSEMEM_VMEMMAP Ever since commit f8f03eb5f0f9 ("mm: stop making SPARSEMEM_VMEMMAP user-selectable"), an architecture that supports CONFIG_SPARSEMEM_VMEMMAP (by selecting SPARSEMEM_VMEMMAP_ENABLE) can no longer enable CONFIG_SPARSEMEM without CONFIG_SPARSEMEM_VMEMMAP. Right now, CONFIG_MEMORY_HOTPLUG is guarded by CONFIG_SPARSEMEM. However, CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG is only enabled by * arm64: which selects SPARSEMEM_VMEMMAP_ENABLE * loongarch: which selects SPARSEMEM_VMEMMAP_ENABLE * powerpc (64bit): which selects SPARSEMEM_VMEMMAP_ENABLE * riscv (64bit): which selects SPARSEMEM_VMEMMAP_ENABLE * s390 with SPARSEMEM: which selects SPARSEMEM_VMEMMAP_ENABLE * x86 (64bit): which selects SPARSEMEM_VMEMMAP_ENABLE So, we can make CONFIG_MEMORY_HOTPLUG depend on CONFIG_SPARSEMEM_VMEMMAP without affecting any setups. 
Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-4-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index ebd8ea353687..c012944938a7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -472,7 +472,7 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE menuconfig MEMORY_HOTPLUG bool "Memory hotplug" select MEMORY_ISOLATION - depends on SPARSEMEM + depends on SPARSEMEM_VMEMMAP depends on ARCH_ENABLE_MEMORY_HOTPLUG depends on 64BIT select NUMA_KEEP_MEMINFO if NUMA From 62257a5fb987cc6d092850ce6b70ec95a79a9442 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:37 +0100 Subject: [PATCH 266/369] mm/memory_hotplug: simplify check_pfn_span() We now always have CONFIG_SPARSEMEM_VMEMMAP, so remove the dead code. 
Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-5-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 504aa50e3c33..8b18ddd1e7d5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -319,21 +319,13 @@ static void release_memory_resource(struct resource *res) static int check_pfn_span(unsigned long pfn, unsigned long nr_pages) { /* - * Disallow all operations smaller than a sub-section and only - * allow operations smaller than a section for - * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range() - * enforces a larger memory_block_size_bytes() granularity for - * memory that will be marked online, so this check should only - * fire for direct arch_{add,remove}_memory() users outside of - * add_memory_resource(). + * Disallow all operations smaller than a sub-section. + * Note that check_hotplug_memory_range() enforces a larger + * memory_block_size_bytes() granularity for memory that will be marked + * online, so this check should only fire for direct + * arch_{add,remove}_memory() users outside of add_memory_resource(). 
*/ - unsigned long min_align; - - if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) - min_align = PAGES_PER_SUBSECTION; - else - min_align = PAGES_PER_SECTION; - if (!IS_ALIGNED(pfn | nr_pages, min_align)) + if (!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SUBSECTION)) return -EINVAL; return 0; } From 119c31caa59e84931cca713a00a58be61bfa7baa Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:38 +0100 Subject: [PATCH 267/369] mm/sparse: remove !CONFIG_SPARSEMEM_VMEMMAP leftovers for CONFIG_MEMORY_HOTPLUG CONFIG_MEMORY_HOTPLUG now depends on CONFIG_SPARSEMEM_VMEMMAP. So let's remove the !CONFIG_SPARSEMEM_VMEMMAP leftovers that are dead code. Adjust the comment above fill_subsection_map() accordingly. Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-6-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/sparse.c | 69 ++--------------------------------------------------- 1 file changed, 2 insertions(+), 67 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index 93252112860e..875f718a4c79 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -657,7 +657,6 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) } } -#ifdef CONFIG_SPARSEMEM_VMEMMAP static struct page * __meminit populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap) @@ -729,73 +728,11 @@ static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) return rc; } -#else -static struct page * __meminit populate_section_memmap(unsigned long pfn, - unsigned long nr_pages, int nid, struct vmem_altmap *altmap, - struct dev_pagemap *pgmap) -{ - return kvmalloc_node(array_size(sizeof(struct page), - 
PAGES_PER_SECTION), GFP_KERNEL, nid); -} - -static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) -{ - kvfree(pfn_to_page(pfn)); -} - -static void free_map_bootmem(struct page *memmap) -{ - unsigned long maps_section_nr, removing_section_nr, i; - unsigned long type, nr_pages; - struct page *page = virt_to_page(memmap); - - nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) - >> PAGE_SHIFT; - - for (i = 0; i < nr_pages; i++, page++) { - type = bootmem_type(page); - - BUG_ON(type == NODE_INFO); - - maps_section_nr = pfn_to_section_nr(page_to_pfn(page)); - removing_section_nr = bootmem_info(page); - - /* - * When this function is called, the removing section is - * logical offlined state. This means all pages are isolated - * from page allocator. If removing section's memmap is placed - * on the same section, it must not be freed. - * If it is freed, page allocator may allocate it which will - * be removed physically soon. - */ - if (maps_section_nr != removing_section_nr) - put_page_bootmem(page); - } -} - -static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages) -{ - return 0; -} - -static bool is_subsection_map_empty(struct mem_section *ms) -{ - return true; -} - -static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) -{ - return 0; -} -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ /* - * To deactivate a memory region, there are 3 cases to handle across - * two configurations (SPARSEMEM_VMEMMAP={y,n}): + * To deactivate a memory region, there are 3 cases to handle: * - * 1. deactivation of a partial hot-added section (only possible in - * the SPARSEMEM_VMEMMAP=y case). + * 1. deactivation of a partial hot-added section: * a) section was present at memory init. * b) section was hot-added post memory init. * 2. deactivation of a complete hot-added section. 
@@ -803,8 +740,6 @@ static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) * * For 1, when subsection_map does not empty we will not be freeing the * usage map, but still need to free the vmemmap range. - * - * For 2 and 3, the SPARSEMEM_VMEMMAP={y,n} cases are unified */ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) From 41293414433a0d033ef1b7f95441e347c8f513c3 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:39 +0100 Subject: [PATCH 268/369] mm/bootmem_info: remove handling for !CONFIG_SPARSEMEM_VMEMMAP It is not immediately obvious that CONFIG_HAVE_BOOTMEM_INFO_NODE is only selected from CONFIG_MEMORY_HOTREMOVE, which itself depends on CONFIG_MEMORY_HOTPLUG that ... depends on CONFIG_SPARSEMEM_VMEMMAP. Let's remove the !CONFIG_SPARSEMEM_VMEMMAP leftovers that are dead code. Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-7-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/bootmem_info.c | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index b0e2a9fa641f..e61e08e24924 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -40,42 +40,6 @@ void put_page_bootmem(struct page *page) } } -#ifndef CONFIG_SPARSEMEM_VMEMMAP -static void __init register_page_bootmem_info_section(unsigned long start_pfn) -{ - unsigned long mapsize, section_nr, i; - struct mem_section *ms; - struct page *page, *memmap; - struct mem_section_usage *usage; - - section_nr = pfn_to_section_nr(start_pfn); - ms = __nr_to_section(section_nr); - - /* Get section's memmap address */ - memmap = 
sparse_decode_mem_map(ms->section_mem_map, section_nr); - - /* - * Get page for the memmap's phys address - * XXX: need more consideration for sparse_vmemmap... - */ - page = virt_to_page(memmap); - mapsize = sizeof(struct page) * PAGES_PER_SECTION; - mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; - - /* remember memmap's page */ - for (i = 0; i < mapsize; i++, page++) - get_page_bootmem(section_nr, page, SECTION_INFO); - - usage = ms->usage; - page = virt_to_page(usage); - - mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; - - for (i = 0; i < mapsize; i++, page++) - get_page_bootmem(section_nr, page, MIX_SECTION_INFO); - -} -#else /* CONFIG_SPARSEMEM_VMEMMAP */ static void __init register_page_bootmem_info_section(unsigned long start_pfn) { unsigned long mapsize, section_nr, i; @@ -100,7 +64,6 @@ static void __init register_page_bootmem_info_section(unsigned long start_pfn) for (i = 0; i < mapsize; i++, page++) get_page_bootmem(section_nr, page, MIX_SECTION_INFO); } -#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) { From 7f8e592bb3271ea057e84dcc480feb962ec4f161 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:40 +0100 Subject: [PATCH 269/369] mm/bootmem_info: avoid using sparse_decode_mem_map() With SPARSEMEM_VMEMMAP, we can just do a pfn_to_page(). It is not super clear whether the start_pfn is properly aligned ... so let's just make sure it is properly aligned to the start of the section. We will soon might try to remove the bootmem info completely, for now, just keep it working as is. 
Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-8-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/bootmem_info.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index e61e08e24924..3d7675a3ae04 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -44,17 +44,16 @@ static void __init register_page_bootmem_info_section(unsigned long start_pfn) { unsigned long mapsize, section_nr, i; struct mem_section *ms; - struct page *page, *memmap; struct mem_section_usage *usage; + struct page *page; + start_pfn = SECTION_ALIGN_DOWN(start_pfn); section_nr = pfn_to_section_nr(start_pfn); ms = __nr_to_section(section_nr); - memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); - if (!preinited_vmemmap_section(ms)) - register_page_bootmem_memmap(section_nr, memmap, - PAGES_PER_SECTION); + register_page_bootmem_memmap(section_nr, pfn_to_page(start_pfn), + PAGES_PER_SECTION); usage = ms->usage; page = virt_to_page(usage); From 22688ade3b54b2f4f2887c7dad75db6d588ae07c Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:41 +0100 Subject: [PATCH 270/369] mm/sparse: remove sparse_decode_mem_map() section_deactivate() applies to CONFIG_SPARSEMEM_VMEMMAP only. So we can just use pfn_to_page() (after making sure we have the start PFN of the section), and remove sparse_decode_mem_map(). 
Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-9-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 2 -- mm/sparse.c | 16 +--------------- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index e77ef3d7ff73..815e908c4135 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -308,8 +308,6 @@ extern int sparse_add_section(int nid, unsigned long pfn, struct dev_pagemap *pgmap); extern void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap); -extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, - unsigned long pnum); extern struct zone *zone_for_pfn_range(enum mmop online_type, int nid, struct memory_group *group, unsigned long start_pfn, unsigned long nr_pages); diff --git a/mm/sparse.c b/mm/sparse.c index 875f718a4c79..b5825c9ee2f2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -274,18 +274,6 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p return coded_mem_map; } -#ifdef CONFIG_MEMORY_HOTPLUG -/* - * Decode mem_map from the coded memmap - */ -struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum) -{ - /* mask off the extra low bits of information */ - coded_mem_map &= SECTION_MAP_MASK; - return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); -} -#endif /* CONFIG_MEMORY_HOTPLUG */ - static void __meminit sparse_init_one_section(struct mem_section *ms, unsigned long pnum, struct page *mem_map, struct mem_section_usage *usage, unsigned long flags) @@ -754,8 +742,6 @@ static void section_deactivate(unsigned 
long pfn, unsigned long nr_pages, empty = is_subsection_map_empty(ms); if (empty) { - unsigned long section_nr = pfn_to_section_nr(pfn); - /* * Mark the section invalid so that valid_section() * return false. This prevents code from dereferencing @@ -774,7 +760,7 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, kfree_rcu(ms->usage, rcu); WRITE_ONCE(ms->usage, NULL); } - memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + memmap = pfn_to_page(SECTION_ALIGN_DOWN(pfn)); } /* From dac89b150bdb32fd276a3a22fa66c481dab47dea Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:42 +0100 Subject: [PATCH 271/369] mm/sparse: remove CONFIG_MEMORY_HOTPLUG-specific usemap allocation handling In 2008, we added through commit 48c906823f39 ("memory hotplug: allocate usemap on the section with pgdat") quite some complexity to try allocating memory for the "usemap" (storing pageblock information per memory section) for a memory section close to the memory of the "pgdat" of the node. The goal was to make memory hotunplug of boot memory more likely to succeed. That commit also added some checks for circular dependencies between two memory sections, whereby two memory sections would contain each others usemap, turning both boot memory sections un-removable. However, in 2010, commit a4322e1bad91 ("sparsemem: Put usemap for one node together") started allocating the usemap for multiple memory sections on the same node in one chunk, effectively grouping all usemap allocations of the same node in a single memblock allocation. We don't really give guarantees about memory hotunplug of boot memory, and with the change in 2010, it is impossible in practice to get any circular dependencies. So let's simply remove this complexity. 
Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-10-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/sparse.c | 100 +--------------------------------------------------- 1 file changed, 1 insertion(+), 99 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index b5825c9ee2f2..e2048b1fbf5f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -294,102 +294,6 @@ size_t mem_section_usage_size(void) return sizeof(struct mem_section_usage) + usemap_size(); } -#ifdef CONFIG_MEMORY_HOTREMOVE -static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat) -{ -#ifndef CONFIG_NUMA - VM_BUG_ON(pgdat != &contig_page_data); - return __pa_symbol(&contig_page_data); -#else - return __pa(pgdat); -#endif -} - -static struct mem_section_usage * __init -sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, - unsigned long size) -{ - struct mem_section_usage *usage; - unsigned long goal, limit; - int nid; - /* - * A page may contain usemaps for other sections preventing the - * page being freed and making a section unremovable while - * other sections referencing the usemap remain active. Similarly, - * a pgdat can prevent a section being removed. If section A - * contains a pgdat and section B contains the usemap, both - * sections become inter-dependent. This allocates usemaps - * from the same section as the pgdat where possible to avoid - * this problem. 
- */ - goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); - limit = goal + (1UL << PA_SECTION_SHIFT); - nid = early_pfn_to_nid(goal >> PAGE_SHIFT); -again: - usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid); - if (!usage && limit) { - limit = MEMBLOCK_ALLOC_ACCESSIBLE; - goto again; - } - return usage; -} - -static void __init check_usemap_section_nr(int nid, - struct mem_section_usage *usage) -{ - unsigned long usemap_snr, pgdat_snr; - static unsigned long old_usemap_snr; - static unsigned long old_pgdat_snr; - struct pglist_data *pgdat = NODE_DATA(nid); - int usemap_nid; - - /* First call */ - if (!old_usemap_snr) { - old_usemap_snr = NR_MEM_SECTIONS; - old_pgdat_snr = NR_MEM_SECTIONS; - } - - usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT); - pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT); - if (usemap_snr == pgdat_snr) - return; - - if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr) - /* skip redundant message */ - return; - - old_usemap_snr = usemap_snr; - old_pgdat_snr = pgdat_snr; - - usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr)); - if (usemap_nid != nid) { - pr_info("node %d must be removed before remove section %ld\n", - nid, usemap_snr); - return; - } - /* - * There is a circular dependency. - * Some platforms allow un-removable section because they will just - * gather other removable sections for dynamic partitioning. - * Just notify un-removable section's number here. 
- */ - pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n", - usemap_snr, pgdat_snr, nid); -} -#else -static struct mem_section_usage * __init -sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, - unsigned long size) -{ - return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id); -} - -static void __init check_usemap_section_nr(int nid, - struct mem_section_usage *usage) -{ -} -#endif /* CONFIG_MEMORY_HOTREMOVE */ - #ifdef CONFIG_SPARSEMEM_VMEMMAP unsigned long __init section_map_size(void) { @@ -486,7 +390,6 @@ void __init sparse_init_early_section(int nid, struct page *map, unsigned long pnum, unsigned long flags) { BUG_ON(!sparse_usagebuf || sparse_usagebuf >= sparse_usagebuf_end); - check_usemap_section_nr(nid, sparse_usagebuf); sparse_init_one_section(__nr_to_section(pnum), pnum, map, sparse_usagebuf, SECTION_IS_EARLY | flags); sparse_usagebuf = (void *)sparse_usagebuf + mem_section_usage_size(); @@ -497,8 +400,7 @@ static int __init sparse_usage_init(int nid, unsigned long map_count) unsigned long size; size = mem_section_usage_size() * map_count; - sparse_usagebuf = sparse_early_usemaps_alloc_pgdat_section( - NODE_DATA(nid), size); + sparse_usagebuf = memblock_alloc_node(size, SMP_CACHE_BYTES, nid); if (!sparse_usagebuf) { sparse_usagebuf_end = NULL; return -ENOMEM; From fead6dcff83b02f8d6dc3c1ebbe4e09c05c54ee5 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:43 +0100 Subject: [PATCH 272/369] mm: prepare to move subsection_map_init() to mm/sparse-vmemmap.c We want to move subsection_map_init() to mm/sparse-vmemmap.c. To prepare for getting rid of subsection_map_init() in mm/sparse.c completely, use a static inline function for !CONFIG_SPARSEMEM_VMEMMAP. While at it, move the declaration to internal.h and rename it to "sparse_init_subsection_map()". 
Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-11-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 3 --- mm/internal.h | 12 ++++++++++++ mm/mm_init.c | 2 +- mm/sparse.c | 6 +----- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3f651baf7e2b..7cf4a194aea2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1982,8 +1982,6 @@ struct mem_section_usage { unsigned long pageblock_flags[0]; }; -void subsection_map_init(unsigned long pfn, unsigned long nr_pages); - struct page; struct page_ext; struct mem_section { @@ -2376,7 +2374,6 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr) #define sparse_vmemmap_init_nid_early(_nid) do {} while (0) #define sparse_vmemmap_init_nid_late(_nid) do {} while (0) #define pfn_in_present_section pfn_valid -#define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ /* diff --git a/mm/internal.h b/mm/internal.h index 62d80fd37ae1..11b0c91b6d9d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -959,12 +959,24 @@ void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int, bool); +/* + * mm/sparse.c + */ #ifdef CONFIG_SPARSEMEM void sparse_init(void); #else static inline void sparse_init(void) {} #endif /* CONFIG_SPARSEMEM */ +#ifdef CONFIG_SPARSEMEM_VMEMMAP +void sparse_init_subsection_map(unsigned long pfn, unsigned long nr_pages); +#else +static inline void sparse_init_subsection_map(unsigned long pfn, + unsigned long nr_pages) +{ +} +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + #if defined 
CONFIG_COMPACTION || defined CONFIG_CMA /* diff --git a/mm/mm_init.c b/mm/mm_init.c index 5b261f86ba6f..4324b93ccebd 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1896,7 +1896,7 @@ static void __init free_area_init(void) pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); - subsection_map_init(start_pfn, end_pfn - start_pfn); + sparse_init_subsection_map(start_pfn, end_pfn - start_pfn); } /* Initialise every node */ diff --git a/mm/sparse.c b/mm/sparse.c index e2048b1fbf5f..c96ac5e70c22 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -185,7 +185,7 @@ static void subsection_mask_set(unsigned long *map, unsigned long pfn, bitmap_set(map, idx, end - idx + 1); } -void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages) +void __init sparse_init_subsection_map(unsigned long pfn, unsigned long nr_pages) { int end_sec_nr = pfn_to_section_nr(pfn + nr_pages - 1); unsigned long nr, start_sec_nr = pfn_to_section_nr(pfn); @@ -207,10 +207,6 @@ void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages) nr_pages -= pfns; } } -#else -void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages) -{ -} #endif /* Record a memory area against a node. */ From b551ed94d959900996364c810c5f96e89640b200 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:44 +0100 Subject: [PATCH 273/369] mm/sparse: drop set_section_nid() from sparse_add_section() CONFIG_MEMORY_HOTPLUG is CONFIG_SPARSEMEM_VMEMMAP-only. And CONFIG_SPARSEMEM_VMEMMAP implies that NODE_NOT_IN_PAGE_FLAGS cannot be set: see include/linux/page-flags-layout.h ... #elif defined(CONFIG_SPARSEMEM_VMEMMAP) #error "Vmemmap: No space for nodes field in page flags" ... Which implies that the node is always stored in page flags and NODE_NOT_IN_PAGE_FLAGS cannot be set. Therefore, set_section_nid() is a NOP on CONFIG_SPARSEMEM_VMEMMAP. 
So let's remove the set_section_nid() call to prepare for moving CONFIG_MEMORY_HOTPLUG to mm/sparse-vmemmap.c Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-12-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/sparse.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index c96ac5e70c22..5c9cad390282 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -765,7 +765,6 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, page_init_poison(memmap, sizeof(struct page) * nr_pages); ms = __nr_to_section(section_nr); - set_section_nid(section_nr, nid); __section_mark_present(ms, section_nr); /* Align memmap to section boundary in the subsection case */ From f62a3bf227c95a105fccb5a2062367387cd49430 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:45 +0100 Subject: [PATCH 274/369] mm/sparse: move sparse_init_one_section() to internal.h While at it, convert the BUG_ON to a VM_WARN_ON_ONCE, avoid long lines, and merge sparse_encode_mem_map() into its only caller sparse_init_one_section(). Clarify the comment a bit, pointing at page_to_pfn(). 
[david@kernel.org: s/VM_WARN_ON/VM_WARN_ON_ONCE/] Link: https://lkml.kernel.org/r/6b04c1a1-74e7-42e8-8523-a40802e5dacc@kernel.org Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-13-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 +- mm/internal.h | 22 ++++++++++++++++++++++ mm/sparse.c | 24 ------------------------ 3 files changed, 23 insertions(+), 25 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7cf4a194aea2..ed335567d64e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1988,7 +1988,7 @@ struct mem_section { /* * This is, logically, a pointer to an array of struct * pages. However, it is stored with some other magic. - * (see sparse.c::sparse_init_one_section()) + * (see sparse_init_one_section()) * * Additionally during early boot we encode node id of * the location of the section here to guide allocation. diff --git a/mm/internal.h b/mm/internal.h index 11b0c91b6d9d..e14f58527688 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -964,6 +964,28 @@ void memmap_init_range(unsigned long, int, unsigned long, unsigned long, */ #ifdef CONFIG_SPARSEMEM void sparse_init(void); + +static inline void sparse_init_one_section(struct mem_section *ms, + unsigned long pnum, struct page *mem_map, + struct mem_section_usage *usage, unsigned long flags) +{ + unsigned long coded_mem_map; + + BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT); + + /* + * We encode the start PFN of the section into the mem_map such that + * page_to_pfn() on !CONFIG_SPARSEMEM_VMEMMAP can simply subtract it + * from the page pointer to obtain the PFN. 
+ */ + coded_mem_map = (unsigned long)(mem_map - section_nr_to_pfn(pnum)); + VM_WARN_ON_ONCE(coded_mem_map & ~SECTION_MAP_MASK); + + ms->section_mem_map &= ~SECTION_MAP_MASK; + ms->section_mem_map |= coded_mem_map; + ms->section_mem_map |= flags | SECTION_HAS_MEM_MAP; + ms->usage = usage; +} #else static inline void sparse_init(void) {} #endif /* CONFIG_SPARSEMEM */ diff --git a/mm/sparse.c b/mm/sparse.c index 5c9cad390282..ed5de1a25f04 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -256,30 +256,6 @@ static void __init memblocks_present(void) memory_present(nid, start, end); } -/* - * Subtle, we encode the real pfn into the mem_map such that - * the identity pfn - section_mem_map will return the actual - * physical page frame number. - */ -static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum) -{ - unsigned long coded_mem_map = - (unsigned long)(mem_map - (section_nr_to_pfn(pnum))); - BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT); - BUG_ON(coded_mem_map & ~SECTION_MAP_MASK); - return coded_mem_map; -} - -static void __meminit sparse_init_one_section(struct mem_section *ms, - unsigned long pnum, struct page *mem_map, - struct mem_section_usage *usage, unsigned long flags) -{ - ms->section_mem_map &= ~SECTION_MAP_MASK; - ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) - | SECTION_HAS_MEM_MAP | flags; - ms->usage = usage; -} - static unsigned long usemap_size(void) { return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long); From 08e5f77c37206da5a8340afdbf23b61e722c0ed3 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:46 +0100 Subject: [PATCH 275/369] mm/sparse: move __section_mark_present() to internal.h Let's prepare for moving memory hotplug handling from sparse.c to sparse-vmemmap.c by moving __section_mark_present() to internal.h. 
Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-14-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/internal.h | 9 +++++++++ mm/sparse.c | 8 -------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index e14f58527688..4e753bbf00ae 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -986,6 +986,15 @@ static inline void sparse_init_one_section(struct mem_section *ms, ms->section_mem_map |= flags | SECTION_HAS_MEM_MAP; ms->usage = usage; } + +static inline void __section_mark_present(struct mem_section *ms, + unsigned long section_nr) +{ + if (section_nr > __highest_present_section_nr) + __highest_present_section_nr = section_nr; + + ms->section_mem_map |= SECTION_MARKED_PRESENT; +} #else static inline void sparse_init(void) {} #endif /* CONFIG_SPARSEMEM */ diff --git a/mm/sparse.c b/mm/sparse.c index ed5de1a25f04..ecd4c41c0ff0 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -161,14 +161,6 @@ static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, * those loops early. */ unsigned long __highest_present_section_nr; -static void __section_mark_present(struct mem_section *ms, - unsigned long section_nr) -{ - if (section_nr > __highest_present_section_nr) - __highest_present_section_nr = section_nr; - - ms->section_mem_map |= SECTION_MARKED_PRESENT; -} static inline unsigned long first_present_section_nr(void) { From 738de20c4fafe64290c5086d683254f60e837db6 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Fri, 20 Mar 2026 23:13:47 +0100 Subject: [PATCH 276/369] mm/sparse: move memory hotplug bits to sparse-vmemmap.c Let's move all memory hotplug related code to sparse-vmemmap.c. 
We only have to expose sparse_index_init(). While at it, drop the definition of sparse_index_init() for !CONFIG_SPARSEMEM, which is unused, and place the declaration in internal.h. Link: https://lkml.kernel.org/r/20260320-sparsemem_cleanups-v2-15-096addc8800d@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Mike Rapoport (Microsoft) Cc: Axel Rasmussen Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 - mm/internal.h | 4 + mm/sparse-vmemmap.c | 304 ++++++++++++++++++++++++++++++++++++++++ mm/sparse.c | 310 +---------------------------------------- 4 files changed, 310 insertions(+), 309 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ed335567d64e..4a20df132258 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -2370,7 +2370,6 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr) #endif #else -#define sparse_index_init(_sec, _nid) do {} while (0) #define sparse_vmemmap_init_nid_early(_nid) do {} while (0) #define sparse_vmemmap_init_nid_late(_nid) do {} while (0) #define pfn_in_present_section pfn_valid diff --git a/mm/internal.h b/mm/internal.h index 4e753bbf00ae..9ae0ee6c34f9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -964,6 +964,7 @@ void memmap_init_range(unsigned long, int, unsigned long, unsigned long, */ #ifdef CONFIG_SPARSEMEM void sparse_init(void); +int sparse_index_init(unsigned long section_nr, int nid); static inline void sparse_init_one_section(struct mem_section *ms, unsigned long pnum, struct page *mem_map, @@ -999,6 +1000,9 @@ static inline void __section_mark_present(struct mem_section *ms, static inline void sparse_init(void) {} #endif /* CONFIG_SPARSEMEM */ +/* + * mm/sparse-vmemmap.c + */ #ifdef CONFIG_SPARSEMEM_VMEMMAP void sparse_init_subsection_map(unsigned 
long pfn, unsigned long nr_pages); #else diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 842ed2f0bce6..24a37676cecb 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -591,3 +591,307 @@ void __init sparse_vmemmap_init_nid_late(int nid) hugetlb_vmemmap_init_late(nid); } #endif + +static void subsection_mask_set(unsigned long *map, unsigned long pfn, + unsigned long nr_pages) +{ + int idx = subsection_map_index(pfn); + int end = subsection_map_index(pfn + nr_pages - 1); + + bitmap_set(map, idx, end - idx + 1); +} + +void __init sparse_init_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + int end_sec_nr = pfn_to_section_nr(pfn + nr_pages - 1); + unsigned long nr, start_sec_nr = pfn_to_section_nr(pfn); + + for (nr = start_sec_nr; nr <= end_sec_nr; nr++) { + struct mem_section *ms; + unsigned long pfns; + + pfns = min(nr_pages, PAGES_PER_SECTION + - (pfn & ~PAGE_SECTION_MASK)); + ms = __nr_to_section(nr); + subsection_mask_set(ms->usage->subsection_map, pfn, pfns); + + pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr, + pfns, subsection_map_index(pfn), + subsection_map_index(pfn + pfns - 1)); + + pfn += pfns; + nr_pages -= pfns; + } +} + +#ifdef CONFIG_MEMORY_HOTPLUG + +/* Mark all memory sections within the pfn range as online */ +void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(pfn); + struct mem_section *ms = __nr_to_section(section_nr); + + ms->section_mem_map |= SECTION_IS_ONLINE; + } +} + +/* Mark all memory sections within the pfn range as offline */ +void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(pfn); + struct mem_section *ms = __nr_to_section(section_nr); + + ms->section_mem_map &= 
~SECTION_IS_ONLINE; + } +} + +static struct page * __meminit populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); +} + +static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) +{ + unsigned long start = (unsigned long) pfn_to_page(pfn); + unsigned long end = start + nr_pages * sizeof(struct page); + + vmemmap_free(start, end, altmap); +} +static void free_map_bootmem(struct page *memmap) +{ + unsigned long start = (unsigned long)memmap; + unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); + + vmemmap_free(start, end, NULL); +} + +static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; + DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 }; + struct mem_section *ms = __pfn_to_section(pfn); + unsigned long *subsection_map = ms->usage + ? 
&ms->usage->subsection_map[0] : NULL; + + subsection_mask_set(map, pfn, nr_pages); + if (subsection_map) + bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION); + + if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION), + "section already deactivated (%#lx + %ld)\n", + pfn, nr_pages)) + return -EINVAL; + + bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION); + return 0; +} + +static bool is_subsection_map_empty(struct mem_section *ms) +{ + return bitmap_empty(&ms->usage->subsection_map[0], + SUBSECTIONS_PER_SECTION); +} + +static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + struct mem_section *ms = __pfn_to_section(pfn); + DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; + unsigned long *subsection_map; + int rc = 0; + + subsection_mask_set(map, pfn, nr_pages); + + subsection_map = &ms->usage->subsection_map[0]; + + if (bitmap_empty(map, SUBSECTIONS_PER_SECTION)) + rc = -EINVAL; + else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION)) + rc = -EEXIST; + else + bitmap_or(subsection_map, map, subsection_map, + SUBSECTIONS_PER_SECTION); + + return rc; +} + +/* + * To deactivate a memory region, there are 3 cases to handle: + * + * 1. deactivation of a partial hot-added section: + * a) section was present at memory init. + * b) section was hot-added post memory init. + * 2. deactivation of a complete hot-added section. + * 3. deactivation of a complete section from memory init. + * + * For 1, when subsection_map does not empty we will not be freeing the + * usage map, but still need to free the vmemmap range. 
+ */ +static void section_deactivate(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) +{ + struct mem_section *ms = __pfn_to_section(pfn); + bool section_is_early = early_section(ms); + struct page *memmap = NULL; + bool empty; + + if (clear_subsection_map(pfn, nr_pages)) + return; + + empty = is_subsection_map_empty(ms); + if (empty) { + /* + * Mark the section invalid so that valid_section() + * return false. This prevents code from dereferencing + * ms->usage array. + */ + ms->section_mem_map &= ~SECTION_HAS_MEM_MAP; + + /* + * When removing an early section, the usage map is kept (as the + * usage maps of other sections fall into the same page). It + * will be re-used when re-adding the section - which is then no + * longer an early section. If the usage map is PageReserved, it + * was allocated during boot. + */ + if (!PageReserved(virt_to_page(ms->usage))) { + kfree_rcu(ms->usage, rcu); + WRITE_ONCE(ms->usage, NULL); + } + memmap = pfn_to_page(SECTION_ALIGN_DOWN(pfn)); + } + + /* + * The memmap of early sections is always fully populated. See + * section_activate() and pfn_valid() . 
+ */ + if (!section_is_early) { + memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE))); + depopulate_section_memmap(pfn, nr_pages, altmap); + } else if (memmap) { + memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), + PAGE_SIZE))); + free_map_bootmem(memmap); + } + + if (empty) + ms->section_mem_map = (unsigned long)NULL; +} + +static struct page * __meminit section_activate(int nid, unsigned long pfn, + unsigned long nr_pages, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + struct mem_section *ms = __pfn_to_section(pfn); + struct mem_section_usage *usage = NULL; + struct page *memmap; + int rc; + + if (!ms->usage) { + usage = kzalloc(mem_section_usage_size(), GFP_KERNEL); + if (!usage) + return ERR_PTR(-ENOMEM); + ms->usage = usage; + } + + rc = fill_subsection_map(pfn, nr_pages); + if (rc) { + if (usage) + ms->usage = NULL; + kfree(usage); + return ERR_PTR(rc); + } + + /* + * The early init code does not consider partially populated + * initial sections, it simply assumes that memory will never be + * referenced. If we hot-add memory into such a section then we + * do not need to populate the memmap and can simply reuse what + * is already there. + */ + if (nr_pages < PAGES_PER_SECTION && early_section(ms)) + return pfn_to_page(pfn); + + memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); + if (!memmap) { + section_deactivate(pfn, nr_pages, altmap); + return ERR_PTR(-ENOMEM); + } + memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)); + + return memmap; +} + +/** + * sparse_add_section - add a memory section, or populate an existing one + * @nid: The node to add section on + * @start_pfn: start pfn of the memory range + * @nr_pages: number of pfns to add in the section + * @altmap: alternate pfns to allocate the memmap backing store + * @pgmap: alternate compound page geometry for devmap mappings + * + * This is only intended for hotplug. 
+ * + * Note that only VMEMMAP supports sub-section aligned hotplug, + * the proper alignment and size are gated by check_pfn_span(). + * + * + * Return: + * * 0 - On success. + * * -EEXIST - Section has been present. + * * -ENOMEM - Out of memory. + */ +int __meminit sparse_add_section(int nid, unsigned long start_pfn, + unsigned long nr_pages, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + unsigned long section_nr = pfn_to_section_nr(start_pfn); + struct mem_section *ms; + struct page *memmap; + int ret; + + ret = sparse_index_init(section_nr, nid); + if (ret < 0) + return ret; + + memmap = section_activate(nid, start_pfn, nr_pages, altmap, pgmap); + if (IS_ERR(memmap)) + return PTR_ERR(memmap); + + /* + * Poison uninitialized struct pages in order to catch invalid flags + * combinations. + */ + page_init_poison(memmap, sizeof(struct page) * nr_pages); + + ms = __nr_to_section(section_nr); + __section_mark_present(ms, section_nr); + + /* Align memmap to section boundary in the subsection case */ + if (section_nr_to_pfn(section_nr) != start_pfn) + memmap = pfn_to_page(section_nr_to_pfn(section_nr)); + sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0); + + return 0; +} + +void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) +{ + struct mem_section *ms = __pfn_to_section(pfn); + + if (WARN_ON_ONCE(!valid_section(ms))) + return; + + section_deactivate(pfn, nr_pages, altmap); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/mm/sparse.c b/mm/sparse.c index ecd4c41c0ff0..007fd52c621e 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -79,7 +79,7 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid) return section; } -static int __meminit sparse_index_init(unsigned long section_nr, int nid) +int __meminit sparse_index_init(unsigned long section_nr, int nid) { unsigned long root = SECTION_NR_TO_ROOT(section_nr); struct mem_section *section; @@ -103,7 +103,7 @@ static int __meminit 
sparse_index_init(unsigned long section_nr, int nid) return 0; } #else /* !SPARSEMEM_EXTREME */ -static inline int sparse_index_init(unsigned long section_nr, int nid) +int sparse_index_init(unsigned long section_nr, int nid) { return 0; } @@ -167,40 +167,6 @@ static inline unsigned long first_present_section_nr(void) return next_present_section_nr(-1); } -#ifdef CONFIG_SPARSEMEM_VMEMMAP -static void subsection_mask_set(unsigned long *map, unsigned long pfn, - unsigned long nr_pages) -{ - int idx = subsection_map_index(pfn); - int end = subsection_map_index(pfn + nr_pages - 1); - - bitmap_set(map, idx, end - idx + 1); -} - -void __init sparse_init_subsection_map(unsigned long pfn, unsigned long nr_pages) -{ - int end_sec_nr = pfn_to_section_nr(pfn + nr_pages - 1); - unsigned long nr, start_sec_nr = pfn_to_section_nr(pfn); - - for (nr = start_sec_nr; nr <= end_sec_nr; nr++) { - struct mem_section *ms; - unsigned long pfns; - - pfns = min(nr_pages, PAGES_PER_SECTION - - (pfn & ~PAGE_SECTION_MASK)); - ms = __nr_to_section(nr); - subsection_mask_set(ms->usage->subsection_map, pfn, pfns); - - pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr, - pfns, subsection_map_index(pfn), - subsection_map_index(pfn + pfns - 1)); - - pfn += pfns; - nr_pages -= pfns; - } -} -#endif - /* Record a memory area against a node. 
*/ static void __init memory_present(int nid, unsigned long start, unsigned long end) { @@ -482,275 +448,3 @@ void __init sparse_init(void) sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count); vmemmap_populate_print_last(); } - -#ifdef CONFIG_MEMORY_HOTPLUG - -/* Mark all memory sections within the pfn range as online */ -void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long pfn; - - for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - unsigned long section_nr = pfn_to_section_nr(pfn); - struct mem_section *ms = __nr_to_section(section_nr); - - ms->section_mem_map |= SECTION_IS_ONLINE; - } -} - -/* Mark all memory sections within the pfn range as offline */ -void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long pfn; - - for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - unsigned long section_nr = pfn_to_section_nr(pfn); - struct mem_section *ms = __nr_to_section(section_nr); - - ms->section_mem_map &= ~SECTION_IS_ONLINE; - } -} - -static struct page * __meminit populate_section_memmap(unsigned long pfn, - unsigned long nr_pages, int nid, struct vmem_altmap *altmap, - struct dev_pagemap *pgmap) -{ - return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); -} - -static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) -{ - unsigned long start = (unsigned long) pfn_to_page(pfn); - unsigned long end = start + nr_pages * sizeof(struct page); - - vmemmap_free(start, end, altmap); -} -static void free_map_bootmem(struct page *memmap) -{ - unsigned long start = (unsigned long)memmap; - unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); - - vmemmap_free(start, end, NULL); -} - -static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages) -{ - DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; - DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 }; - struct 
mem_section *ms = __pfn_to_section(pfn); - unsigned long *subsection_map = ms->usage - ? &ms->usage->subsection_map[0] : NULL; - - subsection_mask_set(map, pfn, nr_pages); - if (subsection_map) - bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION); - - if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION), - "section already deactivated (%#lx + %ld)\n", - pfn, nr_pages)) - return -EINVAL; - - bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION); - return 0; -} - -static bool is_subsection_map_empty(struct mem_section *ms) -{ - return bitmap_empty(&ms->usage->subsection_map[0], - SUBSECTIONS_PER_SECTION); -} - -static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) -{ - struct mem_section *ms = __pfn_to_section(pfn); - DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; - unsigned long *subsection_map; - int rc = 0; - - subsection_mask_set(map, pfn, nr_pages); - - subsection_map = &ms->usage->subsection_map[0]; - - if (bitmap_empty(map, SUBSECTIONS_PER_SECTION)) - rc = -EINVAL; - else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION)) - rc = -EEXIST; - else - bitmap_or(subsection_map, map, subsection_map, - SUBSECTIONS_PER_SECTION); - - return rc; -} - -/* - * To deactivate a memory region, there are 3 cases to handle: - * - * 1. deactivation of a partial hot-added section: - * a) section was present at memory init. - * b) section was hot-added post memory init. - * 2. deactivation of a complete hot-added section. - * 3. deactivation of a complete section from memory init. - * - * For 1, when subsection_map does not empty we will not be freeing the - * usage map, but still need to free the vmemmap range. 
- */ -static void section_deactivate(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) -{ - struct mem_section *ms = __pfn_to_section(pfn); - bool section_is_early = early_section(ms); - struct page *memmap = NULL; - bool empty; - - if (clear_subsection_map(pfn, nr_pages)) - return; - - empty = is_subsection_map_empty(ms); - if (empty) { - /* - * Mark the section invalid so that valid_section() - * return false. This prevents code from dereferencing - * ms->usage array. - */ - ms->section_mem_map &= ~SECTION_HAS_MEM_MAP; - - /* - * When removing an early section, the usage map is kept (as the - * usage maps of other sections fall into the same page). It - * will be re-used when re-adding the section - which is then no - * longer an early section. If the usage map is PageReserved, it - * was allocated during boot. - */ - if (!PageReserved(virt_to_page(ms->usage))) { - kfree_rcu(ms->usage, rcu); - WRITE_ONCE(ms->usage, NULL); - } - memmap = pfn_to_page(SECTION_ALIGN_DOWN(pfn)); - } - - /* - * The memmap of early sections is always fully populated. See - * section_activate() and pfn_valid() . 
- */ - if (!section_is_early) { - memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE))); - depopulate_section_memmap(pfn, nr_pages, altmap); - } else if (memmap) { - memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), - PAGE_SIZE))); - free_map_bootmem(memmap); - } - - if (empty) - ms->section_mem_map = (unsigned long)NULL; -} - -static struct page * __meminit section_activate(int nid, unsigned long pfn, - unsigned long nr_pages, struct vmem_altmap *altmap, - struct dev_pagemap *pgmap) -{ - struct mem_section *ms = __pfn_to_section(pfn); - struct mem_section_usage *usage = NULL; - struct page *memmap; - int rc; - - if (!ms->usage) { - usage = kzalloc(mem_section_usage_size(), GFP_KERNEL); - if (!usage) - return ERR_PTR(-ENOMEM); - ms->usage = usage; - } - - rc = fill_subsection_map(pfn, nr_pages); - if (rc) { - if (usage) - ms->usage = NULL; - kfree(usage); - return ERR_PTR(rc); - } - - /* - * The early init code does not consider partially populated - * initial sections, it simply assumes that memory will never be - * referenced. If we hot-add memory into such a section then we - * do not need to populate the memmap and can simply reuse what - * is already there. - */ - if (nr_pages < PAGES_PER_SECTION && early_section(ms)) - return pfn_to_page(pfn); - - memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); - if (!memmap) { - section_deactivate(pfn, nr_pages, altmap); - return ERR_PTR(-ENOMEM); - } - memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)); - - return memmap; -} - -/** - * sparse_add_section - add a memory section, or populate an existing one - * @nid: The node to add section on - * @start_pfn: start pfn of the memory range - * @nr_pages: number of pfns to add in the section - * @altmap: alternate pfns to allocate the memmap backing store - * @pgmap: alternate compound page geometry for devmap mappings - * - * This is only intended for hotplug. 
- * - * Note that only VMEMMAP supports sub-section aligned hotplug, - * the proper alignment and size are gated by check_pfn_span(). - * - * - * Return: - * * 0 - On success. - * * -EEXIST - Section has been present. - * * -ENOMEM - Out of memory. - */ -int __meminit sparse_add_section(int nid, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap, - struct dev_pagemap *pgmap) -{ - unsigned long section_nr = pfn_to_section_nr(start_pfn); - struct mem_section *ms; - struct page *memmap; - int ret; - - ret = sparse_index_init(section_nr, nid); - if (ret < 0) - return ret; - - memmap = section_activate(nid, start_pfn, nr_pages, altmap, pgmap); - if (IS_ERR(memmap)) - return PTR_ERR(memmap); - - /* - * Poison uninitialized struct pages in order to catch invalid flags - * combinations. - */ - page_init_poison(memmap, sizeof(struct page) * nr_pages); - - ms = __nr_to_section(section_nr); - __section_mark_present(ms, section_nr); - - /* Align memmap to section boundary in the subsection case */ - if (section_nr_to_pfn(section_nr) != start_pfn) - memmap = pfn_to_page(section_nr_to_pfn(section_nr)); - sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0); - - return 0; -} - -void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, - struct vmem_altmap *altmap) -{ - struct mem_section *ms = __pfn_to_section(pfn); - - if (WARN_ON_ONCE(!valid_section(ms))) - return; - - section_deactivate(pfn, nr_pages, altmap); -} -#endif /* CONFIG_MEMORY_HOTPLUG */ From 078f80f909ba9fa3060e89dc634ff4b1defc43d3 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Thu, 19 Mar 2026 09:19:40 +0100 Subject: [PATCH 277/369] mm: remove CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE Patch series "mm: remove CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE and cleanup CONFIG_MIGRATION". While working on memory hotplug code cleanups, I realized that CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE is not really required anymore. 
Changing that revealed some rather nasty looking CONFIG_MIGRATION handling. Let's clean that up by introducing a dedicated CONFIG_NUMA_MIGRATION option and reducing the dependencies that CONFIG_MIGRATION has. This patch (of 2): All architectures that select CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE also select CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG. So we can just remove CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE. For CONFIG_MIGRATION, make it depend on CONFIG_MEMORY_HOTREMOVE instead, and make CONFIG_MEMORY_HOTREMOVE select CONFIG_MIGRATION (just like CONFIG_CMA and CONFIG_COMPACTION already do). We'll clean up CONFIG_MIGRATION next. Link: https://lkml.kernel.org/r/20260319-config_migration-v1-0-42270124966f@kernel.org Link: https://lkml.kernel.org/r/20260319-config_migration-v1-1-42270124966f@kernel.org Signed-off-by: David Hildenbrand (Arm) Acked-by: Zi Yan Reviewed-by: Lorenzo Stoakes (Oracle) Reviewed-by: Joshua Hahn Reviewed-by: Gregory Price Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Jonathan Cameron Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alistair Popple Cc: "Borislav Petkov (AMD)" Cc: Byungchul Park Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: "Huang, Ying" Cc: Ingo Molnar Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Brost Cc: Michael Ellerman Cc: Michal Hocko Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Rakie Kim Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 1 - arch/loongarch/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/riscv/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/x86/Kconfig | 1 - mm/Kconfig | 9 +++------ 7 files changed, 3 insertions(+), 12 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 38dba5f7e4d2..bcd9f5bc66e2 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -16,7 +16,6 @@ config ARM64 select ARCH_BINFMT_ELF_STATE select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_MEMORY_HOTPLUG - select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_CACHE_LINE_SIZE diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 92068ff38685..ac714d14133a 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -12,7 +12,6 @@ config LOONGARCH select ARCH_NEEDS_DEFER_KASAN select ARCH_DISABLE_KASAN_INLINE select ARCH_ENABLE_MEMORY_HOTPLUG - select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_CPU_FINALIZE_INIT diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 10240cb80904..617758d7155b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -126,7 +126,6 @@ config PPC select ARCH_DISABLE_KASAN_INLINE if PPC_RADIX_MMU select ARCH_DMA_DEFAULT_COHERENT if !NOT_COHERENT_CACHE select ARCH_ENABLE_MEMORY_HOTPLUG - select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_HAS_COPY_MC if PPC64 select ARCH_HAS_CURRENT_STACK_POINTER 
select ARCH_HAS_DEBUG_VIRTUAL diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 90c531e6abf5..61a9d8d3ea64 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -21,7 +21,6 @@ config RISCV select ARCH_DMA_DEFAULT_COHERENT select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM_VMEMMAP - select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_BINFMT_FLAT diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index edc927d9e85a..d01800962d84 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -85,7 +85,6 @@ config S390 select ARCH_32BIT_USTAT_F_TINODE select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM - select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_CC_CAN_LINK diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e2df1b147184..c290fe363f27 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -69,7 +69,6 @@ config X86 select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 - select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG select ARCH_ENABLE_SPLIT_PMD_PTLOCK if (PGTABLE_LEVELS > 2) && (X86_64 || X86_PAE) select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI diff --git a/mm/Kconfig b/mm/Kconfig index c012944938a7..b2e21d873d3f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -465,9 +465,6 @@ config HAVE_BOOTMEM_INFO_NODE config ARCH_ENABLE_MEMORY_HOTPLUG bool -config ARCH_ENABLE_MEMORY_HOTREMOVE - bool - # eventually, we can have this option just 'select SPARSEMEM' menuconfig MEMORY_HOTPLUG bool "Memory hotplug" @@ -540,8 +537,8 @@ endchoice 
config MEMORY_HOTREMOVE bool "Allow for memory hot remove" select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64) - depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE - depends on MIGRATION + depends on MEMORY_HOTPLUG + select MIGRATION config MHP_MEMMAP_ON_MEMORY def_bool y @@ -636,7 +633,7 @@ config PAGE_REPORTING config MIGRATION bool "Page migration" default y - depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU + depends on (NUMA || MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU help Allows the migration of the physical location of pages of processes while the virtual addresses are not changed. This is useful in From 6ebf98d71f9b509e833e0af00795ad3723d2f410 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Thu, 19 Mar 2026 09:19:41 +0100 Subject: [PATCH 278/369] mm: introduce CONFIG_NUMA_MIGRATION and simplify CONFIG_MIGRATION CONFIG_MEMORY_HOTREMOVE, CONFIG_COMPACTION and CONFIG_CMA all select CONFIG_MIGRATION, because they require it to work (users). Only CONFIG_NUMA_BALANCING and CONFIG_BALLOON_MIGRATION depend on CONFIG_MIGRATION. CONFIG_BALLOON_MIGRATION is not an actual user, but an implementation of migration support, so the dependency is correct (CONFIG_BALLOON_MIGRATION does not make any sense without CONFIG_MIGRATION). However, kconfig-language.rst clearly states "In general use select only for non-visible symbols". So far CONFIG_MIGRATION is user-visible ... and the dependencies rather confusing. The whole reason why CONFIG_MIGRATION is user-visible is because of CONFIG_NUMA: some users might want CONFIG_NUMA but not page migration support. Let's clean all that up by introducing a dedicated CONFIG_NUMA_MIGRATION config option for that purpose only. Make CONFIG_NUMA_BALANCING that so far depended on CONFIG_NUMA && CONFIG_MIGRATION to depend on CONFIG_MIGRATION instead. CONFIG_NUMA_MIGRATION will depend on CONFIG_NUMA && CONFIG_MMU. CONFIG_NUMA_MIGRATION is user-visible and will default to "y". 
We use that default so new configs will automatically enable it, just like it was the case with CONFIG_MIGRATION. The downside is that some configs that used to have CONFIG_MIGRATION=n might get it re-enabled by CONFIG_NUMA_MIGRATION=y, which shouldn't be a problem. CONFIG_MIGRATION is now a non-visible config option. Any code that select CONFIG_MIGRATION (as before) must depend directly or indirectly on CONFIG_MMU. CONFIG_NUMA_MIGRATION is responsible for any NUMA migration code, which is mempolicy migration code, memory-tiering code, and move_pages() code in migrate.c. CONFIG_NUMA_BALANCING uses its functionality. Note that this implies that with CONFIG_NUMA_MIGRATION=n, move_pages() will not be available even though CONFIG_MIGRATION=y, which is an expected change. In migrate.c, we can remove the CONFIG_NUMA check as both CONFIG_NUMA_MIGRATION and CONFIG_NUMA_BALANCING depend on it. With this change, CONFIG_MIGRATION is an internal config, all users of migration selects CONFIG_MIGRATION, and only CONFIG_BALLOON_MIGRATION depends on it. Link: https://lkml.kernel.org/r/20260319-config_migration-v1-2-42270124966f@kernel.org Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: Zi Yan Reviewed-by: Jonathan Cameron Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alistair Popple Cc: "Borislav Petkov (AMD)" Cc: Byungchul Park Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Gregory Price Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: "Huang, Ying" Cc: Ingo Molnar Cc: Joshua Hahn Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Matthew Brost Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Rakie Kim Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 2 +- init/Kconfig | 2 +- mm/Kconfig | 24 ++++++++++++------------ mm/memory-tiers.c | 12 ++++++------ mm/mempolicy.c | 2 +- mm/migrate.c | 5 ++--- 6 files changed, 23 insertions(+), 24 deletions(-) diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 96987d9d95a8..7999c58629ee 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -52,7 +52,7 @@ int mt_perf_to_adistance(struct access_coordinate *perf, int *adist); struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types); void mt_put_memory_types(struct list_head *memory_types); -#ifdef CONFIG_MIGRATION +#ifdef CONFIG_NUMA_MIGRATION int next_demotion_node(int node, const nodemask_t *allowed_mask); void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets); bool node_is_toptier(int node); diff --git a/init/Kconfig b/init/Kconfig index 444ce811ea67..3648e401b78b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -997,7 +997,7 @@ config NUMA_BALANCING bool "Memory placement aware NUMA scheduler" depends on ARCH_SUPPORTS_NUMA_BALANCING depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION && !PREEMPT_RT + depends on SMP && NUMA_MIGRATION && !PREEMPT_RT help This option adds support for automatic NUMA aware memory/task placement. 
The mechanism is quite primitive and is based on migrating memory when diff --git a/mm/Kconfig b/mm/Kconfig index b2e21d873d3f..bd283958d675 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -627,20 +627,20 @@ config PAGE_REPORTING those pages to another entity, such as a hypervisor, so that the memory can be freed within the host for other uses. -# -# support for page migration -# -config MIGRATION - bool "Page migration" +config NUMA_MIGRATION + bool "NUMA page migration" default y - depends on (NUMA || MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU + depends on NUMA && MMU + select MIGRATION help - Allows the migration of the physical location of pages of processes - while the virtual addresses are not changed. This is useful in - two situations. The first is on NUMA systems to put pages nearer - to the processors accessing. The second is when allocating huge - pages as migration can relocate pages to satisfy a huge page - allocation instead of reclaiming. + Support the migration of pages to other NUMA nodes, available to + user space through interfaces like migrate_pages(), move_pages(), + and mbind(). Selecting this option also enables support for page + demotion for memory tiering. 
+ +config MIGRATION + bool + depends on MMU config DEVICE_MIGRATION def_bool MIGRATION && ZONE_DEVICE diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 986f809376eb..54851d8a195b 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -69,7 +69,7 @@ bool folio_use_access_time(struct folio *folio) } #endif -#ifdef CONFIG_MIGRATION +#ifdef CONFIG_NUMA_MIGRATION static int top_tier_adistance; /* * node_demotion[] examples: @@ -129,7 +129,7 @@ static int top_tier_adistance; * */ static struct demotion_nodes *node_demotion __read_mostly; -#endif /* CONFIG_MIGRATION */ +#endif /* CONFIG_NUMA_MIGRATION */ static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms); @@ -273,7 +273,7 @@ static struct memory_tier *__node_get_memory_tier(int node) lockdep_is_held(&memory_tier_lock)); } -#ifdef CONFIG_MIGRATION +#ifdef CONFIG_NUMA_MIGRATION bool node_is_toptier(int node) { bool toptier; @@ -519,7 +519,7 @@ static void establish_demotion_targets(void) #else static inline void establish_demotion_targets(void) {} -#endif /* CONFIG_MIGRATION */ +#endif /* CONFIG_NUMA_MIGRATION */ static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype) { @@ -911,7 +911,7 @@ static int __init memory_tier_init(void) if (ret) panic("%s() failed to register memory tier subsystem\n", __func__); -#ifdef CONFIG_MIGRATION +#ifdef CONFIG_NUMA_MIGRATION node_demotion = kzalloc_objs(struct demotion_nodes, nr_node_ids); WARN_ON(!node_demotion); #endif @@ -938,7 +938,7 @@ subsys_initcall(memory_tier_init); bool numa_demotion_enabled = false; -#ifdef CONFIG_MIGRATION +#ifdef CONFIG_NUMA_MIGRATION #ifdef CONFIG_SYSFS static ssize_t demotion_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e5528c35bbb8..fd08771e2057 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1239,7 +1239,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } -#ifdef CONFIG_MIGRATION +#ifdef 
CONFIG_NUMA_MIGRATION static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { diff --git a/mm/migrate.c b/mm/migrate.c index 3323fc96b1cd..4241eb6eca00 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2222,8 +2222,7 @@ struct folio *alloc_migration_target(struct folio *src, unsigned long private) return __folio_alloc(gfp_mask, order, nid, mtc->nmask); } -#ifdef CONFIG_NUMA - +#ifdef CONFIG_NUMA_MIGRATION static int store_status(int __user *status, int start, int value, int nr) { while (nr-- > 0) { @@ -2622,6 +2621,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, { return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); } +#endif /* CONFIG_NUMA_MIGRATION */ #ifdef CONFIG_NUMA_BALANCING /* @@ -2764,4 +2764,3 @@ int migrate_misplaced_folio(struct folio *folio, int node) return nr_remaining ? -EAGAIN : 0; } #endif /* CONFIG_NUMA_BALANCING */ -#endif /* CONFIG_NUMA */ From dc711106a0bc76a30e0fbd16ed4d348171547d9a Mon Sep 17 00:00:00 2001 From: teawater Date: Thu, 19 Mar 2026 14:59:24 +0800 Subject: [PATCH 279/369] zsmalloc: return -EBUSY for zspage migration lock contention movable_operations::migrate_page() should return an appropriate error code for temporary migration failures so the migration core can handle them correctly. zs_page_migrate() currently returns -EINVAL when zspage_write_trylock() fails. That path reflects transient lock contention, not invalid input, so -EINVAL is clearly wrong. However, -EAGAIN is also inappropriate here: the zspage's reader-lock owner may hold the lock for an unbounded duration due to slow decompression or reader-lock owner preemption. Since migration retries are bounded by NR_MAX_MIGRATE_PAGES_RETRY and performed with virtually no delay between attempts, there is no guarantee the lock will be released in time for a retry to succeed. -EAGAIN implies "try again soon", which does not hold in this case. 
Return -EBUSY instead, which more accurately conveys that the resource is occupied and migration cannot proceed at this time. Link: https://lkml.kernel.org/r/20260319065924.69337-1-hui.zhu@linux.dev Signed-off-by: teawater Acked-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2c1430bf8d57..e7417ece1c12 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1727,7 +1727,19 @@ static int zs_page_migrate(struct page *newpage, struct page *page, if (!zspage_write_trylock(zspage)) { spin_unlock(&class->lock); write_unlock(&pool->lock); - return -EINVAL; + /* + * Return -EBUSY but not -EAGAIN: the zspage's reader-lock + * owner may hold the lock for an unbounded duration due to a + * slow decompression or reader-lock owner preemption. + * Since migration retries are bounded by + * NR_MAX_MIGRATE_PAGES_RETRY and performed with virtually no + * delay between attempts, there is no guarantee the lock will + * be released in time for a retry to succeed. + * -EAGAIN implies "try again soon", which does not hold here. + * -EBUSY more accurately conveys "resource is occupied, + * migration cannot proceed". + */ + return -EBUSY; } /* We're committed, tell the world that this is a Zsmalloc page. */ From a6a8c087dce00eac0c6d03e560b0fa3d529afa5f Mon Sep 17 00:00:00 2001 From: Leno Hou Date: Thu, 19 Mar 2026 00:30:49 +0800 Subject: [PATCH 280/369] mm/mglru: fix cgroup OOM during MGLRU state switching When the Multi-Gen LRU (MGLRU) state is toggled dynamically, a race condition exists between the state switching and the memory reclaim path. This can lead to unexpected cgroup OOM kills, even when plenty of reclaimable memory is available. Problem Description ================== The issue arises from a "reclaim vacuum" during the transition. 1. 
When disabling MGLRU, lru_gen_change_state() sets lrugen->enabled to false before the pages are drained from MGLRU lists back to traditional LRU lists. 2. Concurrent reclaimers in shrink_lruvec() see lrugen->enabled as false and skip the MGLRU path. 3. However, these pages might not have reached the traditional LRU lists yet, or the changes are not yet visible to all CPUs due to a lack of synchronization. 4. get_scan_count() subsequently finds traditional LRU lists empty, concludes there is no reclaimable memory, and triggers an OOM kill. A similar race can occur during enablement, where the reclaimer sees the new state but the MGLRU lists haven't been populated via fill_evictable() yet. Solution ======== Introduce a 'switching' state (`lru_switch`) to bridge the transition. When transitioning, the system enters this intermediate state where the reclaimer is forced to attempt both MGLRU and traditional reclaim paths sequentially. This ensures that folios remain visible to at least one reclaim mechanism until the transition is fully materialized across all CPUs. Race & Mitigation ================ A race window exists between checking the 'draining' state and performing the actual list operations. For instance, a reclaimer might observe the draining state as false just before it changes, leading to a suboptimal reclaim path decision. However, this impact is effectively mitigated by the kernel's reclaim retry mechanism (e.g., in do_try_to_free_pages). If a reclaimer pass fails to find eligible folios due to a state transition race, subsequent retries in the loop will observe the updated state and correctly direct the scan to the appropriate LRU lists. This ensures the transient inconsistency does not escalate into a terminal OOM kill. This effectively reduces the race window that previously triggered OOMs under high memory pressure. This fix has been verified on v7.0.0-rc1; dynamic toggling of MGLRU functions correctly without triggering unexpected OOM kills. 
Link: https://lkml.kernel.org/r/20260319-b4-switch-mglru-v2-v5-1-8898491e5f17@gmail.com Signed-off-by: Leno Hou Acked-by: Yafang Shao Reviewed-by: Barry Song Reviewed-by: Axel Rasmussen Cc: Yuanchu Xie Cc: Wei Xu Cc: Jialing Wang Cc: Yu Zhao Cc: Kairui Song Cc: Bingfang Guo Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 11 +++++++++++ mm/rmap.c | 7 ++++++- mm/vmscan.c | 33 ++++++++++++++++++++++++--------- 3 files changed, 41 insertions(+), 10 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index fa2d6ba811b5..2aedcff6a2c1 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -102,6 +102,12 @@ static __always_inline enum lru_list folio_lru_list(const struct folio *folio) #ifdef CONFIG_LRU_GEN +static inline bool lru_gen_switching(void) +{ + DECLARE_STATIC_KEY_FALSE(lru_switch); + + return static_branch_unlikely(&lru_switch); +} #ifdef CONFIG_LRU_GEN_ENABLED static inline bool lru_gen_enabled(void) { @@ -316,6 +322,11 @@ static inline bool lru_gen_enabled(void) return false; } +static inline bool lru_gen_switching(void) +{ + return false; +} + static inline bool lru_gen_in_fault(void) { return false; diff --git a/mm/rmap.c b/mm/rmap.c index abe4712a220c..78b7fb5f367c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -973,7 +973,12 @@ static bool folio_referenced_one(struct folio *folio, nr = folio_pte_batch(folio, pvmw.pte, pteval, max_nr); } - if (lru_gen_enabled() && pvmw.pte) { + /* + * When LRU is switching, we don't know where the surrounding folios + * are — they could be on active/inactive lists or on MGLRU. So the + * simplest approach is to disable this look-around optimization. 
+ */ + if (lru_gen_enabled() && !lru_gen_switching() && pvmw.pte) { if (lru_gen_look_around(&pvmw, nr)) referenced++; } else if (pvmw.pte) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 641a6063f375..42f834c508bc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -905,7 +905,7 @@ static enum folio_references folio_check_references(struct folio *folio, if (referenced_ptes == -1) return FOLIOREF_KEEP; - if (lru_gen_enabled()) { + if (lru_gen_enabled() && !lru_gen_switching()) { if (!referenced_ptes) return FOLIOREF_RECLAIM; @@ -2308,7 +2308,7 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) unsigned long file; struct lruvec *target_lruvec; - if (lru_gen_enabled()) + if (lru_gen_enabled() && !lru_gen_switching()) return; target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); @@ -2647,6 +2647,7 @@ static bool can_age_anon_pages(struct lruvec *lruvec, #ifdef CONFIG_LRU_GEN +DEFINE_STATIC_KEY_FALSE(lru_switch); #ifdef CONFIG_LRU_GEN_ENABLED DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); #define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) @@ -5181,6 +5182,8 @@ static void lru_gen_change_state(bool enabled) if (enabled == lru_gen_enabled()) goto unlock; + static_branch_enable_cpuslocked(&lru_switch); + if (enabled) static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); else @@ -5211,6 +5214,9 @@ static void lru_gen_change_state(bool enabled) cond_resched(); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + + static_branch_disable_cpuslocked(&lru_switch); + unlock: mutex_unlock(&state_mutex); put_online_mems(); @@ -5783,9 +5789,12 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) bool proportional_reclaim; struct blk_plug plug; - if (lru_gen_enabled() && !root_reclaim(sc)) { + if ((lru_gen_enabled() || lru_gen_switching()) && !root_reclaim(sc)) { lru_gen_shrink_lruvec(lruvec, sc); - return; + + if (!lru_gen_switching()) + return; + } get_scan_count(lruvec, sc, nr); @@ 
-6045,10 +6054,13 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) struct lruvec *target_lruvec; bool reclaimable = false; - if (lru_gen_enabled() && root_reclaim(sc)) { + if ((lru_gen_enabled() || lru_gen_switching()) && root_reclaim(sc)) { memset(&sc->nr, 0, sizeof(sc->nr)); lru_gen_shrink_node(pgdat, sc); - return; + + if (!lru_gen_switching()) + return; + } target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); @@ -6318,7 +6330,7 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) struct lruvec *target_lruvec; unsigned long refaults; - if (lru_gen_enabled()) + if (lru_gen_enabled() && !lru_gen_switching()) return; target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); @@ -6708,9 +6720,12 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) struct mem_cgroup *memcg; struct lruvec *lruvec; - if (lru_gen_enabled()) { + if (lru_gen_enabled() || lru_gen_switching()) { lru_gen_age_node(pgdat, sc); - return; + + if (!lru_gen_switching()) + return; + } lruvec = mem_cgroup_lruvec(NULL, pgdat); From cc4555fc6d8f8585ae05f3e117cfdd60e4673281 Mon Sep 17 00:00:00 2001 From: Josh Law Date: Fri, 20 Mar 2026 07:36:45 -0700 Subject: [PATCH 281/369] mm/damon/core: document damos_commit_dests() failure semantics Add a kernel-doc-like comment to damos_commit_dests() documenting its allocation failure contract: on -ENOMEM, the destination structure is left in a partially torn-down state that is safe to deallocate via damon_destroy_scheme(), but must not be reused for further commits. This was unclear from the code alone and led to a separate patch [1] attempting to reset nr_dests on failure. Make the intended usage explicit so future readers do not repeat the confusion. 
Link: https://lkml.kernel.org/r/20260320143648.91673-1-sj@kernel.org Link: https://lore.kernel.org/20260318214939.36100-1-objecting@objecting.org [1] Signed-off-by: Josh Law Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index f342bee002dc..db6c67e52d2b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1060,6 +1060,23 @@ static void damos_set_filters_default_reject(struct damos *s) damos_filters_default_reject(&s->ops_filters); } +/* + * damos_commit_dests() - Copy migration destinations from @src to @dst. + * @dst: Destination structure to update. + * @src: Source structure to copy from. + * + * If the number of destinations has changed, the old arrays in @dst are freed + * and new ones are allocated. On success, @dst contains a full copy of + * @src's arrays and count. + * + * On allocation failure, @dst is left in a partially torn-down state: its + * arrays may be NULL and @nr_dests may not reflect the actual allocation + * sizes. The structure remains safe to deallocate via damon_destroy_scheme(), + * but callers must not reuse @dst for further commits — it should be + * discarded. + * + * Return: 0 on success, -ENOMEM on allocation failure. + */ static int damos_commit_dests(struct damos_migrate_dests *dst, struct damos_migrate_dests *src) { From 6f1e1823875f59591df1159b7d193b40337ef524 Mon Sep 17 00:00:00 2001 From: Liew Rui Yan Date: Fri, 20 Mar 2026 13:24:28 +0800 Subject: [PATCH 282/369] Docs/mm/damon: document min_nr_regions constraint and rationale The current DAMON implementation requires 'min_nr_regions' to be at least 3. However, this constraint is not explicitly documented in the admin-guide documents, nor is its design rationale explained in the design document. 
Add a section in design.rst to explain the rationale: the virtual address space monitoring design needs to handle at least three regions to accommodate two large unmapped areas. While this is specific to 'vaddr', DAMON currently enforces it across all operation sets for consistency. Also update reclaim.rst and lru_sort.rst by adding cross-references to this constraint within their respective 'min_nr_regions' parameter description sections, ensuring users are aware of the lower bound. This change is motivated from a recent discussion [1]. Link: https://lkml.kernel.org/r/20260320052428.213230-1-aethernet65535@gmail.com Link: https://lore.kernel.org/damon/20260319151528.86490-1-sj@kernel.org/T/#t [1] Signed-off-by: Liew Rui Yan Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/lru_sort.rst | 4 ++++ Documentation/admin-guide/mm/damon/reclaim.rst | 4 ++++ Documentation/mm/damon/design.rst | 7 +++++++ 3 files changed, 15 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/lru_sort.rst b/Documentation/admin-guide/mm/damon/lru_sort.rst index 56690646cf39..a7dea7c75a9b 100644 --- a/Documentation/admin-guide/mm/damon/lru_sort.rst +++ b/Documentation/admin-guide/mm/damon/lru_sort.rst @@ -221,6 +221,10 @@ But, setting this too high could result in increased monitoring overhead. Please refer to the DAMON documentation (:doc:`usage`) for more detail. 10 by default. +Note that this must be 3 or higher. Please refer to the :ref:`Monitoring +` section of the design document for the rationale +behind this lower bound. + max_nr_regions -------------- diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index 442ac5c64795..47854c461706 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -204,6 +204,10 @@ monitoring. This can be used to set lower-bound of the monitoring quality. 
But, setting this too high could result in increased monitoring overhead. Please refer to the DAMON documentation (:doc:`usage`) for more detail. +Note that this must be 3 or higher. Please refer to the :ref:`Monitoring +` section of the design document for the rationale +behind this lower bound. + max_nr_regions -------------- diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 838b14d22519..afc7d52bda2f 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -167,6 +167,13 @@ monitoring attributes, ``sampling interval``, ``aggregation interval``, ``update interval``, ``minimum number of regions``, and ``maximum number of regions``. +Note that ``minimum number of regions`` must be 3 or higher. This is because the +virtual address space monitoring is designed to handle at least three regions to +accommodate two large unmapped areas commonly found in normal virtual address +spaces. While this restriction might not be strictly necessary for other +operation sets like ``paddr``, it is currently enforced across all DAMON +operations for consistency. + To know how user-space can set the attributes via :ref:`DAMON sysfs interface `, refer to :ref:`monitoring_attrs ` part of the documentation. From 1871d548fc4feb007644efb6d669c93a4e191254 Mon Sep 17 00:00:00 2001 From: Hubert Mazur Date: Fri, 20 Mar 2026 07:57:23 +0000 Subject: [PATCH 283/369] mm/execmem: make the populate and alloc atomic When a block of memory is requested from the execmem manager it tries to find a suitable fragment by traversing the free_areas. In case there is no such block, a new memory area is added to the free_areas and then allocated to the caller by traversing the free_area tree again. The above operations of allocation and tree traversal are not atomic hence another request may consume this newly allocated memory block which results in the allocation failure for the original request. 
Such occurrence can be spotted on devices running the 6.18 kernel during the parallel modules loading. To mitigate such resource races execute the cache population and allocation operations under one mutex lock. Link: https://lkml.kernel.org/r/20260320075723.779985-1-hmazur@google.com Signed-off-by: Hubert Mazur Reviewed-by: Mike Rapoport (Microsoft) Cc: Greg Kroah-Hartman Cc: Stanislaw Kardach Cc: Michal Krawczyk Cc: Slawomir Rosek Cc: Hubert Mazur Signed-off-by: Andrew Morton --- mm/execmem.c | 55 +++++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/mm/execmem.c b/mm/execmem.c index 810a4ba9c924..084a207e4278 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -203,13 +203,6 @@ static int execmem_cache_add_locked(void *ptr, size_t size, gfp_t gfp_mask) return mas_store_gfp(&mas, (void *)lower, gfp_mask); } -static int execmem_cache_add(void *ptr, size_t size, gfp_t gfp_mask) -{ - guard(mutex)(&execmem_cache.mutex); - - return execmem_cache_add_locked(ptr, size, gfp_mask); -} - static bool within_range(struct execmem_range *range, struct ma_state *mas, size_t size) { @@ -225,18 +218,16 @@ static bool within_range(struct execmem_range *range, struct ma_state *mas, return false; } -static void *__execmem_cache_alloc(struct execmem_range *range, size_t size) +static void *execmem_cache_alloc_locked(struct execmem_range *range, size_t size) { struct maple_tree *free_areas = &execmem_cache.free_areas; struct maple_tree *busy_areas = &execmem_cache.busy_areas; MA_STATE(mas_free, free_areas, 0, ULONG_MAX); MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX); - struct mutex *mutex = &execmem_cache.mutex; unsigned long addr, last, area_size = 0; void *area, *ptr = NULL; int err; - mutex_lock(mutex); mas_for_each(&mas_free, area, ULONG_MAX) { area_size = mas_range_len(&mas_free); @@ -245,7 +236,7 @@ static void *__execmem_cache_alloc(struct execmem_range *range, size_t size) } if (area_size < size) - goto out_unlock; + 
return NULL; addr = mas_free.index; last = mas_free.last; @@ -254,7 +245,7 @@ static void *__execmem_cache_alloc(struct execmem_range *range, size_t size) mas_set_range(&mas_busy, addr, addr + size - 1); err = mas_store_gfp(&mas_busy, (void *)addr, GFP_KERNEL); if (err) - goto out_unlock; + return NULL; mas_store_gfp(&mas_free, NULL, GFP_KERNEL); if (area_size > size) { @@ -268,19 +259,25 @@ static void *__execmem_cache_alloc(struct execmem_range *range, size_t size) err = mas_store_gfp(&mas_free, ptr, GFP_KERNEL); if (err) { mas_store_gfp(&mas_busy, NULL, GFP_KERNEL); - goto out_unlock; + return NULL; } } ptr = (void *)addr; -out_unlock: - mutex_unlock(mutex); return ptr; } -static int execmem_cache_populate(struct execmem_range *range, size_t size) +static void *__execmem_cache_alloc(struct execmem_range *range, size_t size) +{ + guard(mutex)(&execmem_cache.mutex); + + return execmem_cache_alloc_locked(range, size); +} + +static void *execmem_cache_populate_alloc(struct execmem_range *range, size_t size) { unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; + struct mutex *mutex = &execmem_cache.mutex; struct vm_struct *vm; size_t alloc_size; int err = -ENOMEM; @@ -294,7 +291,7 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) } if (!p) - return err; + return NULL; vm = find_vm_area(p); if (!vm) @@ -307,33 +304,39 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) if (err) goto err_free_mem; - err = execmem_cache_add(p, alloc_size, GFP_KERNEL); + /* + * New memory blocks must be allocated and added to the cache + * as an atomic operation, otherwise they may be consumed + * by a parallel call to the execmem_cache_alloc function. 
+ */ + mutex_lock(mutex); + err = execmem_cache_add_locked(p, alloc_size, GFP_KERNEL); if (err) goto err_reset_direct_map; - return 0; + p = execmem_cache_alloc_locked(range, size); + + mutex_unlock(mutex); + + return p; err_reset_direct_map: + mutex_unlock(mutex); execmem_set_direct_map_valid(vm, true); err_free_mem: vfree(p); - return err; + return NULL; } static void *execmem_cache_alloc(struct execmem_range *range, size_t size) { void *p; - int err; p = __execmem_cache_alloc(range, size); if (p) return p; - err = execmem_cache_populate(range, size); - if (err) - return NULL; - - return __execmem_cache_alloc(range, size); + return execmem_cache_populate_alloc(range, size); } static inline bool is_pending_free(void *ptr) From 4fb61d95ad21c3b6f1c09f357ff49d70abb0535e Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Sat, 21 Mar 2026 22:29:11 +0900 Subject: [PATCH 284/369] mm/zsmalloc: copy KMSAN metadata in zs_page_migrate() zs_page_migrate() uses copy_page() to copy the contents of a zspage page during migration. However, copy_page() is not instrumented by KMSAN, so the shadow and origin metadata of the destination page are not updated. As a result, subsequent accesses to the migrated page are reported as use-after-free by KMSAN, despite the data being correctly copied. Add a kmsan_copy_page_meta() call after copy_page() to propagate the KMSAN metadata to the new page, matching what copy_highpage() does internally. 
Link: https://lkml.kernel.org/r/20260321132912.93434-1-syoshida@redhat.com Fixes: afb2d666d025 ("zsmalloc: use copy_page for full page copy") Signed-off-by: Shigeru Yoshida Reviewed-by: Sergey Senozhatsky Cc: Mark-PK Tsai Cc: Minchan Kim Cc: Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index e7417ece1c12..63128ddb7959 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1753,6 +1753,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, */ d_addr = kmap_local_zpdesc(newzpdesc); copy_page(d_addr, s_addr); + kmsan_copy_page_meta(zpdesc_page(newzpdesc), zpdesc_page(zpdesc)); kunmap_local(d_addr); for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; From c4a9439a5a372c6c0eb7cd2bc9dbb2494699e98d Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Sat, 21 Mar 2026 20:08:47 +0800 Subject: [PATCH 285/369] mm: mark early-init static variables with __meminitdata Static variables defined inside __meminit functions should also be marked with __meminitdata, so that their storage is placed in the .init.data section and reclaimed with free_initmem(), thereby reducing permanent .bss memory usage when CONFIG_MEMORY_HOTPLUG is disabled. 
Link: https://lkml.kernel.org/r/20260321120847.8159-1-pilgrimtao@gmail.com Signed-off-by: Kaitao Cheng Reviewed-by: Andrew Morton Cc: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mm_init.c | 2 +- mm/sparse-vmemmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index 4324b93ccebd..79f93f2a90cf 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -812,7 +812,7 @@ void __meminit reserve_bootmem_region(phys_addr_t start, static bool __meminit overlap_memmap_init(unsigned long zone, unsigned long *pfn) { - static struct memblock_region *r; + static struct memblock_region *r __meminitdata; if (mirrored_kernelcore && zone == ZONE_MOVABLE) { if (!r || *pfn >= memblock_region_memory_end_pfn(r)) { diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 24a37676cecb..6eadb9d116e4 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -62,7 +62,7 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) if (slab_is_available()) { gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; int order = get_order(size); - static bool warned; + static bool warned __meminitdata; struct page *page; page = alloc_pages_node(node, gfp_mask, order); From 3cb0dc0d0eab18d6ef738e10d5634e3a71121044 Mon Sep 17 00:00:00 2001 From: Kexin Sun Date: Sat, 21 Mar 2026 18:58:20 +0800 Subject: [PATCH 286/369] mm: vmalloc: update outdated comment for renamed vread() The function vread() was renamed to vread_iter() in commit 4c91c07c93bb ("mm: vmalloc: convert vread() to vread_iter()"), converting from a buffer-based to an iterator-based interface. Update the kdoc of vread_iter() to reflect the new interface: replace references to @buf with @iter, drop the stale "kernel's buffer" requirement, and update the self-reference from vread() to vread_iter(). Also update the stale vread() reference in pstore's ram_core.c. 
Assisted-by: unnamed:deepseek-v3.2 coccinelle Link: https://lkml.kernel.org/r/20260321105820.7134-1-kexinsun@smail.nju.edu.cn Signed-off-by: Kexin Sun Reviewed-by: Andrew Morton Cc: "Guilherme G. Piccoli" Cc: Julia Lawall Cc: Kees Cook Cc: Tony Luck Cc: "Uladzislau Rezki (Sony)" Signed-off-by: Andrew Morton --- fs/pstore/ram_core.c | 2 +- mm/vmalloc.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index ed97494abf60..738283a85ea2 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -450,7 +450,7 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size, pages[i] = pfn_to_page(addr >> PAGE_SHIFT); } /* - * VM_IOREMAP used here to bypass this region during vread() + * VM_IOREMAP used here to bypass this region during vread_iter() * and kmap_atomic() (i.e. kcore) to avoid __va() failures. */ vaddr = vmap(pages, page_count, VM_MAP | VM_IOREMAP, prot); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c607307c657a..b31b208f6ecb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4575,20 +4575,20 @@ finished: * @count: number of bytes to be read. * * This function checks that addr is a valid vmalloc'ed area, and - * copy data from that area to a given buffer. If the given memory range + * copies data from that area to a given iterator. If the given memory range * of [addr...addr+count) includes some valid address, data is copied to - * proper area of @buf. If there are memory holes, they'll be zero-filled. + * proper area of @iter. If there are memory holes, they'll be zero-filled. * IOREMAP area is treated as memory hole and no copy is done. * * If [addr...addr+count) doesn't includes any intersects with alive - * vm_struct area, returns 0. @buf should be kernel's buffer. + * vm_struct area, returns 0. 
* - * Note: In usual ops, vread() is never necessary because the caller + * Note: In usual ops, vread_iter() is never necessary because the caller * should know vmalloc() area is valid and can use memcpy(). * This is for routines which have to access vmalloc area without * any information, as /proc/kcore. * - * Return: number of bytes for which addr and buf should be increased + * Return: number of bytes for which addr and iter should be advanced * (same number as @count) or %0 if [addr...addr+count) doesn't * include any intersection with valid vmalloc area */ From f2a48f8fb56466488f7525e3349d4e2f45dc5d74 Mon Sep 17 00:00:00 2001 From: Kexin Sun Date: Sat, 21 Mar 2026 18:58:14 +0800 Subject: [PATCH 287/369] mm: update outdated comments for removed scan_swap_map_slots() The function scan_swap_map_slots() was removed in commit 0ff67f990bd4 ("mm, swap: remove swap slot cache"). The three comments referencing it simply noted that ->flags can be updated non-atomically by scan_swap_map_slots() to justify a data_race() annotation. Since the function no longer exists, drop the parenthetical reference while keeping the data_race() justification intact: ->flags can still be updated non-atomically by other paths (e.g., swapoff clearing SWP_WRITEOK). 
Assisted-by: unnamed:deepseek-v3.2 coccinelle Link: https://lkml.kernel.org/r/20260321105814.7053-1-kexinsun@smail.nju.edu.cn Signed-off-by: Kexin Sun Acked-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Axel Rasmussen Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Johannes Weiner Cc: Julia Lawall Cc: Kairui Song Cc: Kemeng Shi Cc: Michal Hocko Cc: Nhat Pham Cc: Qi Zheng Cc: Shakeel Butt Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/page_io.c | 4 ++-- mm/vmscan.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index a2c034660c80..330abc5ab7b4 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -450,14 +450,14 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug) VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); /* - * ->flags can be updated non-atomically (scan_swap_map_slots), + * ->flags can be updated non-atomically, * but that will never affect SWP_FS_OPS, so the data_race * is safe. */ if (data_race(sis->flags & SWP_FS_OPS)) swap_writepage_fs(folio, swap_plug); /* - * ->flags can be updated non-atomically (scan_swap_map_slots), + * ->flags can be updated non-atomically, * but that will never affect SWP_SYNCHRONOUS_IO, so the data_race * is safe. */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 42f834c508bc..4bf091b1c8af 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1065,7 +1065,7 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) /* * We can "enter_fs" for swap-cache with only __GFP_IO * providing this isn't SWP_FS_OPS. - * ->flags can be updated non-atomically (scan_swap_map_slots), + * ->flags can be updated non-atomically, * but that will never affect SWP_FS_OPS, so the data_race * is safe. 
*/ From a62ca3f40feaaaf0dfc4db1f2edeca5a70f4123d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 21 Mar 2026 14:42:49 +0800 Subject: [PATCH 288/369] mm: change to return bool for ptep_test_and_clear_young() Patch series "change young flag check functions to return bool", v2. This is a cleanup patchset to change all young flag check functions to return bool, as discussed with David in the previous thread[1]. Since callers only care about whether the young flag was set, returning bool makes the intention clearer. No functional changes intended. This patch (of 6): Callers use ptep_test_and_clear_young() to clear the young flag and check whether it was set. Change the return type to bool to make the intention clearer. Link: https://lkml.kernel.org/r/cover.1774075004.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/57e70efa9703d43959aa645246ea3cbdba14fa17.1774075004.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 16 +++++++--------- arch/arm64/mm/contpte.c | 7 +++---- arch/microblaze/include/asm/pgtable.h | 2 +- arch/parisc/include/asm/pgtable.h | 7 ++++--- arch/powerpc/include/asm/book3s/32/pgtable.h | 4 ++-- arch/powerpc/include/asm/book3s/64/pgtable.h | 6 +++--- arch/powerpc/include/asm/nohash/pgtable.h | 4 ++-- arch/riscv/include/asm/pgtable.h | 4 ++-- arch/riscv/mm/pgtable.c | 7 +++---- arch/s390/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable.h | 4 ++-- arch/x86/mm/pgtable.c | 6 +++--- arch/xtensa/include/asm/pgtable.h | 9 ++++----- include/linux/pgtable.h | 16 ++++++++-------- 14 files changed, 46 insertions(+), 50 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index ab451d20e4c5..79596cc05dcb 100644 --- 
a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1282,9 +1282,8 @@ static inline void __pte_clear(struct mm_struct *mm, __set_pte(ptep, __pte(0)); } -static inline int __ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pte_t *ptep) +static inline bool __ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { pte_t old_pte, pte; @@ -1646,7 +1645,7 @@ extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full); -int contpte_test_and_clear_young_ptes(struct vm_area_struct *vma, +bool contpte_test_and_clear_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr); int contpte_clear_flush_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr); @@ -1813,9 +1812,8 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, } #define test_and_clear_young_ptes test_and_clear_young_ptes -static inline int test_and_clear_young_ptes(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - unsigned int nr) +static inline bool test_and_clear_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) { if (likely(nr == 1 && !pte_cont(__ptep_get(ptep)))) return __ptep_test_and_clear_young(vma, addr, ptep); @@ -1824,8 +1822,8 @@ static inline int test_and_clear_young_ptes(struct vm_area_struct *vma, } #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { return test_and_clear_young_ptes(vma, addr, ptep, 1); } diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 1519d090d5ea..a31cae78f712 100644 --- 
a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -508,9 +508,8 @@ pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes); -int contpte_test_and_clear_young_ptes(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - unsigned int nr) +bool contpte_test_and_clear_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) { /* * ptep_clear_flush_young() technically requires us to clear the access @@ -525,7 +524,7 @@ int contpte_test_and_clear_young_ptes(struct vm_area_struct *vma, */ unsigned long end = addr + nr * PAGE_SIZE; - int young = 0; + bool young = false; ptep = contpte_align_addr_ptep(&addr, &end, ptep, nr); for (; addr != end; ptep++, addr += PAGE_SIZE) diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index ea72291de553..7678c040a2fd 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -318,7 +318,7 @@ static inline void set_pte(pte_t *ptep, pte_t pte) #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG struct vm_area_struct; -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, +static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { return (pte_update(ptep, _PAGE_ACCESSED, 0) & _PAGE_ACCESSED) != 0; diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index f6fb99cb94d9..7097c785f690 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -438,16 +438,17 @@ static inline pte_t ptep_get(pte_t *ptep) } #define ptep_get ptep_get -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) +static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { pte_t pte; pte = ptep_get(ptep); if (!pte_young(pte)) { - return 0; + return false; } 
set_pte_at(vma->vm_mm, addr, ptep, pte_mkold(pte)); - return 1; + return true; } int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 001e28f9eabc..4a271318dee8 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -295,8 +295,8 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p * for our hash-based implementation, we fix that up here. */ #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int __ptep_test_and_clear_young(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) +static inline bool __ptep_test_and_clear_young(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) { unsigned long old; old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 1a91762b455d..c049a2e26e25 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -349,13 +349,13 @@ static inline unsigned long pte_update(struct mm_struct *mm, unsigned long addr, * For radix: H_PAGE_HASHPTE should be zero. Hence we can use the same * function for both hash and radix. 
*/ -static inline int __ptep_test_and_clear_young(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) +static inline bool __ptep_test_and_clear_young(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) { unsigned long old; if ((pte_raw(*ptep) & cpu_to_be64(_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) - return 0; + return false; old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); return (old & _PAGE_ACCESSED) != 0; } diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index e6da5eaccff6..3a6f20a1c800 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -101,8 +101,8 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p } #endif -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { unsigned long old; diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index ab4ce1cc9d9c..643d12481b02 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -659,8 +659,8 @@ static inline void pte_clear(struct mm_struct *mm, extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG /* defined in mm/pgtable.c */ -extern int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, - pte_t *ptep); +bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep); #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c index b1ed2f14dc3a..9c4427d0b187 100644 --- a/arch/riscv/mm/pgtable.c +++ b/arch/riscv/mm/pgtable.c @@ -29,12 +29,11 @@ int ptep_set_access_flags(struct 
vm_area_struct *vma, return true; } -int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pte_t *ptep) +bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { if (!pte_young(ptep_get(ptep))) - return 0; + return false; return test_and_clear_bit(_PAGE_ACCESSED_OFFSET, &pte_val(*ptep)); } EXPORT_SYMBOL_GPL(ptep_test_and_clear_young); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 1c3c3be93be9..ef4748ee3a2b 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1164,8 +1164,8 @@ pte_t ptep_xchg_direct(struct mm_struct *, unsigned long, pte_t *, pte_t); pte_t ptep_xchg_lazy(struct mm_struct *, unsigned long, pte_t *, pte_t); #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 54289f4587a4..1d86fb33239f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1232,8 +1232,8 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma, pte_t entry, int dirty); #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -extern int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep); +bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH extern int ptep_clear_flush_young(struct vm_area_struct *vma, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 2e5ecfdce73c..5ee38dda9124 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -443,10 +443,10 @@ int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, } #endif -int 
ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { - int ret = 0; + bool ret = false; if (pte_young(*ptep)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 61f07d981a94..f00a879dc298 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -304,15 +304,14 @@ set_pmd(pmd_t *pmdp, pmd_t pmdval) struct vm_area_struct; -static inline int -ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep) +static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; if (!pte_young(pte)) - return 0; + return false; update_pte(ptep, pte_mkold(pte)); - return 1; + return true; } static inline pte_t diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 17d961c612fc..8e75dc9f7932 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -491,17 +491,17 @@ static inline pgd_t pgdp_get(pgd_t *pgdp) #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pte_t *ptep) +static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); - int r = 1; + bool young = true; + if (!pte_young(pte)) - r = 0; + young = false; else set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte)); - return r; + return young; } #endif @@ -1123,10 +1123,10 @@ static inline int clear_flush_young_ptes(struct vm_area_struct *vma, * * Returns: whether any PTE was young. 
*/ -static inline int test_and_clear_young_ptes(struct vm_area_struct *vma, +static inline bool test_and_clear_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { - int young = 0; + bool young = false; for (;;) { young |= ptep_test_and_clear_young(vma, addr, ptep); From 06c4dfa3ced61635895d0e258da8dc63da539f42 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 21 Mar 2026 14:42:50 +0800 Subject: [PATCH 289/369] mm: change to return bool for ptep_clear_flush_young()/clear_flush_young_ptes() The ptep_clear_flush_young() and clear_flush_young_ptes() are used to clear the young flag and flush the TLB, returning whether the young flag was set. Change the return type to bool to make the intention clearer. Link: https://lkml.kernel.org/r/24af5144b96103631594501f77d4525f2475c1be.1774075004.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 17 ++++++++--------- arch/arm64/mm/contpte.c | 7 +++---- arch/parisc/include/asm/pgtable.h | 2 +- arch/parisc/kernel/cache.c | 8 ++++---- arch/powerpc/include/asm/nohash/64/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 4 ++-- arch/s390/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable.h | 4 ++-- arch/x86/mm/pgtable.c | 4 ++-- include/linux/pgtable.h | 8 ++++---- mm/pgtable-generic.c | 7 ++++--- 11 files changed, 33 insertions(+), 34 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 79596cc05dcb..1009f719b157 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1298,10 +1298,10 @@ static inline bool __ptep_test_and_clear_young(struct vm_area_struct *vma, return pte_young(pte); } -static inline int __ptep_clear_flush_young(struct vm_area_struct 
*vma, - unsigned long address, pte_t *ptep) +static inline bool __ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { - int young = __ptep_test_and_clear_young(vma, address, ptep); + bool young = __ptep_test_and_clear_young(vma, address, ptep); if (young) { /* @@ -1647,7 +1647,7 @@ extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, unsigned int nr, int full); bool contpte_test_and_clear_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr); -int contpte_clear_flush_young_ptes(struct vm_area_struct *vma, +bool contpte_clear_flush_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr); extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr); @@ -1829,8 +1829,8 @@ static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, } #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -static inline int ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline bool ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { pte_t orig_pte = __ptep_get(ptep); @@ -1841,9 +1841,8 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, } #define clear_flush_young_ptes clear_flush_young_ptes -static inline int clear_flush_young_ptes(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - unsigned int nr) +static inline bool clear_flush_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) { if (likely(nr == 1 && !pte_cont(__ptep_get(ptep)))) return __ptep_clear_flush_young(vma, addr, ptep); diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index a31cae78f712..2dc1b8ad71e8 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -534,11 +534,10 @@ bool contpte_test_and_clear_young_ptes(struct vm_area_struct *vma, } 
EXPORT_SYMBOL_GPL(contpte_test_and_clear_young_ptes); -int contpte_clear_flush_young_ptes(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - unsigned int nr) +bool contpte_clear_flush_young_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) { - int young; + bool young; young = contpte_test_and_clear_young_ptes(vma, addr, ptep, nr); diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 7097c785f690..467b8547ac8b 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -451,7 +451,7 @@ static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, return true; } -int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +bool ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); struct mm_struct; diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index b189265785dc..0170b69a21d3 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -781,18 +781,18 @@ void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned lon __flush_cache_page(vma, vmaddr, PFN_PHYS(page_to_pfn(page))); } -int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep) +bool ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { pte_t pte = ptep_get(ptep); if (!pte_young(pte)) - return 0; + return false; set_pte(ptep, pte_mkold(pte)); #if CONFIG_FLUSH_PAGE_ACCESSED __flush_cache_page(vma, addr, PFN_PHYS(pte_pfn(pte))); #endif - return 1; + return true; } /* diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 2deb955b7bc8..661eb3820d12 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -155,7 
+155,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH #define ptep_clear_flush_young(__vma, __address, __ptep) \ ({ \ - int __young = ptep_test_and_clear_young(__vma, __address, __ptep);\ + bool __young = ptep_test_and_clear_young(__vma, __address, __ptep);\ __young; \ }) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 643d12481b02..b9dacfc280b1 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -695,8 +695,8 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, } #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -static inline int ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +static inline bool ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { /* * This comment is borrowed from x86, but applies equally to RISC-V: diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index ef4748ee3a2b..ac74b5076d8f 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1174,8 +1174,8 @@ static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, } #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -static inline int ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +static inline bool ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { return ptep_test_and_clear_young(vma, address, ptep); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1d86fb33239f..3993657e0a35 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1236,8 +1236,8 @@ bool ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -extern int ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, 
pte_t *ptep); +bool ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep); #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5ee38dda9124..1348384a3bb9 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -483,8 +483,8 @@ int pudp_test_and_clear_young(struct vm_area_struct *vma, } #endif -int ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +bool ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { /* * On x86 CPUs, clearing the accessed bit without a TLB flush diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 8e75dc9f7932..99450a3b0705 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -531,8 +531,8 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -int ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep); +bool ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH @@ -1086,10 +1086,10 @@ static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, * Context: The caller holds the page table lock. The PTEs map consecutive * pages that belong to the same folio. The PTEs are all in the same PMD. 
*/ -static inline int clear_flush_young_ptes(struct vm_area_struct *vma, +static inline bool clear_flush_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { - int young = 0; + bool young = false; for (;;) { young |= ptep_clear_flush_young(vma, addr, ptep); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index af7966169d69..db0ee918b08a 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -81,10 +81,11 @@ int ptep_set_access_flags(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -int ptep_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +bool ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) { - int young; + bool young; + young = ptep_test_and_clear_young(vma, address, ptep); if (young) flush_tlb_page(vma, address); From 42e26354c4ef28772398b1d71b7477834037305c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 21 Mar 2026 14:42:51 +0800 Subject: [PATCH 290/369] mm: change to return bool for pmdp_test_and_clear_young() Callers use pmdp_test_and_clear_young() to clear the young flag and check whether it was set for this PMD entry. Change the return type to bool to make the intention clearer. 
Link: https://lkml.kernel.org/r/f1d31307a13365d3d0fed5809727dcc2dd59631b.1774075004.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 5 ++--- arch/powerpc/include/asm/book3s/64/pgtable.h | 10 +++++----- arch/powerpc/mm/book3s64/pgtable.c | 4 ++-- arch/riscv/include/asm/pgtable.h | 4 ++-- arch/s390/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable.h | 4 ++-- arch/x86/mm/pgtable.c | 6 +++--- include/linux/pgtable.h | 19 +++++++++---------- 8 files changed, 27 insertions(+), 29 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 1009f719b157..52bafe79c10a 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1320,9 +1320,8 @@ static inline bool __ptep_clear_flush_young(struct vm_area_struct *vma, #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmdp) +static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { /* Operation applies to PMD table entry only if FEAT_HAFT is enabled */ VM_WARN_ON(pmd_table(READ_ONCE(*pmdp)) && !system_supports_haft()); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index c049a2e26e25..8b354e81ab22 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1161,13 +1161,13 @@ pud_hugepage_update(struct mm_struct *mm, unsigned long addr, pud_t *pudp, * For radix we should always find H_PAGE_HASHPTE zero. 
Hence * the below will work for radix too */ -static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) +static inline bool __pmdp_test_and_clear_young(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) { unsigned long old; if ((pmd_raw(*pmdp) & cpu_to_be64(_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) - return 0; + return false; old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0); return ((old & _PAGE_ACCESSED) != 0); } @@ -1300,8 +1300,8 @@ extern int pudp_set_access_flags(struct vm_area_struct *vma, pud_t entry, int dirty); #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); +bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); #define __HAVE_ARCH_PUDP_TEST_AND_CLEAR_YOUNG extern int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pud_t *pudp); diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 4b09c04654a8..c584321e3d41 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -98,8 +98,8 @@ int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, } -int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) +bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index b9dacfc280b1..67e7746e3fbe 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -1015,8 +1015,8 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, } #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) 
+static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { return ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); } diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index ac74b5076d8f..87a5082da28e 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1683,8 +1683,8 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, } #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) +static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) { pmd_t pmd = *pmdp; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3993657e0a35..ba867bac6096 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1295,8 +1295,8 @@ extern int pudp_set_access_flags(struct vm_area_struct *vma, pud_t entry, int dirty); #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp); +bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); extern int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 1348384a3bb9..b09e8c5dadf9 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -456,10 +456,10 @@ bool ptep_test_and_clear_young(struct vm_area_struct *vma, } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) -int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) +bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) { - int ret = 0; + bool ret = false; if (pmd_young(*pmdp)) ret = 
test_and_clear_bit(_PAGE_BIT_ACCESSED, diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 99450a3b0705..6db900a5d38b 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -507,25 +507,24 @@ static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma, #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) -static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmdp) +static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; - int r = 1; + bool young = true; + if (!pmd_young(pmd)) - r = 0; + young = false; else set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd)); - return r; + return young; } #else -static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmdp) +static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { BUILD_BUG(); - return 0; + return false; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ #endif From 2d46a397472191a10b0df294d64da542bfd1de57 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 21 Mar 2026 14:42:52 +0800 Subject: [PATCH 291/369] mm: change to return bool for pmdp_clear_flush_young() The pmdp_clear_flush_young() is used to clear the young flag and flush the TLB, returning whether the young flag was set for this PMD entry. Change the return type to bool to make the intention clearer. 
Link: https://lkml.kernel.org/r/a668b9a974c0d675e7a41f6973bcbe3336e8b373.1774075004.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Ritesh Harjani (IBM) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- arch/s390/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable.h | 4 ++-- arch/x86/mm/pgtable.c | 6 +++--- include/linux/pgtable.h | 10 +++++----- mm/pgtable-generic.c | 7 ++++--- 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 87a5082da28e..40a6fb19dd1d 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1693,8 +1693,8 @@ static inline bool pmdp_test_and_clear_young(struct vm_area_struct *vma, } #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH -static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) +static inline bool pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) { VM_BUG_ON(addr & ~HPAGE_MASK); return pmdp_test_and_clear_young(vma, addr, pmdp); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index ba867bac6096..6c8f2b17d3f9 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1301,8 +1301,8 @@ extern int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp); #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH -extern int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); +bool pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index b09e8c5dadf9..fc1c996c5b2d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -503,10 +503,10 @@ bool 
ptep_clear_flush_young(struct vm_area_struct *vma, } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) +bool pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { - int young; + bool young; VM_BUG_ON(address & ~HPAGE_PMD_MASK); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 6db900a5d38b..cdd68ed3ae1a 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -536,18 +536,18 @@ bool ptep_clear_flush_young(struct vm_area_struct *vma, #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE -extern int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); +bool pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); #else /* * Despite relevant to THP only, this API is called from generic rmap code * under PageTransHuge(), hence needs a dummy implementation for !THP */ -static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) +static inline bool pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { BUILD_BUG(); - return 0; + return false; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index db0ee918b08a..b91b1a98029c 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -124,10 +124,11 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH -int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) +bool pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { - int young; + bool young; + VM_BUG_ON(address & ~HPAGE_PMD_MASK); young = pmdp_test_and_clear_young(vma, address, pmdp); if (young) From fb87c88272973fa310d36dc60530fd6781bd1a55 Mon Sep 17 00:00:00 2001 
From: Baolin Wang Date: Sat, 21 Mar 2026 14:42:53 +0800 Subject: [PATCH 292/369] mm: change to return bool for pudp_test_and_clear_young() The pudp_test_and_clear_young() is used to clear the young flag, returning whether the young flag was set for this PUD entry. Change the return type to bool to make the intention clearer. Link: https://lkml.kernel.org/r/2c56fe52c1bf9404145274d7e91d4a65060f6c7c.1774075004.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/64/pgtable.h | 10 +++++----- arch/powerpc/mm/book3s64/pgtable.c | 4 ++-- arch/riscv/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable.h | 4 ++-- arch/x86/mm/pgtable.c | 6 +++--- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 8b354e81ab22..60e283cf22be 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1172,13 +1172,13 @@ static inline bool __pmdp_test_and_clear_young(struct mm_struct *mm, return ((old & _PAGE_ACCESSED) != 0); } -static inline int __pudp_test_and_clear_young(struct mm_struct *mm, - unsigned long addr, pud_t *pudp) +static inline bool __pudp_test_and_clear_young(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) { unsigned long old; if ((pud_raw(*pudp) & cpu_to_be64(_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) - return 0; + return false; old = pud_hugepage_update(mm, addr, pudp, _PAGE_ACCESSED, 0); return ((old & _PAGE_ACCESSED) != 0); } @@ -1303,8 +1303,8 @@ extern int pudp_set_access_flags(struct vm_area_struct *vma, bool pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #define __HAVE_ARCH_PUDP_TEST_AND_CLEAR_YOUNG 
-extern int pudp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, pud_t *pudp); +bool pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp); #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index c584321e3d41..ddc766e95855 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -104,8 +104,8 @@ bool pmdp_test_and_clear_young(struct vm_area_struct *vma, return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); } -int pudp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, pud_t *pudp) +bool pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp) { return __pudp_test_and_clear_young(vma->vm_mm, address, pudp); } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 67e7746e3fbe..a6e0eaba2653 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -1109,8 +1109,8 @@ static inline int pudp_set_access_flags(struct vm_area_struct *vma, return ptep_set_access_flags(vma, address, (pte_t *)pudp, pud_pte(entry), dirty); } -static inline int pudp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, pud_t *pudp) +static inline bool pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp) { return ptep_test_and_clear_young(vma, address, (pte_t *)pudp); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6c8f2b17d3f9..13e3e9a054cb 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1297,8 +1297,8 @@ extern int pudp_set_access_flags(struct vm_area_struct *vma, #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG bool pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp); -extern int pudp_test_and_clear_young(struct vm_area_struct 
*vma, - unsigned long addr, pud_t *pudp); +bool pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp); #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH bool pmdp_clear_flush_young(struct vm_area_struct *vma, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index fc1c996c5b2d..da7f0a03cf90 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -470,10 +470,10 @@ bool pmdp_test_and_clear_young(struct vm_area_struct *vma, #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE -int pudp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pud_t *pudp) +bool pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp) { - int ret = 0; + bool ret = false; if (pud_young(*pudp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, From 1fc7dc675e26c43f3219d70a09b9f0c4aa43a13a Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 21 Mar 2026 14:42:54 +0800 Subject: [PATCH 293/369] mm: change to return bool for the MMU notifier's young flag check The MMU notifier young flag check related functions only return whether the young flag was set. Change the return type to bool to make the intention clearer. 
Link: https://lkml.kernel.org/r/a9ad3fe938002d87358e7bfca264f753ab602561.1774075004.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Lorenzo Stoakes (Oracle) Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Ritesh Harjani (IBM) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 76 +++++++++++++++++------------------- mm/internal.h | 16 ++++---- mm/mmu_notifier.c | 23 +++++------ virt/kvm/kvm_main.c | 31 ++++++--------- 4 files changed, 66 insertions(+), 80 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 3705d350c863..17f2cdc77dd5 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -97,20 +97,20 @@ struct mmu_notifier_ops { * Start-end is necessary in case the secondary MMU is mapping the page * at a smaller granularity than the primary MMU. */ - int (*clear_flush_young)(struct mmu_notifier *subscription, - struct mm_struct *mm, - unsigned long start, - unsigned long end); + bool (*clear_flush_young)(struct mmu_notifier *subscription, + struct mm_struct *mm, + unsigned long start, + unsigned long end); /* * clear_young is a lightweight version of clear_flush_young. Like the * latter, it is supposed to test-and-clear the young/accessed bitflag * in the secondary pte, but it may omit flushing the secondary tlb. */ - int (*clear_young)(struct mmu_notifier *subscription, - struct mm_struct *mm, - unsigned long start, - unsigned long end); + bool (*clear_young)(struct mmu_notifier *subscription, + struct mm_struct *mm, + unsigned long start, + unsigned long end); /* * test_young is called to check the young/accessed bitflag in @@ -118,9 +118,9 @@ struct mmu_notifier_ops { * frequently used without actually clearing the flag or tearing * down the secondary mapping on the page. 
*/ - int (*test_young)(struct mmu_notifier *subscription, - struct mm_struct *mm, - unsigned long address); + bool (*test_young)(struct mmu_notifier *subscription, + struct mm_struct *mm, + unsigned long address); /* * invalidate_range_start() and invalidate_range_end() must be @@ -376,14 +376,12 @@ mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub, extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); -extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, - unsigned long start, - unsigned long end); -extern int __mmu_notifier_clear_young(struct mm_struct *mm, - unsigned long start, - unsigned long end); -extern int __mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address); +bool __mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long start, unsigned long end); +bool __mmu_notifier_clear_young(struct mm_struct *mm, + unsigned long start, unsigned long end); +bool __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address); extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r); extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, @@ -403,30 +401,28 @@ static inline void mmu_notifier_release(struct mm_struct *mm) __mmu_notifier_release(mm); } -static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, - unsigned long start, - unsigned long end) +static inline bool mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_flush_young(mm, start, end); - return 0; + return false; } -static inline int mmu_notifier_clear_young(struct mm_struct *mm, - unsigned long start, - unsigned long end) +static inline bool mmu_notifier_clear_young(struct mm_struct *mm, + unsigned long start, unsigned long 
end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_young(mm, start, end); - return 0; + return false; } -static inline int mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address) +static inline bool mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) { if (mm_has_notifiers(mm)) return __mmu_notifier_test_young(mm, address); - return 0; + return false; } static inline void @@ -552,24 +548,22 @@ static inline void mmu_notifier_release(struct mm_struct *mm) { } -static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, - unsigned long start, - unsigned long end) +static inline bool mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long start, unsigned long end) { - return 0; + return false; } -static inline int mmu_notifier_clear_young(struct mm_struct *mm, - unsigned long start, - unsigned long end) +static inline bool mmu_notifier_clear_young(struct mm_struct *mm, + unsigned long start, unsigned long end) { - return 0; + return false; } -static inline int mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address) +static inline bool mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) { - return 0; + return false; } static inline void diff --git a/mm/internal.h b/mm/internal.h index 9ae0ee6c34f9..3d3fa35e5fd1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1860,10 +1860,10 @@ static inline int io_remap_pfn_range_complete(struct vm_area_struct *vma, } #ifdef CONFIG_MMU_NOTIFIER -static inline int clear_flush_young_ptes_notify(struct vm_area_struct *vma, +static inline bool clear_flush_young_ptes_notify(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { - int young; + bool young; young = clear_flush_young_ptes(vma, addr, ptep, nr); young |= mmu_notifier_clear_flush_young(vma->vm_mm, addr, @@ -1871,30 +1871,30 @@ static inline int clear_flush_young_ptes_notify(struct vm_area_struct *vma, return young; } -static inline int 
pmdp_clear_flush_young_notify(struct vm_area_struct *vma, +static inline bool pmdp_clear_flush_young_notify(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - int young; + bool young; young = pmdp_clear_flush_young(vma, addr, pmdp); young |= mmu_notifier_clear_flush_young(vma->vm_mm, addr, addr + PMD_SIZE); return young; } -static inline int test_and_clear_young_ptes_notify(struct vm_area_struct *vma, +static inline bool test_and_clear_young_ptes_notify(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { - int young; + bool young; young = test_and_clear_young_ptes(vma, addr, ptep, nr); young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + nr * PAGE_SIZE); return young; } -static inline int pmdp_test_and_clear_young_notify(struct vm_area_struct *vma, +static inline bool pmdp_test_and_clear_young_notify(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - int young; + bool young; young = pmdp_test_and_clear_young(vma, addr, pmdp); young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PMD_SIZE); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 2502474b83b6..dc6f78d559f7 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -364,12 +364,12 @@ void __mmu_notifier_release(struct mm_struct *mm) * unmap the address and return 1 or 0 depending if the mapping previously * existed or not. 
*/ -int __mmu_notifier_clear_flush_young(struct mm_struct *mm, - unsigned long start, - unsigned long end) +bool __mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long start, unsigned long end) { struct mmu_notifier *subscription; - int young = 0, id; + bool young = false; + int id; id = srcu_read_lock(&srcu); hlist_for_each_entry_srcu(subscription, @@ -384,12 +384,12 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, return young; } -int __mmu_notifier_clear_young(struct mm_struct *mm, - unsigned long start, - unsigned long end) +bool __mmu_notifier_clear_young(struct mm_struct *mm, + unsigned long start, unsigned long end) { struct mmu_notifier *subscription; - int young = 0, id; + bool young = false; + int id; id = srcu_read_lock(&srcu); hlist_for_each_entry_srcu(subscription, @@ -404,11 +404,12 @@ int __mmu_notifier_clear_young(struct mm_struct *mm, return young; } -int __mmu_notifier_test_young(struct mm_struct *mm, - unsigned long address) +bool __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) { struct mmu_notifier *subscription; - int young = 0, id; + bool young = false; + int id; id = srcu_read_lock(&srcu); hlist_for_each_entry_srcu(subscription, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d0ab29672c71..82433f46c438 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -646,11 +646,9 @@ mmu_unlock: return r; } -static __always_inline int kvm_age_hva_range(struct mmu_notifier *mn, - unsigned long start, - unsigned long end, - gfn_handler_t handler, - bool flush_on_ret) +static __always_inline bool kvm_age_hva_range(struct mmu_notifier *mn, + unsigned long start, unsigned long end, gfn_handler_t handler, + bool flush_on_ret) { struct kvm *kvm = mmu_notifier_to_kvm(mn); const struct kvm_mmu_notifier_range range = { @@ -666,10 +664,8 @@ static __always_inline int kvm_age_hva_range(struct mmu_notifier *mn, return kvm_handle_hva_range(kvm, &range).ret; } -static __always_inline int 
kvm_age_hva_range_no_flush(struct mmu_notifier *mn, - unsigned long start, - unsigned long end, - gfn_handler_t handler) +static __always_inline bool kvm_age_hva_range_no_flush(struct mmu_notifier *mn, + unsigned long start, unsigned long end, gfn_handler_t handler) { return kvm_age_hva_range(mn, start, end, handler, false); } @@ -829,10 +825,8 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait); } -static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, - unsigned long end) +static bool kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, + struct mm_struct *mm, unsigned long start, unsigned long end) { trace_kvm_age_hva(start, end); @@ -840,10 +834,8 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, !IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG)); } -static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, - unsigned long end) +static bool kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, + struct mm_struct *mm, unsigned long start, unsigned long end) { trace_kvm_age_hva(start, end); @@ -863,9 +855,8 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, return kvm_age_hva_range_no_flush(mn, start, end, kvm_age_gfn); } -static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address) +static bool kvm_mmu_notifier_test_young(struct mmu_notifier *mn, + struct mm_struct *mm, unsigned long address) { trace_kvm_test_age_hva(address); From d885a076d7a74e03c6248fd3951fb9d43c4e7a82 Mon Sep 17 00:00:00 2001 From: Bing Jiao Date: Sat, 21 Mar 2026 03:34:13 +0000 Subject: [PATCH 294/369] mm/memcontrol: fix reclaim_options leak in try_charge_memcg() In try_charge_memcg(), the 'reclaim_options' variable is initialized once at the start of the function. 
However, the function contains a retry loop. If reclaim_options were modified during an iteration (e.g., by encountering a memsw limit), the modified state would persist into subsequent retries. This leads to incorrect reclaim behavior. Specifically, MEMCG_RECLAIM_MAY_SWAP is cleared when the combined memcg->memsw limit is reached. After reclamation attempts, a subsequent retry may successfully charge memcg->memsw but fail on the memcg->memory charge. In this case, swapping should be permitted, but the carried-over state prevents it. This issue was identified during code reading of try_charge_memcg() while analyzing memsw limit behavior in tiered-memory systems; no production failures have been reported yet. Fix by moving the initialization of 'reclaim_options' inside the retry loop, ensuring a clean state for every reclaim attempt. Link: https://lkml.kernel.org/r/20260321033500.2558070-1-bingjiao@google.com Fixes: 6539cc053869 ("mm: memcontrol: fold mem_cgroup_do_charge()") Signed-off-by: Bing Jiao Reviewed-by: Yosry Ahmed Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Axel Rasmussen Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: David Rientjes Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lorenzo Stoakes (Oracle) Cc: Muchun Song Cc: Nhat Pham Cc: Qi Zheng Cc: Roman Gushchin Cc: Shakeel Butt Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- mm/memcontrol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 47bf034d4b93..051b82ebf371 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2385,7 +2385,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, struct page_counter *counter; unsigned long nr_reclaimed; bool passed_oom = false; - unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP; + unsigned int reclaim_options; bool drained = false; bool raised_max_event = false; unsigned long pflags; @@ -2399,6 +2399,7 @@ retry: /* Avoid the refill
and flush of the older stock */ batch = nr_pages; + reclaim_options = MEMCG_RECLAIM_MAY_SWAP; if (!do_memsw_account() || page_counter_try_charge(&memcg->memsw, batch, &counter)) { if (page_counter_try_charge(&memcg->memory, batch, &counter)) From b480cbb071020bd590a0dc0166635b448e9fc46b Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 20 Mar 2026 17:34:25 +0000 Subject: [PATCH 295/369] mm/page_alloc: don't increase highatomic reserve after pcp alloc Higher order GFP_ATOMIC allocations can be served through a PCP list with ALLOC_HIGHATOMIC set. Such an allocation can e.g. happen if a zone is between the low and min watermarks, and get_page_from_freelist is retried after the alloc_flags are relaxed. The call to reserve_highatomic_pageblock() after such a PCP allocation will result in an increase every single time: the page from the (unmovable) PCP list will never have migrate type MIGRATE_HIGHATOMIC, since MIGRATE_HIGHATOMIC pages do not appear on the unmovable PCP list. So a new pageblock is converted to MIGRATE_HIGHATOMIC. Eventually that leads to the maximum of 1% of the zone being used up by (often mostly free) MIGRATE_HIGHATOMIC pageblocks, for no good reason. Since this space is not available for normal allocations, this wastes memory and will push things in to reclaim too soon. This was observed on a system that ran a test with bursts of memory activity, paired with GFP_ATOMIC SLUB activity. These would lead to a new slab being allocated with GFP_ATOMIC, sometimes hitting the get_page_from_freelist retry path by being below the low watermark. While the frequency of those allocations was low, it kept adding up over time, and the number of MIGRATE_ATOMIC pageblocks kept increasing. If a higher order atomic allocation can be served by the unmovable PCP list, there is probably no need yet to extend the reserves. 
So, move the check and possible extension of the highatomic reserves to the buddy case only, and do not refill the PCP list for ALLOC_HIGHATOMIC if it's empty. This way, the PCP list is tried for ALLOC_HIGHATOMIC for a fast atomic allocation. But it will immediately fall back to rmqueue_buddy() if it's empty. In rmqueue_buddy(), the MIGRATE_HIGHATOMIC buddy lists are tried first (as before), and the reserves are extended only if that fails. With this change, the test was stable. Highatomic reserves were built up, but to a normal level. No highatomic failures were seen. This is similar to the patch proposed in [1] by Zhiguo Jiang, but re-arranged a bit. Link: https://lkml.kernel.org/r/20260320173426.1831267-1-fvdl@google.com Link: https://lore.kernel.org/all/20231122013925.1507-1-justinjiang@vivo.com/ [1] Fixes: 44042b449872 ("mm/page_alloc: allow high-order pages to be stored on the per-cpu lists") Signed-off-by: Zhiguo Jiang Signed-off-by: Frank van der Linden Reviewed-by: Vlastimil Babka (SUSE) Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zhiguo Jiang Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cdde59e56a55..111b54df8a3c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -207,6 +207,8 @@ unsigned int pageblock_order __read_mostly; static void __free_pages_ok(struct page *page, unsigned int order, fpi_t fpi_flags); +static void reserve_highatomic_pageblock(struct page *page, int order, + struct zone *zone); /* * results with 256, 32 in the lowmem_reserve sysctl: @@ -3239,6 +3241,13 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, spin_unlock_irqrestore(&zone->lock, flags); } while (check_new_pages(page, order)); + /* + * If this is a high-order atomic allocation then check + * if the pageblock should be reserved for the future + */ + if 
(unlikely(alloc_flags & ALLOC_HIGHATOMIC)) + reserve_highatomic_pageblock(page, order, zone); + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, 1); @@ -3310,6 +3319,20 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, int batch = nr_pcp_alloc(pcp, zone, order); int alloced; + /* + * Don't refill the list for a higher order atomic + * allocation under memory pressure, as this would + * not build up any HIGHATOMIC reserves, which + * might be needed soon. + * + * Instead, direct it towards the reserves by + * returning NULL, which will make the caller fall + * back to rmqueue_buddy. This will try to use the + * reserves first and grow them if needed. + */ + if (alloc_flags & ALLOC_HIGHATOMIC) + return NULL; + alloced = rmqueue_bulk(zone, order, batch, list, migratetype, alloc_flags); @@ -3924,13 +3947,6 @@ try_this_zone: if (page) { prep_new_page(page, order, gfp_mask, alloc_flags); - /* - * If this is a high-order atomic allocation then check - * if the pageblock should be reserved for the future - */ - if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) - reserve_highatomic_pageblock(page, order, zone); - return page; } else { if (cond_accept_memory(zone, order, alloc_flags)) From 3f74e30d857bd546c32eba78739dc85dabc6aae1 Mon Sep 17 00:00:00 2001 From: Kexin Sun Date: Sat, 21 Mar 2026 18:57:04 +0800 Subject: [PATCH 296/369] drivers/base/memory: fix stale reference to memory_block_add_nid() The function memory_block_add_nid() was renamed to memory_block_add_nid_early() by commit 0a947c14e48c ("drivers/base: move memory_block_add_nid() into the caller"). Update the stale reference in add_memory_block(). 
Assisted-by: unnamed:deepseek-v3.2 coccinelle Link: https://lkml.kernel.org/r/20260321105704.6093-1-kexinsun@smail.nju.edu.cn Signed-off-by: Kexin Sun Reviewed-by: David Hildenbrand (Arm) Reviewed-by: Donet Tom Signed-off-by: Andrew Morton --- drivers/base/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 5380050b16b7..f806a683b767 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -815,7 +815,7 @@ static int add_memory_block(unsigned long block_id, int nid, unsigned long state /* * MEM_ONLINE at this point implies early memory. With NUMA, * we'll determine the zone when setting the node id via - * memory_block_add_nid(). Memory hotplug updated the zone + * memory_block_add_nid_early(). Memory hotplug updated the zone * manually when memory onlining/offlining succeeds. */ mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE); From 54fdcbfe1cbd1d8f06d0c57c8cc43ddcc1cd421c Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Mon, 23 Mar 2026 17:03:04 +0800 Subject: [PATCH 297/369] mm: remove unused page_is_file_lru() function The page_is_file_lru() wrapper function is no longer used. The kernel has moved to folio-based APIs, and all callers should use folio_is_file_lru() instead. Remove the obsolete page-based wrapper function.
Link: https://lkml.kernel.org/r/20260323090305.798057-1-ye.liu@linux.dev Signed-off-by: Ye Liu Acked-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 2aedcff6a2c1..7fc2ced00f8f 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -30,11 +30,6 @@ static inline int folio_is_file_lru(const struct folio *folio) return !folio_test_swapbacked(folio); } -static inline int page_is_file_lru(struct page *page) -{ - return folio_is_file_lru(page_folio(page)); -} - static __always_inline void __update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) From 224f1292615079d604651915a214f9e5ace9e41c Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 23 Mar 2026 12:37:17 -0400 Subject: [PATCH 298/369] selftests/mm: add folio_split() and filemap_get_entry() race test The added folio_split_race_test is a modified C port of the race condition test from [1]. The test creates shmem huge pages, where the main thread punches holes in the shmem to cause folio_split() in the kernel and a set of 16 threads reads the shmem to cause filemap_get_entry() in the kernel. filemap_get_entry() reads the folio and xarray split by folio_split() locklessly. The original test[2] is written in rust and uses memfd (shmem backed). This C port uses shmem directly and use a single process. Note: the initial rust to C conversion is done by Cursor. 
Link: https://lore.kernel.org/all/CAKNNEtw5_kZomhkugedKMPOG-sxs5Q5OLumWJdiWXv+C9Yct0w@mail.gmail.com/ [1] Link: https://github.com/dfinity/thp-madv-remove-test [2] Link: https://lkml.kernel.org/r/20260323163717.184107-1-ziy@nvidia.com Co-developed-by: Bas van Dijk Signed-off-by: Bas van Dijk Co-developed-by: Adam Bratschi-Kaye Signed-off-by: Adam Bratschi-Kaye Signed-off-by: Zi Yan Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Hugh Dickins Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 1 + tools/testing/selftests/mm/Makefile | 1 + .../selftests/mm/folio_split_race_test.c | 297 ++++++++++++++++++ tools/testing/selftests/mm/run_vmtests.sh | 2 + 4 files changed, 301 insertions(+) create mode 100644 tools/testing/selftests/mm/folio_split_race_test.c diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index 83ad9454dd9d..b0c30c5ee9e3 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -61,3 +61,4 @@ guard-regions merge prctl_thp_disable rmap +folio_split_race_test diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 7a5de4e9bf52..cd24596cdd27 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -105,6 +105,7 @@ TEST_GEN_FILES += droppable TEST_GEN_FILES += guard-regions TEST_GEN_FILES += merge TEST_GEN_FILES += rmap +TEST_GEN_FILES += folio_split_race_test ifneq ($(ARCH),arm64) TEST_GEN_FILES += soft-dirty diff --git a/tools/testing/selftests/mm/folio_split_race_test.c b/tools/testing/selftests/mm/folio_split_race_test.c new file mode 100644 index 000000000000..ff026f183ac7 --- /dev/null +++ b/tools/testing/selftests/mm/folio_split_race_test.c @@ -0,0 
+1,297 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The test creates shmem PMD huge pages, fills all pages with known patterns, + * then continuously verifies non-punched pages with 16 threads. Meanwhile, the + * main thread punches holes via MADV_REMOVE on the shmem. + * + * It tests the race condition between folio_split() and filemap_get_entry(), + * where the hole punches on shmem lead to folio_split() and reading the shmem + * lead to filemap_get_entry(). + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "vm_util.h" +#include "kselftest.h" +#include "thp_settings.h" + +uint64_t page_size; +uint64_t pmd_pagesize; +#define NR_PMD_PAGE 5 +#define FILE_SIZE (pmd_pagesize * NR_PMD_PAGE) +#define TOTAL_PAGES (FILE_SIZE / page_size) + +/* Every N-th to N+M-th pages are punched; not aligned with huge page boundaries. */ +#define PUNCH_INTERVAL 50 /* N */ +#define PUNCH_SIZE_FACTOR 3 /* M */ + +#define NUM_READER_THREADS 16 +#define FILL_BYTE 0xAF +#define NUM_ITERATIONS 100 + +/* Shared control block: control reading threads and record stats */ +struct shared_ctl { + atomic_uint_fast32_t stop; + atomic_uint_fast64_t reader_failures; + atomic_uint_fast64_t reader_verified; + pthread_barrier_t barrier; +}; + +static void fill_page(unsigned char *base, size_t page_idx) +{ + unsigned char *page_ptr = base + page_idx * page_size; + uint64_t idx = (uint64_t)page_idx; + + memset(page_ptr, FILL_BYTE, page_size); + memcpy(page_ptr, &idx, sizeof(idx)); +} + +/* Returns true if valid, false if corrupted. 
*/ +static bool check_page(unsigned char *base, uint64_t page_idx) +{ + unsigned char *page_ptr = base + page_idx * page_size; + uint64_t expected_idx = (uint64_t)page_idx; + uint64_t got_idx; + + memcpy(&got_idx, page_ptr, 8); + + if (got_idx != expected_idx) { + uint64_t off; + int all_zero = 1; + + for (off = 0; off < page_size; off++) { + if (page_ptr[off] != 0) { + all_zero = 0; + break; + } + } + if (all_zero) { + ksft_print_msg("CORRUPTED: page %" PRIu64 + " (huge page %" PRIu64 + ") is ALL ZEROS\n", + page_idx, + (page_idx * page_size) / pmd_pagesize); + } else { + ksft_print_msg("CORRUPTED: page %" PRIu64 + " (huge page %" PRIu64 + "): expected idx %" PRIu64 + ", got %" PRIu64 "\n", + page_idx, + (page_idx * page_size) / pmd_pagesize, + page_idx, got_idx); + } + return false; + } + return true; +} + +struct reader_arg { + unsigned char *base; + struct shared_ctl *ctl; + int tid; + atomic_uint_fast64_t *failures; + atomic_uint_fast64_t *verified; +}; + +static void *reader_thread(void *arg) +{ + struct reader_arg *ra = (struct reader_arg *)arg; + unsigned char *base = ra->base; + struct shared_ctl *ctl = ra->ctl; + int tid = ra->tid; + atomic_uint_fast64_t *failures = ra->failures; + atomic_uint_fast64_t *verified = ra->verified; + uint64_t page_idx; + + pthread_barrier_wait(&ctl->barrier); + + while (atomic_load_explicit(&ctl->stop, memory_order_acquire) == 0) { + for (page_idx = (size_t)tid; page_idx < TOTAL_PAGES; + page_idx += NUM_READER_THREADS) { + /* + * page_idx % PUNCH_INTERVAL is in [0, PUNCH_INTERVAL), + * skip [0, PUNCH_SIZE_FACTOR) + */ + if (page_idx % PUNCH_INTERVAL < PUNCH_SIZE_FACTOR) + continue; + if (check_page(base, page_idx)) + atomic_fetch_add_explicit(verified, 1, + memory_order_relaxed); + else + atomic_fetch_add_explicit(failures, 1, + memory_order_relaxed); + } + if (atomic_load_explicit(failures, memory_order_relaxed) > 0) + break; + } + + return NULL; +} + +static void create_readers(pthread_t *threads, struct reader_arg *args, + 
unsigned char *base, struct shared_ctl *ctl) +{ + int i; + + for (i = 0; i < NUM_READER_THREADS; i++) { + args[i].base = base; + args[i].ctl = ctl; + args[i].tid = i; + args[i].failures = &ctl->reader_failures; + args[i].verified = &ctl->reader_verified; + if (pthread_create(&threads[i], NULL, reader_thread, + &args[i]) != 0) + ksft_exit_fail_msg("pthread_create failed\n"); + } +} + +/* Run a single iteration. Returns total number of corrupted pages. */ +static uint64_t run_iteration(void) +{ + uint64_t reader_failures, reader_verified; + struct reader_arg args[NUM_READER_THREADS]; + pthread_t threads[NUM_READER_THREADS]; + unsigned char *mmap_base; + struct shared_ctl ctl; + uint64_t i; + + memset(&ctl, 0, sizeof(struct shared_ctl)); + + mmap_base = mmap(NULL, FILE_SIZE, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + + if (mmap_base == MAP_FAILED) + ksft_exit_fail_msg("mmap failed: %d\n", errno); + + if (madvise(mmap_base, FILE_SIZE, MADV_HUGEPAGE) != 0) + ksft_exit_fail_msg("madvise(MADV_HUGEPAGE) failed: %d\n", + errno); + + for (i = 0; i < TOTAL_PAGES; i++) + fill_page(mmap_base, i); + + if (!check_huge_shmem(mmap_base, NR_PMD_PAGE, pmd_pagesize)) + ksft_exit_fail_msg("No shmem THP is allocated\n"); + + if (pthread_barrier_init(&ctl.barrier, NULL, NUM_READER_THREADS + 1) != 0) + ksft_exit_fail_msg("pthread_barrier_init failed\n"); + + create_readers(threads, args, mmap_base, &ctl); + + /* Wait for all reader threads to be ready before punching holes. 
*/ + pthread_barrier_wait(&ctl.barrier); + + for (i = 0; i < TOTAL_PAGES; i++) { + if (i % PUNCH_INTERVAL != 0) + continue; + if (madvise(mmap_base + i * page_size, + PUNCH_SIZE_FACTOR * page_size, MADV_REMOVE) != 0) { + ksft_exit_fail_msg( + "madvise(MADV_REMOVE) failed on page %" PRIu64 ": %d\n", + i, errno); + } + + i += PUNCH_SIZE_FACTOR - 1; + } + + atomic_store_explicit(&ctl.stop, 1, memory_order_release); + + for (i = 0; i < NUM_READER_THREADS; i++) + pthread_join(threads[i], NULL); + + pthread_barrier_destroy(&ctl.barrier); + + reader_failures = atomic_load_explicit(&ctl.reader_failures, + memory_order_acquire); + reader_verified = atomic_load_explicit(&ctl.reader_verified, + memory_order_acquire); + if (reader_failures) + ksft_print_msg("Child: %" PRIu64 " pages verified, %" PRIu64 " failures\n", + reader_verified, reader_failures); + + munmap(mmap_base, FILE_SIZE); + + return reader_failures; +} + +static void thp_cleanup_handler(int signum) +{ + thp_restore_settings(); + /* + * Restore default handler and re-raise the signal to exit. + * This is to ensure the test process exits with the correct + * status code corresponding to the signal. 
+ */ + signal(signum, SIG_DFL); + raise(signum); +} + +static void thp_settings_cleanup(void) +{ + thp_restore_settings(); +} + +int main(void) +{ + struct thp_settings current_settings; + uint64_t corrupted_pages; + uint64_t iter; + + ksft_print_header(); + + page_size = getpagesize(); + pmd_pagesize = read_pmd_pagesize(); + + if (!thp_available() || !pmd_pagesize) + ksft_exit_skip("Transparent Hugepages not available\n"); + + if (geteuid() != 0) + ksft_exit_skip("Please run the test as root\n"); + + thp_save_settings(); + /* make sure thp settings are restored */ + if (atexit(thp_settings_cleanup) != 0) + ksft_exit_fail_msg("atexit failed\n"); + + signal(SIGINT, thp_cleanup_handler); + signal(SIGTERM, thp_cleanup_handler); + + thp_read_settings(¤t_settings); + current_settings.shmem_enabled = SHMEM_ADVISE; + thp_write_settings(¤t_settings); + + ksft_set_plan(1); + + ksft_print_msg("folio split race test\n"); + + for (iter = 0; iter < NUM_ITERATIONS; iter++) { + corrupted_pages = run_iteration(); + if (corrupted_pages > 0) + break; + } + + if (iter < NUM_ITERATIONS) + ksft_test_result_fail("FAILED on iteration %" PRIu64 + ": %" PRIu64 + " pages corrupted by MADV_REMOVE!\n", + iter, corrupted_pages); + else + ksft_test_result_pass("All %d iterations passed\n", + NUM_ITERATIONS); + + ksft_exit(iter == NUM_ITERATIONS); + + return 0; +} diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index afdcfd0d7cef..d8468451b3a3 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -515,6 +515,8 @@ if [ -n "${MOUNTED_XFS}" ]; then rm -f ${XFS_IMG} fi +CATEGORY="thp" run_test ./folio_split_race_test + CATEGORY="migration" run_test ./migration CATEGORY="mkdirty" run_test ./mkdirty From 4ff07459db888054f68575646d7fe04f31f1e56d Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Thu, 19 Mar 2026 09:25:41 +0800 Subject: [PATCH 299/369] mm/huge_memory: fix folio isn't locked in 
softleaf_to_folio() On arm64 server, we found folio that get from migration entry isn't locked in softleaf_to_folio(). This issue triggers when mTHP splitting and zap_nonpresent_ptes() races, and the root cause is lack of memory barrier in softleaf_to_folio(). The race is as follows: CPU0 CPU1 deferred_split_scan() zap_nonpresent_ptes() lock folio split_folio() unmap_folio() change ptes to migration entries __split_folio_to_order() softleaf_to_folio() set flags(including PG_locked) for tail pages folio = pfn_folio(softleaf_to_pfn(entry)) smp_wmb() VM_WARN_ON_ONCE(!folio_test_locked(folio)) prep_compound_page() for tail pages In __split_folio_to_order(), smp_wmb() guarantees page flags of tail pages are visible before the tail page becomes non-compound. smp_wmb() should be paired with smp_rmb() in softleaf_to_folio(), which is missed. As a result, if zap_nonpresent_ptes() accesses migration entry that stores tail pfn, softleaf_to_folio() may see the updated compound_head of tail page before page->flags. This issue will trigger VM_WARN_ON_ONCE() in pfn_swap_entry_folio() because of the race between folio split and zap_nonpresent_ptes() leading to a folio incorrectly undergoing modification without a folio lock being held. This is a BUG_ON() before commit 93976a20345b ("mm: eliminate further swapops predicates"), which in merged in v6.19-rc1. To fix it, add missing smp_rmb() if the softleaf entry is migration entry in softleaf_to_folio() and softleaf_to_page(). 
[tujinjiang@huawei.com: update function name and comments] Link: https://lkml.kernel.org/r/20260321075214.3305564-1-tujinjiang@huawei.com Link: https://lkml.kernel.org/r/20260319012541.4158561-1-tujinjiang@huawei.com Fixes: e9b61f19858a ("thp: reintroduce split_huge_page()") Signed-off-by: Jinjiang Tu Acked-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Barry Song Cc: Kefeng Wang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nanyong Sun Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/leafops.h | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/include/linux/leafops.h b/include/linux/leafops.h index a9ff94b744f2..05673d3529e7 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -363,6 +363,23 @@ static inline unsigned long softleaf_to_pfn(softleaf_t entry) return swp_offset(entry) & SWP_PFN_MASK; } +static inline void softleaf_migration_sync(softleaf_t entry, + struct folio *folio) +{ + /* + * Ensure we do not race with split, which might alter tail pages into new + * folios and thus result in observing an unlocked folio. + * This matches the write barrier in __split_folio_to_order(). + */ + smp_rmb(); + + /* + * Any use of migration entries may only occur while the + * corresponding page is locked + */ + VM_WARN_ON_ONCE(!folio_test_locked(folio)); +} + /** * softleaf_to_page() - Obtains struct page for PFN encoded within leaf entry. * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. 
@@ -374,11 +391,8 @@ static inline struct page *softleaf_to_page(softleaf_t entry) struct page *page = pfn_to_page(softleaf_to_pfn(entry)); VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); - /* - * Any use of migration entries may only occur while the - * corresponding page is locked - */ - VM_WARN_ON_ONCE(softleaf_is_migration(entry) && !PageLocked(page)); + if (softleaf_is_migration(entry)) + softleaf_migration_sync(entry, page_folio(page)); return page; } @@ -394,12 +408,8 @@ static inline struct folio *softleaf_to_folio(softleaf_t entry) struct folio *folio = pfn_folio(softleaf_to_pfn(entry)); VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); - /* - * Any use of migration entries may only occur while the - * corresponding folio is locked. - */ - VM_WARN_ON_ONCE(softleaf_is_migration(entry) && - !folio_test_locked(folio)); + if (softleaf_is_migration(entry)) + softleaf_migration_sync(entry, folio); return folio; } From 9594f05e31e89c795f2161cf9c4aae29c3061f2b Mon Sep 17 00:00:00 2001 From: "Harry Yoo (Oracle)" Date: Fri, 20 Mar 2026 21:59:25 +0900 Subject: [PATCH 300/369] MAINTAINERS, mailmap: update email address for Harry Yoo Update my email address to harry@kernel.org. Link: https://lkml.kernel.org/r/20260320125925.2259998-1-harry@kernel.org Signed-off-by: Harry Yoo (Oracle) Signed-off-by: Andrew Morton --- .mailmap | 1 + MAINTAINERS | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index 7d14504daf24..2d04aeba68b4 100644 --- a/.mailmap +++ b/.mailmap @@ -316,6 +316,7 @@ Hans Verkuil Hans Verkuil Hao Ge Harry Yoo <42.hyeyoo@gmail.com> +Harry Yoo Heiko Carstens Heiko Carstens Heiko Stuebner diff --git a/MAINTAINERS b/MAINTAINERS index cf654eba46ee..76431aa5efbe 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16879,7 +16879,7 @@ M: Lorenzo Stoakes R: Rik van Riel R: Liam R. 
Howlett R: Vlastimil Babka -R: Harry Yoo +R: Harry Yoo R: Jann Horn L: linux-mm@kvack.org S: Maintained @@ -24348,7 +24348,7 @@ F: drivers/nvmem/layouts/sl28vpd.c SLAB ALLOCATOR M: Vlastimil Babka -M: Harry Yoo +M: Harry Yoo M: Andrew Morton R: Hao Li R: Christoph Lameter From 9acbe135588e25070e963c0f066019cbeeb30c07 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 20 Mar 2026 06:05:59 +0100 Subject: [PATCH 301/369] mm/swap: fix swap cache memcg accounting The swap readahead path was recently refactored and while doing this, the order between the charging of the folio in the memcg and the addition of the folio in the swap cache was inverted. Since the accounting of the folio is done while adding the folio to the swap cache and the folio is not charged in the memcg yet, the accounting is then done at the node level, which is wrong. Fix this by charging the folio in the memcg before adding it to the swap cache. Link: https://lkml.kernel.org/r/20260320050601.1833108-1-alex@ghiti.fr Fixes: 2732acda82c9 ("mm, swap: use swap cache as the swap in synchronize layer") Signed-off-by: Alexandre Ghiti Acked-by: Kairui Song Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Chris Li Cc: Alexandre Ghiti Cc: Baoquan He Cc: Barry Song Cc: Kemeng Shi Cc: Signed-off-by: Andrew Morton --- mm/swap_state.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 6313b59d7eab..1415a5c54a43 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -472,6 +472,10 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, __folio_set_locked(folio); __folio_set_swapbacked(folio); + + if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) + goto failed; + for (;;) { ret = swap_cache_add_folio(folio, entry, &shadow); if (!ret) @@ -492,11 +496,6 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, goto failed; } - if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, 
gfp, entry)) { - swap_cache_del_folio(folio); - goto failed; - } - memcg1_swapin(entry, folio_nr_pages(folio)); if (shadow) workingset_refault(folio, shadow); From eb1074ece72f3cb8874e995d94bb02245614487a Mon Sep 17 00:00:00 2001 From: Josh Law Date: Sat, 21 Mar 2026 10:54:24 -0700 Subject: [PATCH 302/369] mm/damon/sysfs: fix param_ctx leak on damon_sysfs_new_test_ctx() failure Patch series "mm/damon/sysfs: fix memory leak and NULL dereference issues", v4. DAMON_SYSFS can leak memory under allocation failure, and do NULL pointer dereference when a privileged user make wrong sequences of control. Fix those. This patch (of 3): When damon_sysfs_new_test_ctx() fails in damon_sysfs_commit_input(), param_ctx is leaked because the early return skips the cleanup at the out label. Destroy param_ctx before returning. Link: https://lkml.kernel.org/r/20260321175427.86000-1-sj@kernel.org Link: https://lkml.kernel.org/r/20260321175427.86000-2-sj@kernel.org Fixes: f0c5118ebb0e ("mm/damon/sysfs: catch commit test ctx alloc failure") Signed-off-by: Josh Law Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Cc: [6.18+] Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 576d1ddd736b..b573b9d60784 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1524,8 +1524,10 @@ static int damon_sysfs_commit_input(void *data) if (IS_ERR(param_ctx)) return PTR_ERR(param_ctx); test_ctx = damon_sysfs_new_test_ctx(kdamond->damon_ctx); - if (!test_ctx) + if (!test_ctx) { + damon_destroy_ctx(param_ctx); return -ENOMEM; + } err = damon_commit_ctx(test_ctx, param_ctx); if (err) goto out; From a12479ed43b386cba36817ed1d5bc70801258740 Mon Sep 17 00:00:00 2001 From: Josh Law Date: Sat, 21 Mar 2026 10:54:25 -0700 Subject: [PATCH 303/369] mm/damon/sysfs: check contexts->nr before accessing contexts_arr[0] Multiple sysfs command paths dereference contexts_arr[0] without first verifying that 
kdamond->contexts->nr == 1. A user can set nr_contexts to 0 via sysfs while DAMON is running, causing NULL pointer dereferences. In more detail, the issue can be triggered by privileged users like below. First, start DAMON and make contexts directory empty (kdamond->contexts->nr == 0). # damo start # cd /sys/kernel/mm/damon/admin/kdamonds/0 # echo 0 > contexts/nr_contexts Then, each of below commands will cause the NULL pointer dereference. # echo update_schemes_stats > state # echo update_schemes_tried_regions > state # echo update_schemes_tried_bytes > state # echo update_schemes_effective_quotas > state # echo update_tuned_intervals > state Guard all commands (except OFF) at the entry point of damon_sysfs_handle_cmd(). Link: https://lkml.kernel.org/r/20260321175427.86000-3-sj@kernel.org Fixes: 0ac32b8affb5 ("mm/damon/sysfs: support DAMOS stats") Signed-off-by: Josh Law Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Cc: [5.18+] Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index b573b9d60784..ddc30586c0e6 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1749,6 +1749,9 @@ static int damon_sysfs_update_schemes_tried_regions( static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd, struct damon_sysfs_kdamond *kdamond) { + if (cmd != DAMON_SYSFS_CMD_OFF && kdamond->contexts->nr != 1) + return -EINVAL; + switch (cmd) { case DAMON_SYSFS_CMD_ON: return damon_sysfs_turn_damon_on(kdamond); From d0bde8e2f3d2fb9aaec15d9c01da0a01526c7a56 Mon Sep 17 00:00:00 2001 From: Josh Law Date: Sat, 21 Mar 2026 10:54:26 -0700 Subject: [PATCH 304/369] mm/damon/sysfs: check contexts->nr in repeat_call_fn damon_sysfs_repeat_call_fn() calls damon_sysfs_upd_tuned_intervals(), damon_sysfs_upd_schemes_stats(), and damon_sysfs_upd_schemes_effective_quotas() without checking contexts->nr. 
If nr_contexts is set to 0 via sysfs while DAMON is running, these functions dereference contexts_arr[0] and cause a NULL pointer dereference. Add the missing check. For example, the issue can be reproduced using DAMON sysfs interface and DAMON user-space tool (damo) [1] like below. $ sudo damo start --refresh_interval 1s $ echo 0 | sudo tee \ /sys/kernel/mm/damon/admin/kdamonds/0/contexts/nr_contexts Link: https://patch.msgid.link/20260320163559.178101-3-objecting@objecting.org Link: https://lkml.kernel.org/r/20260321175427.86000-4-sj@kernel.org Link: https://github.com/damonitor/damo [1] Fixes: d809a7c64ba8 ("mm/damon/sysfs: implement refresh_ms file internal work") Signed-off-by: Josh Law Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Cc: [6.17+] Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index ddc30586c0e6..6a44a2f3d8fc 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1620,9 +1620,12 @@ static int damon_sysfs_repeat_call_fn(void *data) if (!mutex_trylock(&damon_sysfs_lock)) return 0; + if (sysfs_kdamond->contexts->nr != 1) + goto out; damon_sysfs_upd_tuned_intervals(sysfs_kdamond); damon_sysfs_upd_schemes_stats(sysfs_kdamond); damon_sysfs_upd_schemes_effective_quotas(sysfs_kdamond); +out: mutex_unlock(&damon_sysfs_lock); return 0; } From 26e7888a0c89e36332c1e897e4887f69e1e9c751 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 23 Mar 2026 21:20:18 +0100 Subject: [PATCH 305/369] mm/memory: fix PMD/PUD checks in follow_pfnmap_start() follow_pfnmap_start() suffers from two problems: (1) We are not re-fetching the pmd/pud after taking the PTL Therefore, we are not properly stabilizing what the lock actually protects. If there is concurrent zapping, we would indicate to the caller that we found an entry, however, that entry might already have been invalidated, or contain a different PFN after taking the lock. 
Properly use pmdp_get() / pudp_get() after taking the lock. (2) pmd_leaf() / pud_leaf() are not well defined on non-present entries pmd_leaf()/pud_leaf() could wrongly trigger on non-present entries. There is no real guarantee that pmd_leaf()/pud_leaf() returns something reasonable on non-present entries. Most architectures indeed either perform a present check or make it work by smart use of flags. However, for example loongarch checks the _PAGE_HUGE flag in pmd_leaf(), and always sets the _PAGE_HUGE flag in __swp_entry_to_pmd(). Whereas pmd_trans_huge() explicitly checks pmd_present(), pmd_leaf() does not do that. Let's check pmd_present()/pud_present() before assuming "this is a present PMD leaf" when spotting pmd_leaf()/pud_leaf(), like other page table handling code that traverses user page tables does. Given that non-present PMD entries are likely rare in VM_IO|VM_PFNMAP, (1) is likely more relevant than (2). It is questionable how often (1) would actually trigger, but let's CC stable to be sure. This was found by code inspection. 
Link: https://lkml.kernel.org/r/20260323-follow_pfnmap_fix-v1-1-5b0ec10872b3@kernel.org Fixes: 6da8e9634bb7 ("mm: new follow_pfnmap API") Signed-off-by: David Hildenbrand (Arm) Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Liam Howlett Cc: Michal Hocko Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/memory.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 6d54e5ec82f2..425e852a2eb7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6824,11 +6824,16 @@ retry: pudp = pud_offset(p4dp, address); pud = pudp_get(pudp); - if (pud_none(pud)) + if (!pud_present(pud)) goto out; if (pud_leaf(pud)) { lock = pud_lock(mm, pudp); - if (!unlikely(pud_leaf(pud))) { + pud = pudp_get(pudp); + + if (unlikely(!pud_present(pud))) { + spin_unlock(lock); + goto out; + } else if (unlikely(!pud_leaf(pud))) { spin_unlock(lock); goto retry; } @@ -6840,9 +6845,16 @@ retry: pmdp = pmd_offset(pudp, address); pmd = pmdp_get_lockless(pmdp); + if (!pmd_present(pmd)) + goto out; if (pmd_leaf(pmd)) { lock = pmd_lock(mm, pmdp); - if (!unlikely(pmd_leaf(pmd))) { + pmd = pmdp_get(pmdp); + + if (unlikely(!pmd_present(pmd))) { + spin_unlock(lock); + goto out; + } else if (unlikely(!pmd_leaf(pmd))) { spin_unlock(lock); goto retry; } From 9b25a6e3d243a8ce14eeaf74082c621a9944c776 Mon Sep 17 00:00:00 2001 From: Max Boone Date: Wed, 25 Mar 2026 10:59:16 +0100 Subject: [PATCH 306/369] mm/pagewalk: fix race between concurrent split and refault The splitting of a PUD entry in walk_pud_range() can race with a concurrent thread refaulting the PUD leaf entry causing it to try walking a PMD range that has disappeared. An example and reproduction of this is to try reading numa_maps of a process while VFIO-PCI is setting up DMA (specifically the vfio_pin_pages_remote call) on a large BAR for that process. 
This will trigger a kernel BUG: vfio-pci 0000:03:00.0: enabling device (0000 -> 0002) BUG: unable to handle page fault for address: ffffa23980000000 PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP NOPTI ... RIP: 0010:walk_pgd_range+0x3b5/0x7a0 Code: 8d 43 ff 48 89 44 24 28 4d 89 ce 4d 8d a7 00 00 20 00 48 8b 4c 24 28 49 81 e4 00 00 e0 ff 49 8d 44 24 ff 48 39 c8 4c 0f 43 e3 <49> f7 06 9f ff ff ff 75 3b 48 8b 44 24 20 48 8b 40 28 48 85 c0 74 RSP: 0018:ffffac23e1ecf808 EFLAGS: 00010287 RAX: 00007f44c01fffff RBX: 00007f4500000000 RCX: 00007f44ffffffff RDX: 0000000000000000 RSI: 000ffffffffff000 RDI: ffffffff93378fe0 RBP: ffffac23e1ecf918 R08: 0000000000000004 R09: ffffa23980000000 R10: 0000000000000020 R11: 0000000000000004 R12: 00007f44c0200000 R13: 00007f44c0000000 R14: ffffa23980000000 R15: 00007f44c0000000 FS: 00007fe884739580(0000) GS:ffff9b7d7a9c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffa23980000000 CR3: 000000c0650e2005 CR4: 0000000000770ef0 PKRU: 55555554 Call Trace: __walk_page_range+0x195/0x1b0 walk_page_vma+0x62/0xc0 show_numa_map+0x12b/0x3b0 seq_read_iter+0x297/0x440 seq_read+0x11d/0x140 vfs_read+0xc2/0x340 ksys_read+0x5f/0xe0 do_syscall_64+0x68/0x130 ? get_page_from_freelist+0x5c2/0x17e0 ? mas_store_prealloc+0x17e/0x360 ? vma_set_page_prot+0x4c/0xa0 ? __alloc_pages_noprof+0x14e/0x2d0 ? __mod_memcg_lruvec_state+0x8d/0x140 ? __lruvec_stat_mod_folio+0x76/0xb0 ? __folio_mod_stat+0x26/0x80 ? do_anonymous_page+0x705/0x900 ? __handle_mm_fault+0xa8d/0x1000 ? __count_memcg_events+0x53/0xf0 ? handle_mm_fault+0xa5/0x360 ? do_user_addr_fault+0x342/0x640 ? arch_exit_to_user_mode_prepare.constprop.0+0x16/0xa0 ? 
irqentry_exit_to_user_mode+0x24/0x100 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7fe88464f47e Code: c0 e9 b6 fe ff ff 50 48 8d 3d be 07 0b 00 e8 69 01 02 00 66 0f 1f 84 00 00 00 00 00 64 8b 04 25 18 00 00 00 85 c0 75 14 0f 05 <48> 3d 00 f0 ff ff 77 5a c3 66 0f 1f 84 00 00 00 00 00 48 83 ec 28 RSP: 002b:00007ffe6cd9a9b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007fe88464f47e RDX: 0000000000020000 RSI: 00007fe884543000 RDI: 0000000000000003 RBP: 00007fe884543000 R08: 00007fe884542010 R09: 0000000000000000 R10: fffffffffffffbc5 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 Fix this by validating the PUD entry in walk_pmd_range() using a stable snapshot (pudp_get()). If the PUD is not present or is a leaf, retry the walk via ACTION_AGAIN instead of descending further. This mirrors the retry logic in walk_pte_range(), which lets walk_pmd_range() retry if the PTE table cannot be obtained via pte_offset_map_lock(). 
Link: https://lkml.kernel.org/r/20260325-pagewalk-check-pmd-refault-v2-1-707bff33bc60@akamai.com Fixes: f9e54c3a2f5b ("vfio/pci: implement huge_fault support") Co-developed-by: David Hildenbrand (Arm) Signed-off-by: David Hildenbrand (Arm) Signed-off-by: Max Boone Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/pagewalk.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index cb358558807c..3ae2586ff45b 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -96,6 +96,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, struct mm_walk *walk) { + pud_t pudval = pudp_get(pud); pmd_t *pmd; unsigned long next; const struct mm_walk_ops *ops = walk->ops; @@ -104,6 +105,24 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, int err = 0; int depth = real_depth(3); + /* + * For PTE handling, pte_offset_map_lock() takes care of checking + * whether there actually is a page table. But it also has to be + * very careful about concurrent page table reclaim. + * + * Similarly, we have to be careful here - a PUD entry that points + * to a PMD table cannot go away, so we can just walk it. But if + * it's something else, we need to ensure we didn't race something, + * so need to retry. + * + * A pertinent example of this is a PUD refault after PUD split - + * we will need to split again or risk accessing invalid memory. + */ + if (!pud_present(pudval) || pud_leaf(pudval)) { + walk->action = ACTION_AGAIN; + return 0; + } + pmd = pmd_offset(pud, addr); do { again: @@ -217,12 +236,12 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, else if (pud_leaf(*pud) || !pud_present(*pud)) continue; /* Nothing to do. 
*/ - if (pud_none(*pud)) - goto again; - err = walk_pmd_range(pud, addr, next, walk); if (err) break; + + if (walk->action == ACTION_AGAIN) + goto again; } while (pud++, addr = next, addr != end); return err; From d2fd4225d8de3b9dccf46a7a021869c65152a044 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 23 Mar 2026 21:55:16 +0100 Subject: [PATCH 307/369] bug: avoid format attribute warning for clang as well Like gcc, clang-22 now also warns about a function that it incorrectly identifies as a printf-style format: lib/bug.c:190:22: error: diagnostic behavior may be improved by adding the 'format(printf, 1, 0)' attribute to the declaration of '__warn_printf' [-Werror,-Wmissing-format-attribute] 179 | static void __warn_printf(const char *fmt, struct pt_regs *regs) | __attribute__((format(printf, 1, 0))) 180 | { 181 | if (!fmt) 182 | return; 183 | 184 | #ifdef HAVE_ARCH_BUG_FORMAT_ARGS 185 | if (regs) { 186 | struct arch_va_list _args; 187 | va_list *args = __warn_args(&_args, regs); 188 | 189 | if (args) { 190 | vprintk(fmt, *args); | ^ Revert the change that added a gcc-specific workaround, and instead add the generic annotation that avoid the warning. 
Link: https://lkml.kernel.org/r/20260323205534.1284284-1-arnd@kernel.org Fixes: d36067d6ea00 ("bug: Hush suggest-attribute=format for __warn_printf()") Suggested-by: Andy Shevchenko Suggested-by: Brendan Jackman Link: https://lore.kernel.org/all/20251208141618.2805983-1-andriy.shevchenko@linux.intel.com/T/#u Signed-off-by: Arnd Bergmann Reviewed-by: Brendan Jackman Reviewed-by: Andy Shevchenko Cc: Bill Wendling Cc: Ingo Molnar Cc: Justin Stitt Cc: Nathan Chancellor Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- lib/bug.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lib/bug.c b/lib/bug.c index 623c467a8b76..aab9e6a40c5f 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -173,10 +173,8 @@ struct bug_entry *find_bug(unsigned long bugaddr) return module_find_bug(bugaddr); } -__diag_push(); -__diag_ignore(GCC, all, "-Wsuggest-attribute=format", - "Not a valid __printf() conversion candidate."); -static void __warn_printf(const char *fmt, struct pt_regs *regs) +static __printf(1, 0) +void __warn_printf(const char *fmt, struct pt_regs *regs) { if (!fmt) return; @@ -195,7 +193,6 @@ static void __warn_printf(const char *fmt, struct pt_regs *regs) printk("%s", fmt); } -__diag_pop(); static enum bug_trap_type __report_bug(struct bug_entry *bug, unsigned long bugaddr, struct pt_regs *regs) { From 5ac9c7c2efd0d3c0c2d3bc6e9cd900d3ab6af27a Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 27 Mar 2026 17:31:04 +0000 Subject: [PATCH 308/369] mm/mseal: update VMA end correctly on merge Previously we stored the end of the current VMA in curr_end, and then upon iterating to the next VMA updated curr_start to curr_end to advance to the next VMA. However, this doesn't take into account the fact that a VMA might be updated due to a merge by vma_modify_flags(), which can result in curr_end being stale and thus, upon setting curr_start to curr_end, ending up with an incorrect curr_start on the next iteration. 
Resolve the issue by setting curr_end to vma->vm_end unconditionally to ensure this value remains updated should this occur. While we're here, eliminate this entire class of bug by simply setting const curr_[start/end] to be clamped to the input range and VMAs, which also happens to simplify the logic. Link: https://lkml.kernel.org/r/20260327173104.322405-1-ljs@kernel.org Fixes: 6c2da14ae1e0 ("mm/mseal: rework mseal apply logic") Signed-off-by: Lorenzo Stoakes (Oracle) Reported-by: Antonius Closes: https://lore.kernel.org/linux-mm/CAK8a0jwWGj9-SgFk0yKFh7i8jMkwKm5b0ao9=kmXWjO54veX2g@mail.gmail.com/ Suggested-by: David Hildenbrand (ARM) Acked-by: Vlastimil Babka (SUSE) Reviewed-by: Pedro Falcato Acked-by: David Hildenbrand (Arm) Cc: Jann Horn Cc: Jeff Xu Cc: Liam Howlett Cc: Signed-off-by: Andrew Morton --- mm/mseal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/mseal.c b/mm/mseal.c index 316b5e1dec78..ac58643181f7 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -56,7 +56,6 @@ static int mseal_apply(struct mm_struct *mm, unsigned long start, unsigned long end) { struct vm_area_struct *vma, *prev; - unsigned long curr_start = start; VMA_ITERATOR(vmi, mm, start); /* We know there are no gaps so this will be non-NULL. */ @@ -66,6 +65,7 @@ static int mseal_apply(struct mm_struct *mm, prev = vma; for_each_vma_range(vmi, vma, end) { + const unsigned long curr_start = MAX(vma->vm_start, start); const unsigned long curr_end = MIN(vma->vm_end, end); if (!(vma->vm_flags & VM_SEALED)) { @@ -79,7 +79,6 @@ static int mseal_apply(struct mm_struct *mm, } prev = vma; - curr_start = curr_end; } return 0; From 6bc0987d0b508b3768808efafa1e90041713526b Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:18 +0000 Subject: [PATCH 309/369] mm/vma: add vma_flags_empty(), vma_flags_and(), vma_flags_diff_pair() Patch series "mm/vma: convert vm_flags_t to vma_flags_t in vma code", v4. 
This series converts a lot of the existing use of the legacy vm_flags_t data type to the new vma_flags_t type which replaces it. In order to do so it adds a number of additional helpers: * vma_flags_empty() - Determines whether a vma_flags_t value has no bits set. * vma_flags_and() - Performs a bitwise AND between two vma_flags_t values. * vma_flags_diff_pair() - Determines which flags are not shared between a pair of VMA flags (typically non-constant values) * append_vma_flags() - Similar to mk_vma_flags(), but allows a vma_flags_t value to be specified (typically a constant value) which will be copied and appended to to create a new vma_flags_t value, with additional flags specified to append to it. * vma_flags_same() - Determines if a vma_flags_t value is exactly equal to a set of VMA flags. * vma_flags_same_mask() - Determines if a vma_flags_t value is eactly equal to another vma_flags_t value (typically constant). * vma_flags_same_pair() - Determines if a pair of vma_flags_t values are exactly equal to one another (typically both non-constant). * vma_flags_to_legacy() - Converts a vma_flags_t value to a vm_flags_t value, used to enable more iterative introduction of the use of vma_flags_t. * legacy_to_vma_flags() - Converts a vm_flags_t value to a vma_flags-t value, for the same purpose. * vma_flags_test_single_mask() - Tests whether a vma_flags_t value contain the single flag specified in an input vma_flags_t flag mask, or if that flag mask is empty, is defined to return false. Useful for config-predicated VMA flag mask defines. * vma_test() - Tests whether a VMA's flags contain a specific singular VMA flag. * vma_test_any() - Tests whether a VMA's flags contain any of a set of VMA flags. * vma_test_any_mask() - Tests whether a VMA's flags contain any of the flags specified in another, typically constant, vma_flags_t value. 
* vma_test_single_mask() - Tests whether a VMA's flags contain the single flag specified in an input vma_flags_t flag mask, or if that flag mask is empty, is defined to return false. Useful for config-predicated VMA flag mask defines. * vma_clear_flags() - Clears a specific set of VMA flags from a vma_flags_t value. * vma_clear_flags_mask() - Clears those flag set in a vma_flags_t value (typically constant) from a (typically not constant) vma_flags_t value. The series mostly focuses on the the VMA specific code, especially that contained in mm/vma.c and mm/vma.h. It updates both brk() and mmap() logic to utils vma_flags_t values as much as is practiaclly possible at this point, changing surrounding logic to be able to do so. It also updates the vma_modify_xxx() functions where they interact with VMA flags directly to use vm_flags_t values where possible. There is extensive testing added in the VMA userland tests to assert that all of these new VMA flag functions work correctly. This patch (of 25): Firstly, add the ability to determine if VMA flags are empty, that is no flags are set in a vma_flags_t value. Next, add the ability to obtain the equivalent of the bitwise and of two vma_flags_t values, via vma_flags_and_mask(). Next, add the ability to obtain the difference between two sets of VMA flags, that is the equivalent to the exclusive bitwise OR of the two sets of flags, via vma_flags_diff_pair(). vma_flags_xxx_mask() typically operates on a pointer to a vma_flags_t value, which is assumed to be an lvalue of some kind (such as a field in a struct or a stack variable) and an rvalue of some kind (typically a constant set of VMA flags obtained e.g. via mk_vma_flags() or equivalent). However vma_flags_diff_pair() is intended to operate on two lvalues, so use the _pair() suffix to make this clear. Finally, update VMA userland tests to add these helpers. 
We also port bitmap_xor() and __bitmap_xor() to the tools/ headers and source to allow the tests to work with vma_flags_diff_pair(). Link: https://lkml.kernel.org/r/cover.1774034900.git.ljs@kernel.org Link: https://lkml.kernel.org/r/53ab55b7da91425775e42c03177498ad6de88ef4.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 62 +++++++++++++++++++++++++++------ include/linux/mm_types.h | 8 +++++ tools/include/linux/bitmap.h | 13 +++++++ tools/lib/bitmap.c | 10 ++++++ tools/testing/vma/include/dup.h | 36 ++++++++++++++++++- 5 files changed, 118 insertions(+), 11 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 16a1ad9a3397..7954a7a2b811 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1048,6 +1048,19 @@ static __always_inline vma_flags_t __mk_vma_flags(size_t count, return flags; } +/* + * Helper macro which bitwise-or combines the specified input flags into a + * vma_flags_t bitmap value. 
E.g.: + * + * vma_flags_t flags = mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, + * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT); + * + * The compiler cleverly optimises away all of the work and this ends up being + * equivalent to aggregating the values manually. + */ +#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ + (const vma_flag_t []){__VA_ARGS__}) + /* * Test whether a specific VMA flag is set, e.g.: * @@ -1062,17 +1075,30 @@ static __always_inline bool vma_flags_test(const vma_flags_t *flags, } /* - * Helper macro which bitwise-or combines the specified input flags into a - * vma_flags_t bitmap value. E.g.: - * - * vma_flags_t flags = mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, - * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT); - * - * The compiler cleverly optimises away all of the work and this ends up being - * equivalent to aggregating the values manually. + * Obtain a set of VMA flags which contain the overlapping flags contained + * within flags and to_and. */ -#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ - (const vma_flag_t []){__VA_ARGS__}) +static __always_inline vma_flags_t vma_flags_and_mask(const vma_flags_t *flags, + vma_flags_t to_and) +{ + vma_flags_t dst; + unsigned long *bitmap_dst = dst.__vma_flags; + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_and = to_and.__vma_flags; + + bitmap_and(bitmap_dst, bitmap, bitmap_to_and, NUM_VMA_FLAG_BITS); + return dst; +} + +/* + * Obtain a set of VMA flags which contains the specified overlapping flags, + * e.g.: + * + * vma_flags_t read_flags = vma_flags_and(&flags, VMA_READ_BIT, + * VMA_MAY_READ_BIT); + */ +#define vma_flags_and(flags, ...) \ + vma_flags_and_mask(flags, mk_vma_flags(__VA_ARGS__)) /* Test each of to_test flags in flags, non-atomically. 
*/ static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags, @@ -1146,6 +1172,22 @@ static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, #define vma_flags_clear(flags, ...) \ vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__)) +/* + * Obtain a VMA flags value containing those flags that are present in flags or + * flags_other but not in both. + */ +static __always_inline vma_flags_t vma_flags_diff_pair(const vma_flags_t *flags, + const vma_flags_t *flags_other) +{ + vma_flags_t dst; + const unsigned long *bitmap_other = flags_other->__vma_flags; + const unsigned long *bitmap = flags->__vma_flags; + unsigned long *bitmap_dst = dst.__vma_flags; + + bitmap_xor(bitmap_dst, bitmap, bitmap_other, NUM_VMA_FLAG_BITS); + return dst; +} + /* * Helper to test that ALL specified flags are set in a VMA. * diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f22aecb047b7..321aa150c1ee 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -870,6 +870,14 @@ typedef struct { #define EMPTY_VMA_FLAGS ((vma_flags_t){ }) +/* Are no flags set in the specified VMA flags? */ +static __always_inline bool vma_flags_empty(const vma_flags_t *flags) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return bitmap_empty(bitmap, NUM_VMA_FLAG_BITS); +} + /* * Describes a VMA that is about to be mmap()'ed. 
Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 250883090a5d..845eda759f67 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -28,6 +28,8 @@ bool __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); +void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) @@ -209,4 +211,15 @@ static inline void bitmap_clear(unsigned long *map, unsigned int start, else __bitmap_clear(map, start, nbits); } + +static __always_inline +void bitmap_xor(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) + *dst = *src1 ^ *src2; + else + __bitmap_xor(dst, src1, src2, nbits); +} + #endif /* _TOOLS_LINUX_BITMAP_H */ diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c index aa83d22c45e3..fedc9070f0e4 100644 --- a/tools/lib/bitmap.c +++ b/tools/lib/bitmap.c @@ -169,3 +169,13 @@ bool __bitmap_subset(const unsigned long *bitmap1, return false; return true; } + +void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int bits) +{ + unsigned int k; + unsigned int nr = BITS_TO_LONGS(bits); + + for (k = 0; k < nr; k++) + dst[k] = bitmap1[k] ^ bitmap2[k]; +} diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 8865ffe046d8..8091a5caaeb8 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -422,6 +422,13 @@ struct vma_iterator { #define MAPCOUNT_ELF_CORE_MARGIN (5) 
#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) +static __always_inline bool vma_flags_empty(const vma_flags_t *flags) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return bitmap_empty(bitmap, NUM_VMA_FLAG_BITS); +} + /* What action should be taken after an .mmap_prepare call is complete? */ enum mmap_action_type { MMAP_NOTHING, /* Mapping is complete, no further action. */ @@ -855,6 +862,21 @@ static __always_inline bool vma_flags_test(const vma_flags_t *flags, return test_bit((__force int)bit, bitmap); } +static __always_inline vma_flags_t vma_flags_and_mask(const vma_flags_t *flags, + vma_flags_t to_and) +{ + vma_flags_t dst; + unsigned long *bitmap_dst = dst.__vma_flags; + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_to_and = to_and.__vma_flags; + + bitmap_and(bitmap_dst, bitmap, bitmap_to_and, NUM_VMA_FLAG_BITS); + return dst; +} + +#define vma_flags_and(flags, ...) \ + vma_flags_and_mask(flags, mk_vma_flags(__VA_ARGS__)) + static __always_inline bool vma_flags_test_any_mask(const vma_flags_t *flags, vma_flags_t to_test) { @@ -901,8 +923,20 @@ static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t #define vma_flags_clear(flags, ...) 
\ vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__)) +static __always_inline vma_flags_t vma_flags_diff_pair(const vma_flags_t *flags, + const vma_flags_t *flags_other) +{ + vma_flags_t dst; + const unsigned long *bitmap_other = flags_other->__vma_flags; + const unsigned long *bitmap = flags->__vma_flags; + unsigned long *bitmap_dst = dst.__vma_flags; + + bitmap_xor(bitmap_dst, bitmap, bitmap_other, NUM_VMA_FLAG_BITS); + return dst; +} + static inline bool vma_test_all_mask(const struct vm_area_struct *vma, - vma_flags_t flags) + vma_flags_t flags) { return vma_flags_test_all_mask(&vma->flags, flags); } From e4fd34b84b564105d478a8613b4fdc1c702c3607 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:19 +0000 Subject: [PATCH 310/369] tools/testing/vma: add unit tests flag empty, diff_pair, and[_mask] Add VMA unit tests to assert that: * vma_flags_empty() * vma_flags_diff_pair() * vma_flags_and_mask() * vma_flags_and() All function as expected. In additional to the added tests, in order to make testing easier, add vma_flags_same_mask() and vma_flags_same() for testing only. If/when these are required in kernel code, they can be moved over. Also add ASSERT_FLAGS_[NOT_]SAME[_MASK](), ASSERT_FLAGS_[NON]EMPTY() test helpers to make asserting flag state easier and more convenient. Link: https://lkml.kernel.org/r/471ce7ceb1d32e5fc9c0660966b9eacdf899b4d1.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka (SUSE) Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- tools/testing/vma/include/custom.h | 12 +++ tools/testing/vma/shared.h | 18 ++++ tools/testing/vma/tests/vma.c | 137 +++++++++++++++++++++++++++++ 3 files changed, 167 insertions(+) diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 6c62a38a2f6f..578045caf5ca 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -120,3 +120,15 @@ static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) { return PAGE_SIZE; } + +/* Place here until needed in the kernel code. */ +static __always_inline bool vma_flags_same_mask(vma_flags_t *flags, + vma_flags_t flags_other) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_other = flags_other.__vma_flags; + + return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); +} +#define vma_flags_same(flags, ...) 
\ + vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) diff --git a/tools/testing/vma/shared.h b/tools/testing/vma/shared.h index 6c64211cfa22..e2e5d6ef6bdd 100644 --- a/tools/testing/vma/shared.h +++ b/tools/testing/vma/shared.h @@ -35,6 +35,24 @@ #define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2)) #define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2)) +#define ASSERT_FLAGS_SAME_MASK(_flags, _flags_other) \ + ASSERT_TRUE(vma_flags_same_mask((_flags), (_flags_other))) + +#define ASSERT_FLAGS_NOT_SAME_MASK(_flags, _flags_other) \ + ASSERT_FALSE(vma_flags_same_mask((_flags), (_flags_other))) + +#define ASSERT_FLAGS_SAME(_flags, ...) \ + ASSERT_TRUE(vma_flags_same(_flags, __VA_ARGS__)) + +#define ASSERT_FLAGS_NOT_SAME(_flags, ...) \ + ASSERT_FALSE(vma_flags_same(_flags, __VA_ARGS__)) + +#define ASSERT_FLAGS_EMPTY(_flags) \ + ASSERT_TRUE(vma_flags_empty(_flags)) + +#define ASSERT_FLAGS_NONEMPTY(_flags) \ + ASSERT_FALSE(vma_flags_empty(_flags)) + #define IS_SET(_val, _flags) ((_val & _flags) == _flags) extern bool fail_prealloc; diff --git a/tools/testing/vma/tests/vma.c b/tools/testing/vma/tests/vma.c index f6edd44f4e9e..4a7b11a8a285 100644 --- a/tools/testing/vma/tests/vma.c +++ b/tools/testing/vma/tests/vma.c @@ -363,6 +363,140 @@ static bool test_vma_flags_clear(void) return true; } +/* Ensure that vma_flags_empty() works correctly. */ +static bool test_vma_flags_empty(void) +{ + vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_EXEC_BIT, 64, 65); + + ASSERT_FLAGS_NONEMPTY(&flags); + vma_flags_clear(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); +#if NUM_VMA_FLAG_BITS > 64 + ASSERT_FLAGS_NONEMPTY(&flags); + vma_flags_clear(&flags, 64, 65); + ASSERT_FLAGS_EMPTY(&flags); +#else + ASSERT_FLAGS_EMPTY(&flags); +#endif + + return true; +} + +/* Ensure that vma_flags_diff_pair() works correctly. 
*/ +static bool test_vma_flags_diff(void) +{ + vma_flags_t flags1 = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_EXEC_BIT, 64, 65); + vma_flags_t flags2 = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_EXEC_BIT, VMA_MAYWRITE_BIT, + VMA_MAYEXEC_BIT, 64, 65, 66, 67); + vma_flags_t diff = vma_flags_diff_pair(&flags1, &flags2); + +#if NUM_VMA_FLAG_BITS > 64 + ASSERT_FLAGS_SAME(&diff, VMA_MAYWRITE_BIT, VMA_MAYEXEC_BIT, 66, 67); +#else + ASSERT_FLAGS_SAME(&diff, VMA_MAYWRITE_BIT, VMA_MAYEXEC_BIT); +#endif + /* Should be the same even if re-ordered. */ + diff = vma_flags_diff_pair(&flags2, &flags1); +#if NUM_VMA_FLAG_BITS > 64 + ASSERT_FLAGS_SAME(&diff, VMA_MAYWRITE_BIT, VMA_MAYEXEC_BIT, 66, 67); +#else + ASSERT_FLAGS_SAME(&diff, VMA_MAYWRITE_BIT, VMA_MAYEXEC_BIT); +#endif + + /* Should be no difference when applied against themselves. */ + diff = vma_flags_diff_pair(&flags1, &flags1); + ASSERT_FLAGS_EMPTY(&diff); + diff = vma_flags_diff_pair(&flags2, &flags2); + ASSERT_FLAGS_EMPTY(&diff); + + /* One set of flags against an empty one should equal the original. */ + flags2 = EMPTY_VMA_FLAGS; + diff = vma_flags_diff_pair(&flags1, &flags2); + ASSERT_FLAGS_SAME_MASK(&diff, flags1); + + /* A subset should work too. */ + flags2 = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT); + diff = vma_flags_diff_pair(&flags1, &flags2); +#if NUM_VMA_FLAG_BITS > 64 + ASSERT_FLAGS_SAME(&diff, VMA_EXEC_BIT, 64, 65); +#else + ASSERT_FLAGS_SAME(&diff, VMA_EXEC_BIT); +#endif + + return true; +} + +/* Ensure that vma_flags_and() and friends work correctly. 
*/ +static bool test_vma_flags_and(void) +{ + vma_flags_t flags1 = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_EXEC_BIT, 64, 65); + vma_flags_t flags2 = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_EXEC_BIT, VMA_MAYWRITE_BIT, + VMA_MAYEXEC_BIT, 64, 65, 66, 67); + vma_flags_t flags3 = mk_vma_flags(VMA_IO_BIT, VMA_MAYBE_GUARD_BIT, + 68, 69); + vma_flags_t and = vma_flags_and_mask(&flags1, flags2); + +#if NUM_VMA_FLAG_BITS > 64 + ASSERT_FLAGS_SAME(&and, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, + 64, 65); +#else + ASSERT_FLAGS_SAME(&and, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); +#endif + + and = vma_flags_and_mask(&flags1, flags1); + ASSERT_FLAGS_SAME_MASK(&and, flags1); + + and = vma_flags_and_mask(&flags2, flags2); + ASSERT_FLAGS_SAME_MASK(&and, flags2); + + and = vma_flags_and_mask(&flags1, flags3); + ASSERT_FLAGS_EMPTY(&and); + and = vma_flags_and_mask(&flags2, flags3); + ASSERT_FLAGS_EMPTY(&and); + + and = vma_flags_and(&flags1, VMA_READ_BIT); + ASSERT_FLAGS_SAME(&and, VMA_READ_BIT); + + and = vma_flags_and(&flags1, VMA_READ_BIT, VMA_WRITE_BIT); + ASSERT_FLAGS_SAME(&and, VMA_READ_BIT, VMA_WRITE_BIT); + + and = vma_flags_and(&flags1, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); + ASSERT_FLAGS_SAME(&and, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); + +#if NUM_VMA_FLAG_BITS > 64 + and = vma_flags_and(&flags1, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, + 64); + ASSERT_FLAGS_SAME(&and, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64); + + and = vma_flags_and(&flags1, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, + 64, 65); + ASSERT_FLAGS_SAME(&and, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64, + 65); +#endif + + /* And against some missing values. 
*/ + + and = vma_flags_and(&flags1, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, + VMA_IO_BIT); + ASSERT_FLAGS_SAME(&and, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); + + and = vma_flags_and(&flags1, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, + VMA_IO_BIT, VMA_RAND_READ_BIT); + ASSERT_FLAGS_SAME(&and, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); + +#if NUM_VMA_FLAG_BITS > 64 + and = vma_flags_and(&flags1, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, + VMA_IO_BIT, VMA_RAND_READ_BIT, 69); + ASSERT_FLAGS_SAME(&and, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); +#endif + + return true; +} + static void run_vma_tests(int *num_tests, int *num_fail) { TEST(copy_vma); @@ -372,4 +506,7 @@ static void run_vma_tests(int *num_tests, int *num_fail) TEST(vma_flags_test); TEST(vma_flags_test_any); TEST(vma_flags_clear); + TEST(vma_flags_empty); + TEST(vma_flags_diff); + TEST(vma_flags_and); } From 8228e42b5f88aa68708ced277399ee3b59748627 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:20 +0000 Subject: [PATCH 311/369] mm/vma: add further vma_flags_t unions In order to utilise the new vma_flags_t type, we currently place it in union with legacy vm_flags fields of type vm_flags_t to make the transition smoother. Add vma_flags_t union entries for mm->def_flags and vmg->vm_flags - mm->def_vma_flags and vmg->vma_flags respectively. Once the conversion is complete, these will be replaced with vma_flags_t entries alone. Also update the VMA tests to reflect the change. Link: https://lkml.kernel.org/r/d507d542c089ba132e9da53f2ff7f80ca117c3b4.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 6 +++++- mm/vma.h | 6 +++++- tools/testing/vma/include/dup.h | 5 ++++- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 321aa150c1ee..8ef84849953f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1249,7 +1249,11 @@ struct mm_struct { unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ unsigned long stack_vm; /* VM_STACK */ - vm_flags_t def_flags; + union { + /* Temporary while VMA flags are being converted. */ + vm_flags_t def_flags; + vma_flags_t def_vma_flags; + }; /** * @write_protect_seq: Locked when any thread is write diff --git a/mm/vma.h b/mm/vma.h index eba388c61ef4..cf8926558bf6 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -98,7 +98,11 @@ struct vma_merge_struct { unsigned long end; pgoff_t pgoff; - vm_flags_t vm_flags; + union { + /* Temporary while VMA flags are being converted. 
*/ + vm_flags_t vm_flags; + vma_flags_t vma_flags; + }; struct file *file; struct anon_vma *anon_vma; struct mempolicy *policy; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 8091a5caaeb8..58e063b1ee27 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -33,7 +33,10 @@ struct mm_struct { unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ unsigned long stack_vm; /* VM_STACK */ - unsigned long def_flags; + union { + vm_flags_t def_flags; + vma_flags_t def_vma_flags; + }; mm_flags_t flags; /* Must use mm_flags_* helpers to access */ }; From bd44d91d0ccc6da5ed91844b89a23a7df4938548 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:21 +0000 Subject: [PATCH 312/369] tools/testing/vma: convert bulk of test code to vma_flags_t Convert the test code to utilise vma_flags_t as opposed to the deprecated vm_flags_t as much as possible. As part of this change, add VMA_STICKY_FLAGS and VMA_SPECIAL_FLAGS as early versions of what these defines will look like in the kernel logic once this logic is implemented. Link: https://lkml.kernel.org/r/df90efe29300bd899989f695be4ae3adc901a828.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka (SUSE) Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- tools/testing/vma/include/custom.h | 7 + tools/testing/vma/include/dup.h | 7 +- tools/testing/vma/shared.c | 8 +- tools/testing/vma/shared.h | 4 +- tools/testing/vma/tests/merge.c | 313 +++++++++++++++-------------- tools/testing/vma/tests/vma.c | 10 +- 6 files changed, 186 insertions(+), 163 deletions(-) diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 578045caf5ca..6200f938e586 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -132,3 +132,10 @@ static __always_inline bool vma_flags_same_mask(vma_flags_t *flags, } #define vma_flags_same(flags, ...) \ vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) +#define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \ + VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT) +#ifdef CONFIG_MEM_SOFT_DIRTY +#define VMA_STICKY_FLAGS mk_vma_flags(VMA_SOFTDIRTY_BIT, VMA_MAYBE_GUARD_BIT) +#else +#define VMA_STICKY_FLAGS mk_vma_flags(VMA_MAYBE_GUARD_BIT) +#endif diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 58e063b1ee27..1dee78c34872 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -507,10 +507,7 @@ struct vm_area_desc { /* Mutable fields. Populated with initial state. 
*/ pgoff_t pgoff; struct file *vm_file; - union { - vm_flags_t vm_flags; - vma_flags_t vma_flags; - }; + vma_flags_t vma_flags; pgprot_t page_prot; /* Write-only fields. */ @@ -1146,7 +1143,7 @@ static inline int __compat_vma_mmap(const struct file_operations *f_op, .pgoff = vma->vm_pgoff, .vm_file = vma->vm_file, - .vm_flags = vma->vm_flags, + .vma_flags = vma->flags, .page_prot = vma->vm_page_prot, .action.type = MMAP_NOTHING, /* Default */ diff --git a/tools/testing/vma/shared.c b/tools/testing/vma/shared.c index bda578cc3304..2565a5aecb80 100644 --- a/tools/testing/vma/shared.c +++ b/tools/testing/vma/shared.c @@ -14,7 +14,7 @@ struct task_struct __current; struct vm_area_struct *alloc_vma(struct mm_struct *mm, unsigned long start, unsigned long end, - pgoff_t pgoff, vm_flags_t vm_flags) + pgoff_t pgoff, vma_flags_t vma_flags) { struct vm_area_struct *vma = vm_area_alloc(mm); @@ -24,7 +24,7 @@ struct vm_area_struct *alloc_vma(struct mm_struct *mm, vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; - vm_flags_reset(vma, vm_flags); + vma->flags = vma_flags; vma_assert_detached(vma); return vma; @@ -38,9 +38,9 @@ void detach_free_vma(struct vm_area_struct *vma) struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, unsigned long start, unsigned long end, - pgoff_t pgoff, vm_flags_t vm_flags) + pgoff_t pgoff, vma_flags_t vma_flags) { - struct vm_area_struct *vma = alloc_vma(mm, start, end, pgoff, vm_flags); + struct vm_area_struct *vma = alloc_vma(mm, start, end, pgoff, vma_flags); if (vma == NULL) return NULL; diff --git a/tools/testing/vma/shared.h b/tools/testing/vma/shared.h index e2e5d6ef6bdd..8b9e3b11c3cb 100644 --- a/tools/testing/vma/shared.h +++ b/tools/testing/vma/shared.h @@ -94,7 +94,7 @@ static inline void dummy_close(struct vm_area_struct *) /* Helper function to simply allocate a VMA. 
*/ struct vm_area_struct *alloc_vma(struct mm_struct *mm, unsigned long start, unsigned long end, - pgoff_t pgoff, vm_flags_t vm_flags); + pgoff_t pgoff, vma_flags_t vma_flags); /* Helper function to detach and free a VMA. */ void detach_free_vma(struct vm_area_struct *vma); @@ -102,7 +102,7 @@ void detach_free_vma(struct vm_area_struct *vma); /* Helper function to allocate a VMA and link it to the tree. */ struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, unsigned long start, unsigned long end, - pgoff_t pgoff, vm_flags_t vm_flags); + pgoff_t pgoff, vma_flags_t vma_flags); /* * Helper function to reset the dummy anon_vma to indicate it has not been diff --git a/tools/testing/vma/tests/merge.c b/tools/testing/vma/tests/merge.c index 3708dc6945b0..d3e725dc0000 100644 --- a/tools/testing/vma/tests/merge.c +++ b/tools/testing/vma/tests/merge.c @@ -33,7 +33,7 @@ static int expand_existing(struct vma_merge_struct *vmg) * specified new range. */ void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start, - unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags) + unsigned long end, pgoff_t pgoff, vma_flags_t vma_flags) { vma_iter_set(vmg->vmi, start); @@ -45,7 +45,7 @@ void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start, vmg->start = start; vmg->end = end; vmg->pgoff = pgoff; - vmg->vm_flags = vm_flags; + vmg->vma_flags = vma_flags; vmg->just_expand = false; vmg->__remove_middle = false; @@ -56,10 +56,10 @@ void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start, /* Helper function to set both the VMG range and its anon_vma. 
*/ static void vmg_set_range_anon_vma(struct vma_merge_struct *vmg, unsigned long start, - unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags, + unsigned long end, pgoff_t pgoff, vma_flags_t vma_flags, struct anon_vma *anon_vma) { - vmg_set_range(vmg, start, end, pgoff, vm_flags); + vmg_set_range(vmg, start, end, pgoff, vma_flags); vmg->anon_vma = anon_vma; } @@ -71,12 +71,12 @@ static void vmg_set_range_anon_vma(struct vma_merge_struct *vmg, unsigned long s */ static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm, struct vma_merge_struct *vmg, unsigned long start, - unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags, + unsigned long end, pgoff_t pgoff, vma_flags_t vma_flags, bool *was_merged) { struct vm_area_struct *merged; - vmg_set_range(vmg, start, end, pgoff, vm_flags); + vmg_set_range(vmg, start, end, pgoff, vma_flags); merged = merge_new(vmg); if (merged) { @@ -89,23 +89,24 @@ static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm, ASSERT_EQ(vmg->state, VMA_MERGE_NOMERGE); - return alloc_and_link_vma(mm, start, end, pgoff, vm_flags); + return alloc_and_link_vma(mm, start, end, pgoff, vma_flags); } static bool test_simple_merge(void) { struct vm_area_struct *vma; - vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_MAYREAD_BIT, + VMA_MAYWRITE_BIT); struct mm_struct mm = {}; - struct vm_area_struct *vma_left = alloc_vma(&mm, 0, 0x1000, 0, vm_flags); - struct vm_area_struct *vma_right = alloc_vma(&mm, 0x2000, 0x3000, 2, vm_flags); + struct vm_area_struct *vma_left = alloc_vma(&mm, 0, 0x1000, 0, vma_flags); + struct vm_area_struct *vma_right = alloc_vma(&mm, 0x2000, 0x3000, 2, vma_flags); VMA_ITERATOR(vmi, &mm, 0x1000); struct vma_merge_struct vmg = { .mm = &mm, .vmi = &vmi, .start = 0x1000, .end = 0x2000, - .vm_flags = vm_flags, + .vma_flags = vma_flags, .pgoff = 1, }; @@ -118,7 +119,7 @@ static bool test_simple_merge(void) 
ASSERT_EQ(vma->vm_start, 0); ASSERT_EQ(vma->vm_end, 0x3000); ASSERT_EQ(vma->vm_pgoff, 0); - ASSERT_EQ(vma->vm_flags, vm_flags); + ASSERT_FLAGS_SAME_MASK(&vma->flags, vma_flags); detach_free_vma(vma); mtree_destroy(&mm.mm_mt); @@ -129,11 +130,12 @@ static bool test_simple_merge(void) static bool test_simple_modify(void) { struct vm_area_struct *vma; - vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_MAYREAD_BIT, + VMA_MAYWRITE_BIT); + vm_flags_t legacy_flags = VM_READ | VM_WRITE; struct mm_struct mm = {}; - struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, vm_flags); + struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, vma_flags); VMA_ITERATOR(vmi, &mm, 0x1000); - vm_flags_t flags = VM_READ | VM_MAYREAD; ASSERT_FALSE(attach_vma(&mm, init_vma)); @@ -142,7 +144,7 @@ static bool test_simple_modify(void) * performs the merge/split only. */ vma = vma_modify_flags(&vmi, init_vma, init_vma, - 0x1000, 0x2000, &flags); + 0x1000, 0x2000, &legacy_flags); ASSERT_NE(vma, NULL); /* We modify the provided VMA, and on split allocate new VMAs. 
*/ ASSERT_EQ(vma, init_vma); @@ -189,9 +191,10 @@ static bool test_simple_modify(void) static bool test_simple_expand(void) { - vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_MAYREAD_BIT, + VMA_MAYWRITE_BIT); struct mm_struct mm = {}; - struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x1000, 0, vm_flags); + struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x1000, 0, vma_flags); VMA_ITERATOR(vmi, &mm, 0); struct vma_merge_struct vmg = { .vmi = &vmi, @@ -217,9 +220,10 @@ static bool test_simple_expand(void) static bool test_simple_shrink(void) { - vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_MAYREAD_BIT, + VMA_MAYWRITE_BIT); struct mm_struct mm = {}; - struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x3000, 0, vm_flags); + struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x3000, 0, vma_flags); VMA_ITERATOR(vmi, &mm, 0); ASSERT_FALSE(attach_vma(&mm, vma)); @@ -238,7 +242,8 @@ static bool test_simple_shrink(void) static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, bool c_is_sticky) { - vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vma_merge_struct vmg = { @@ -265,31 +270,31 @@ static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, bool merged; if (is_sticky) - vm_flags |= VM_STICKY; + vma_flags_set_mask(&vma_flags, VMA_STICKY_FLAGS); /* * 0123456789abc * AA B CC */ - vma_a = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags); + vma_a = alloc_and_link_vma(&mm, 0, 0x2000, 0, vma_flags); ASSERT_NE(vma_a, NULL); if (a_is_sticky) - vm_flags_set(vma_a, VM_STICKY); + vma_flags_set_mask(&vma_a->flags, VMA_STICKY_FLAGS); /* We give each VMA a single avc so we 
can test anon_vma duplication. */ INIT_LIST_HEAD(&vma_a->anon_vma_chain); list_add(&dummy_anon_vma_chain_a.same_vma, &vma_a->anon_vma_chain); - vma_b = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags); + vma_b = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vma_flags); ASSERT_NE(vma_b, NULL); if (b_is_sticky) - vm_flags_set(vma_b, VM_STICKY); + vma_flags_set_mask(&vma_b->flags, VMA_STICKY_FLAGS); INIT_LIST_HEAD(&vma_b->anon_vma_chain); list_add(&dummy_anon_vma_chain_b.same_vma, &vma_b->anon_vma_chain); - vma_c = alloc_and_link_vma(&mm, 0xb000, 0xc000, 0xb, vm_flags); + vma_c = alloc_and_link_vma(&mm, 0xb000, 0xc000, 0xb, vma_flags); ASSERT_NE(vma_c, NULL); if (c_is_sticky) - vm_flags_set(vma_c, VM_STICKY); + vma_flags_set_mask(&vma_c->flags, VMA_STICKY_FLAGS); INIT_LIST_HEAD(&vma_c->anon_vma_chain); list_add(&dummy_anon_vma_chain_c.same_vma, &vma_c->anon_vma_chain); @@ -299,7 +304,7 @@ static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, * 0123456789abc * AA B ** CC */ - vma_d = try_merge_new_vma(&mm, &vmg, 0x7000, 0x9000, 7, vm_flags, &merged); + vma_d = try_merge_new_vma(&mm, &vmg, 0x7000, 0x9000, 7, vma_flags, &merged); ASSERT_NE(vma_d, NULL); INIT_LIST_HEAD(&vma_d->anon_vma_chain); list_add(&dummy_anon_vma_chain_d.same_vma, &vma_d->anon_vma_chain); @@ -314,7 +319,7 @@ static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, */ vma_a->vm_ops = &vm_ops; /* This should have no impact. */ vma_b->anon_vma = &dummy_anon_vma; - vma = try_merge_new_vma(&mm, &vmg, 0x2000, 0x3000, 2, vm_flags, &merged); + vma = try_merge_new_vma(&mm, &vmg, 0x2000, 0x3000, 2, vma_flags, &merged); ASSERT_EQ(vma, vma_a); /* Merge with A, delete B. 
*/ ASSERT_TRUE(merged); @@ -325,7 +330,7 @@ static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 3); if (is_sticky || a_is_sticky || b_is_sticky) - ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); + ASSERT_TRUE(vma_flags_test_any_mask(&vma->flags, VMA_STICKY_FLAGS)); /* * Merge to PREVIOUS VMA. @@ -333,7 +338,7 @@ static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, * 0123456789abc * AAAA* DD CC */ - vma = try_merge_new_vma(&mm, &vmg, 0x4000, 0x5000, 4, vm_flags, &merged); + vma = try_merge_new_vma(&mm, &vmg, 0x4000, 0x5000, 4, vma_flags, &merged); ASSERT_EQ(vma, vma_a); /* Extend A. */ ASSERT_TRUE(merged); @@ -344,7 +349,7 @@ static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 3); if (is_sticky || a_is_sticky) - ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); + ASSERT_TRUE(vma_flags_test_any_mask(&vma->flags, VMA_STICKY_FLAGS)); /* * Merge to NEXT VMA. @@ -354,7 +359,7 @@ static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, */ vma_d->anon_vma = &dummy_anon_vma; vma_d->vm_ops = &vm_ops; /* This should have no impact. */ - vma = try_merge_new_vma(&mm, &vmg, 0x6000, 0x7000, 6, vm_flags, &merged); + vma = try_merge_new_vma(&mm, &vmg, 0x6000, 0x7000, 6, vma_flags, &merged); ASSERT_EQ(vma, vma_d); /* Prepend. */ ASSERT_TRUE(merged); @@ -365,7 +370,7 @@ static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 3); if (is_sticky) /* D uses is_sticky. */ - ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); + ASSERT_TRUE(vma_flags_test_any_mask(&vma->flags, VMA_STICKY_FLAGS)); /* * Merge BOTH sides. 
@@ -374,7 +379,7 @@ static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, * AAAAA*DDD CC */ vma_d->vm_ops = NULL; /* This would otherwise degrade the merge. */ - vma = try_merge_new_vma(&mm, &vmg, 0x5000, 0x6000, 5, vm_flags, &merged); + vma = try_merge_new_vma(&mm, &vmg, 0x5000, 0x6000, 5, vma_flags, &merged); ASSERT_EQ(vma, vma_a); /* Merge with A, delete D. */ ASSERT_TRUE(merged); @@ -385,7 +390,7 @@ static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 2); if (is_sticky || a_is_sticky) - ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); + ASSERT_TRUE(vma_flags_test_any_mask(&vma->flags, VMA_STICKY_FLAGS)); /* * Merge to NEXT VMA. @@ -394,7 +399,7 @@ static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, * AAAAAAAAA *CC */ vma_c->anon_vma = &dummy_anon_vma; - vma = try_merge_new_vma(&mm, &vmg, 0xa000, 0xb000, 0xa, vm_flags, &merged); + vma = try_merge_new_vma(&mm, &vmg, 0xa000, 0xb000, 0xa, vma_flags, &merged); ASSERT_EQ(vma, vma_c); /* Prepend C. */ ASSERT_TRUE(merged); @@ -405,7 +410,7 @@ static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 2); if (is_sticky || c_is_sticky) - ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); + ASSERT_TRUE(vma_flags_test_any_mask(&vma->flags, VMA_STICKY_FLAGS)); /* * Merge BOTH sides. @@ -413,7 +418,7 @@ static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, * 0123456789abc * AAAAAAAAA*CCC */ - vma = try_merge_new_vma(&mm, &vmg, 0x9000, 0xa000, 0x9, vm_flags, &merged); + vma = try_merge_new_vma(&mm, &vmg, 0x9000, 0xa000, 0x9, vma_flags, &merged); ASSERT_EQ(vma, vma_a); /* Extend A and delete C. 
*/ ASSERT_TRUE(merged); @@ -424,7 +429,7 @@ static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 1); if (is_sticky || a_is_sticky || c_is_sticky) - ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); + ASSERT_TRUE(vma_flags_test_any_mask(&vma->flags, VMA_STICKY_FLAGS)); /* * Final state. @@ -469,29 +474,30 @@ static bool test_merge_new(void) static bool test_vma_merge_special_flags(void) { - vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vma_merge_struct vmg = { .mm = &mm, .vmi = &vmi, }; - vm_flags_t special_flags[] = { VM_IO, VM_DONTEXPAND, VM_PFNMAP, VM_MIXEDMAP }; - vm_flags_t all_special_flags = 0; + vma_flag_t special_flags[] = { VMA_IO_BIT, VMA_DONTEXPAND_BIT, + VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT }; + vma_flags_t all_special_flags = EMPTY_VMA_FLAGS; int i; struct vm_area_struct *vma_left, *vma; /* Make sure there aren't new VM_SPECIAL flags. */ - for (i = 0; i < ARRAY_SIZE(special_flags); i++) { - all_special_flags |= special_flags[i]; - } - ASSERT_EQ(all_special_flags, VM_SPECIAL); + for (i = 0; i < ARRAY_SIZE(special_flags); i++) + vma_flags_set(&all_special_flags, special_flags[i]); + ASSERT_FLAGS_SAME_MASK(&all_special_flags, VMA_SPECIAL_FLAGS); /* * 01234 * AAA */ - vma_left = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_left = alloc_and_link_vma(&mm, 0, 0x3000, 0, vma_flags); ASSERT_NE(vma_left, NULL); /* 1. Set up new VMA with special flag that would otherwise merge. */ @@ -502,12 +508,14 @@ static bool test_vma_merge_special_flags(void) * * This should merge if not for the VM_SPECIAL flag. 
*/ - vmg_set_range(&vmg, 0x3000, 0x4000, 3, vm_flags); + vmg_set_range(&vmg, 0x3000, 0x4000, 3, vma_flags); for (i = 0; i < ARRAY_SIZE(special_flags); i++) { - vm_flags_t special_flag = special_flags[i]; + vma_flag_t special_flag = special_flags[i]; + vma_flags_t flags = vma_flags; - vm_flags_reset(vma_left, vm_flags | special_flag); - vmg.vm_flags = vm_flags | special_flag; + vma_flags_set(&flags, special_flag); + vma_left->flags = flags; + vmg.vma_flags = flags; vma = merge_new(&vmg); ASSERT_EQ(vma, NULL); ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); @@ -521,15 +529,17 @@ static bool test_vma_merge_special_flags(void) * * Create a VMA to modify. */ - vma = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vma_flags); ASSERT_NE(vma, NULL); vmg.middle = vma; for (i = 0; i < ARRAY_SIZE(special_flags); i++) { - vm_flags_t special_flag = special_flags[i]; + vma_flag_t special_flag = special_flags[i]; + vma_flags_t flags = vma_flags; - vm_flags_reset(vma_left, vm_flags | special_flag); - vmg.vm_flags = vm_flags | special_flag; + vma_flags_set(&flags, special_flag); + vma_left->flags = flags; + vmg.vma_flags = flags; vma = merge_existing(&vmg); ASSERT_EQ(vma, NULL); ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); @@ -541,7 +551,8 @@ static bool test_vma_merge_special_flags(void) static bool test_vma_merge_with_close(void) { - vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vma_merge_struct vmg = { @@ -621,11 +632,11 @@ static bool test_vma_merge_with_close(void) * PPPPPPNNN */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vma_flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vma_flags); 
vma_next->vm_ops = &vm_ops; - vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, vma_flags); ASSERT_EQ(merge_new(&vmg), vma_prev); ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); @@ -646,11 +657,11 @@ static bool test_vma_merge_with_close(void) * proceed. */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vma_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vma_flags); vma->vm_ops = &vm_ops; - vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, vma_flags); vmg.prev = vma_prev; vmg.middle = vma; @@ -674,11 +685,11 @@ static bool test_vma_merge_with_close(void) * proceed. */ - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vma_flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vma_flags); vma->vm_ops = &vm_ops; - vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, vma_flags); vmg.middle = vma; ASSERT_EQ(merge_existing(&vmg), NULL); /* @@ -702,12 +713,12 @@ static bool test_vma_merge_with_close(void) * PPPVVNNNN */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vma_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vma_flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vma_flags); vma->vm_ops = &vm_ops; - vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, vma_flags); vmg.prev = vma_prev; vmg.middle = vma; @@ -728,12 +739,12 @@ static bool test_vma_merge_with_close(void) * PPPPPNNNN */ - vma_prev = 
alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vma_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vma_flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vma_flags); vma_next->vm_ops = &vm_ops; - vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, vma_flags); vmg.prev = vma_prev; vmg.middle = vma; @@ -750,15 +761,16 @@ static bool test_vma_merge_with_close(void) static bool test_vma_merge_new_with_close(void) { - vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vma_merge_struct vmg = { .mm = &mm, .vmi = &vmi, }; - struct vm_area_struct *vma_prev = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags); - struct vm_area_struct *vma_next = alloc_and_link_vma(&mm, 0x5000, 0x7000, 5, vm_flags); + struct vm_area_struct *vma_prev = alloc_and_link_vma(&mm, 0, 0x2000, 0, vma_flags); + struct vm_area_struct *vma_next = alloc_and_link_vma(&mm, 0x5000, 0x7000, 5, vma_flags); const struct vm_operations_struct vm_ops = { .close = dummy_close, }; @@ -788,7 +800,7 @@ static bool test_vma_merge_new_with_close(void) vma_prev->vm_ops = &vm_ops; vma_next->vm_ops = &vm_ops; - vmg_set_range(&vmg, 0x2000, 0x5000, 2, vm_flags); + vmg_set_range(&vmg, 0x2000, 0x5000, 2, vma_flags); vma = merge_new(&vmg); ASSERT_NE(vma, NULL); ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); @@ -805,9 +817,10 @@ static bool test_vma_merge_new_with_close(void) static bool __test_merge_existing(bool prev_is_sticky, bool middle_is_sticky, bool next_is_sticky) { - vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; - vm_flags_t prev_flags = vm_flags; - vm_flags_t next_flags = vm_flags; 
+ vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); + vma_flags_t prev_flags = vma_flags; + vma_flags_t next_flags = vma_flags; struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vm_area_struct *vma, *vma_prev, *vma_next; @@ -821,11 +834,11 @@ static bool __test_merge_existing(bool prev_is_sticky, bool middle_is_sticky, bo struct anon_vma_chain avc = {}; if (prev_is_sticky) - prev_flags |= VM_STICKY; + vma_flags_set_mask(&prev_flags, VMA_STICKY_FLAGS); if (middle_is_sticky) - vm_flags |= VM_STICKY; + vma_flags_set_mask(&vma_flags, VMA_STICKY_FLAGS); if (next_is_sticky) - next_flags |= VM_STICKY; + vma_flags_set_mask(&next_flags, VMA_STICKY_FLAGS); /* * Merge right case - partial span. @@ -837,11 +850,11 @@ static bool __test_merge_existing(bool prev_is_sticky, bool middle_is_sticky, bo * 0123456789 * VNNNNNN */ - vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags); + vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vma_flags); vma->vm_ops = &vm_ops; /* This should have no impact. */ vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, next_flags); vma_next->vm_ops = &vm_ops; /* This should have no impact. */ - vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, vm_flags, &dummy_anon_vma); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, vma_flags, &dummy_anon_vma); vmg.middle = vma; vmg.prev = vma; vma_set_dummy_anon_vma(vma, &avc); @@ -858,7 +871,7 @@ static bool __test_merge_existing(bool prev_is_sticky, bool middle_is_sticky, bo ASSERT_TRUE(vma_write_started(vma_next)); ASSERT_EQ(mm.map_count, 2); if (middle_is_sticky || next_is_sticky) - ASSERT_TRUE(IS_SET(vma_next->vm_flags, VM_STICKY)); + ASSERT_TRUE(vma_flags_test_any_mask(&vma_next->flags, VMA_STICKY_FLAGS)); /* Clear down and reset. 
*/ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); @@ -873,10 +886,10 @@ static bool __test_merge_existing(bool prev_is_sticky, bool middle_is_sticky, bo * 0123456789 * NNNNNNN */ - vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags); + vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vma_flags); vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, next_flags); vma_next->vm_ops = &vm_ops; /* This should have no impact. */ - vmg_set_range_anon_vma(&vmg, 0x2000, 0x6000, 2, vm_flags, &dummy_anon_vma); + vmg_set_range_anon_vma(&vmg, 0x2000, 0x6000, 2, vma_flags, &dummy_anon_vma); vmg.middle = vma; vma_set_dummy_anon_vma(vma, &avc); ASSERT_EQ(merge_existing(&vmg), vma_next); @@ -888,7 +901,7 @@ static bool __test_merge_existing(bool prev_is_sticky, bool middle_is_sticky, bo ASSERT_TRUE(vma_write_started(vma_next)); ASSERT_EQ(mm.map_count, 1); if (middle_is_sticky || next_is_sticky) - ASSERT_TRUE(IS_SET(vma_next->vm_flags, VM_STICKY)); + ASSERT_TRUE(vma_flags_test_any_mask(&vma_next->flags, VMA_STICKY_FLAGS)); /* Clear down and reset. We should have deleted vma. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); @@ -905,9 +918,9 @@ static bool __test_merge_existing(bool prev_is_sticky, bool middle_is_sticky, bo */ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ - vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vma_flags); vma->vm_ops = &vm_ops; /* This should have no impact. 
*/ - vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, vm_flags, &dummy_anon_vma); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, vma_flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; vma_set_dummy_anon_vma(vma, &avc); @@ -924,7 +937,7 @@ static bool __test_merge_existing(bool prev_is_sticky, bool middle_is_sticky, bo ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 2); if (prev_is_sticky || middle_is_sticky) - ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY)); + ASSERT_TRUE(vma_flags_test_any_mask(&vma_prev->flags, VMA_STICKY_FLAGS)); /* Clear down and reset. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); @@ -941,8 +954,8 @@ static bool __test_merge_existing(bool prev_is_sticky, bool middle_is_sticky, bo */ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ - vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); - vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma); + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vma_flags); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vma_flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; vma_set_dummy_anon_vma(vma, &avc); @@ -955,7 +968,7 @@ static bool __test_merge_existing(bool prev_is_sticky, bool middle_is_sticky, bo ASSERT_TRUE(vma_write_started(vma_prev)); ASSERT_EQ(mm.map_count, 1); if (prev_is_sticky || middle_is_sticky) - ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY)); + ASSERT_TRUE(vma_flags_test_any_mask(&vma_prev->flags, VMA_STICKY_FLAGS)); /* Clear down and reset. We should have deleted vma. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); @@ -972,9 +985,9 @@ static bool __test_merge_existing(bool prev_is_sticky, bool middle_is_sticky, bo */ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. 
*/ - vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vma_flags); vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, next_flags); - vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vma_flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; vma_set_dummy_anon_vma(vma, &avc); @@ -987,7 +1000,7 @@ static bool __test_merge_existing(bool prev_is_sticky, bool middle_is_sticky, bo ASSERT_TRUE(vma_write_started(vma_prev)); ASSERT_EQ(mm.map_count, 1); if (prev_is_sticky || middle_is_sticky || next_is_sticky) - ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY)); + ASSERT_TRUE(vma_flags_test_any_mask(&vma_prev->flags, VMA_STICKY_FLAGS)); /* Clear down and reset. We should have deleted prev and next. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); @@ -1008,40 +1021,40 @@ static bool __test_merge_existing(bool prev_is_sticky, bool middle_is_sticky, bo */ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, vm_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, vma_flags); vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, next_flags); - vmg_set_range(&vmg, 0x4000, 0x5000, 4, vm_flags); + vmg_set_range(&vmg, 0x4000, 0x5000, 4, vma_flags); vmg.prev = vma; vmg.middle = vma; ASSERT_EQ(merge_existing(&vmg), NULL); ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); - vmg_set_range(&vmg, 0x5000, 0x6000, 5, vm_flags); + vmg_set_range(&vmg, 0x5000, 0x6000, 5, vma_flags); vmg.prev = vma; vmg.middle = vma; ASSERT_EQ(merge_existing(&vmg), NULL); ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); - vmg_set_range(&vmg, 0x6000, 0x7000, 6, vm_flags); + vmg_set_range(&vmg, 0x6000, 0x7000, 6, vma_flags); vmg.prev = vma; vmg.middle = vma; ASSERT_EQ(merge_existing(&vmg), NULL); ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); - vmg_set_range(&vmg, 0x4000, 0x7000, 4, vm_flags); + vmg_set_range(&vmg, 0x4000, 0x7000, 
4, vma_flags); vmg.prev = vma; vmg.middle = vma; ASSERT_EQ(merge_existing(&vmg), NULL); ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); - vmg_set_range(&vmg, 0x4000, 0x6000, 4, vm_flags); + vmg_set_range(&vmg, 0x4000, 0x6000, 4, vma_flags); vmg.prev = vma; vmg.middle = vma; ASSERT_EQ(merge_existing(&vmg), NULL); ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); - vmg_set_range(&vmg, 0x5000, 0x6000, 5, vm_flags); + vmg_set_range(&vmg, 0x5000, 0x6000, 5, vma_flags); vmg.prev = vma; vmg.middle = vma; ASSERT_EQ(merge_existing(&vmg), NULL); @@ -1067,7 +1080,8 @@ static bool test_merge_existing(void) static bool test_anon_vma_non_mergeable(void) { - vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vm_area_struct *vma, *vma_prev, *vma_next; @@ -1091,9 +1105,9 @@ static bool test_anon_vma_non_mergeable(void) * 0123456789 * PPPPPPPNNN */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vma_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vma_flags); + vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vma_flags); /* * Give both prev and next single anon_vma_chain fields, so they will @@ -1101,7 +1115,7 @@ static bool test_anon_vma_non_mergeable(void) * * However, when prev is compared to next, the merge should fail. 
*/ - vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, NULL); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vma_flags, NULL); vmg.prev = vma_prev; vmg.middle = vma; vma_set_dummy_anon_vma(vma_prev, &dummy_anon_vma_chain_1); @@ -1129,10 +1143,10 @@ static bool test_anon_vma_non_mergeable(void) * 0123456789 * PPPPPPPNNN */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vma_flags); + vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vma_flags); - vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, NULL); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vma_flags, NULL); vmg.prev = vma_prev; vma_set_dummy_anon_vma(vma_prev, &dummy_anon_vma_chain_1); __vma_set_dummy_anon_vma(vma_next, &dummy_anon_vma_chain_2, &dummy_anon_vma_2); @@ -1154,7 +1168,8 @@ static bool test_anon_vma_non_mergeable(void) static bool test_dup_anon_vma(void) { - vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vma_merge_struct vmg = { @@ -1175,11 +1190,11 @@ static bool test_dup_anon_vma(void) * This covers new VMA merging, as these operations amount to a VMA * expand. 
*/ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vma_flags); + vma_next = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vma_flags); vma_next->anon_vma = &dummy_anon_vma; - vmg_set_range(&vmg, 0, 0x5000, 0, vm_flags); + vmg_set_range(&vmg, 0, 0x5000, 0, vma_flags); vmg.target = vma_prev; vmg.next = vma_next; @@ -1201,16 +1216,16 @@ static bool test_dup_anon_vma(void) * extend delete delete */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vma_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vma_flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vma_flags); /* Initialise avc so mergeability check passes. */ INIT_LIST_HEAD(&vma_next->anon_vma_chain); list_add(&dummy_anon_vma_chain.same_vma, &vma_next->anon_vma_chain); vma_next->anon_vma = &dummy_anon_vma; - vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, vma_flags); vmg.prev = vma_prev; vmg.middle = vma; @@ -1234,12 +1249,12 @@ static bool test_dup_anon_vma(void) * extend delete delete */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vma_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vma_flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vma_flags); vmg.anon_vma = &dummy_anon_vma; vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain); - vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, vma_flags); vmg.prev = vma_prev; vmg.middle = vma; @@ -1263,11 +1278,11 @@ static bool 
test_dup_anon_vma(void) * extend shrink/delete */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vma_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, vma_flags); vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain); - vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, vma_flags); vmg.prev = vma_prev; vmg.middle = vma; @@ -1291,11 +1306,11 @@ static bool test_dup_anon_vma(void) * shrink/delete extend */ - vma = alloc_and_link_vma(&mm, 0, 0x5000, 0, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vm_flags); + vma = alloc_and_link_vma(&mm, 0, 0x5000, 0, vma_flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vma_flags); vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain); - vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, vma_flags); vmg.prev = vma; vmg.middle = vma; @@ -1314,7 +1329,8 @@ static bool test_dup_anon_vma(void) static bool test_vmi_prealloc_fail(void) { - vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vma_merge_struct vmg = { @@ -1330,11 +1346,11 @@ static bool test_vmi_prealloc_fail(void) * the duplicated anon_vma is unlinked. 
*/ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vma_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vma_flags); vma->anon_vma = &dummy_anon_vma; - vmg_set_range_anon_vma(&vmg, 0x3000, 0x5000, 3, vm_flags, &dummy_anon_vma); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x5000, 3, vma_flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; vma_set_dummy_anon_vma(vma, &avc); @@ -1358,11 +1374,11 @@ static bool test_vmi_prealloc_fail(void) * performed in this case too. */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vma_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vma_flags); vma->anon_vma = &dummy_anon_vma; - vmg_set_range(&vmg, 0, 0x5000, 3, vm_flags); + vmg_set_range(&vmg, 0, 0x5000, 3, vma_flags); vmg.target = vma_prev; vmg.next = vma; @@ -1380,13 +1396,14 @@ static bool test_vmi_prealloc_fail(void) static bool test_merge_extend(void) { - vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0x1000); struct vm_area_struct *vma; - vma = alloc_and_link_vma(&mm, 0, 0x1000, 0, vm_flags); - alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags); + vma = alloc_and_link_vma(&mm, 0, 0x1000, 0, vma_flags); + alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vma_flags); /* * Extend a VMA into the gap between itself and the following VMA. 
@@ -1410,11 +1427,13 @@ static bool test_merge_extend(void) static bool test_expand_only_mode(void) { - vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); + vm_flags_t legacy_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vm_area_struct *vma_prev, *vma; - VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, vm_flags, 5); + VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, legacy_flags, 5); /* * Place a VMA prior to the one we're expanding so we assert that we do @@ -1422,14 +1441,14 @@ static bool test_expand_only_mode(void) * have, through the use of the just_expand flag, indicated we do not * need to do so. */ - alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags); + alloc_and_link_vma(&mm, 0, 0x2000, 0, vma_flags); /* * We will be positioned at the prev VMA, but looking to expand to * 0x9000. */ vma_iter_set(&vmi, 0x3000); - vma_prev = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vma_flags); vmg.prev = vma_prev; vmg.just_expand = true; diff --git a/tools/testing/vma/tests/vma.c b/tools/testing/vma/tests/vma.c index 4a7b11a8a285..b2f068c3d6d0 100644 --- a/tools/testing/vma/tests/vma.c +++ b/tools/testing/vma/tests/vma.c @@ -22,7 +22,8 @@ static bool compare_legacy_flags(vm_flags_t legacy_flags, vma_flags_t flags) static bool test_copy_vma(void) { - vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); struct mm_struct mm = {}; bool need_locks = false; VMA_ITERATOR(vmi, &mm, 0); @@ -30,7 +31,7 @@ static bool test_copy_vma(void) /* Move backwards and do not merge. 
*/ - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vma_flags); vma_new = copy_vma(&vma, 0, 0x2000, 0, &need_locks); ASSERT_NE(vma_new, vma); ASSERT_EQ(vma_new->vm_start, 0); @@ -42,8 +43,8 @@ static bool test_copy_vma(void) /* Move a VMA into position next to another and merge the two. */ - vma = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x6000, 0x8000, 6, vm_flags); + vma = alloc_and_link_vma(&mm, 0, 0x2000, 0, vma_flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x8000, 6, vma_flags); vma_new = copy_vma(&vma, 0x4000, 0x2000, 4, &need_locks); vma_assert_attached(vma_new); @@ -61,7 +62,6 @@ static bool test_vma_flags_unchanged(void) struct vm_area_struct vma; struct vm_area_desc desc; - vma.flags = EMPTY_VMA_FLAGS; desc.vma_flags = EMPTY_VMA_FLAGS; From 7ec1885a7e283caaf6566aedc1eea5988d545f97 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:22 +0000 Subject: [PATCH 313/369] mm/vma: use new VMA flags for sticky flags logic Use the new vma_flags_t flags implementation to perform the logic around sticky flags and what flags are ignored on VMA merge. We make use of the new vma_flags_empty(), vma_flags_diff_pair(), and vma_flags_and_mask() functionality. Also update the VMA tests accordingly. Link: https://lkml.kernel.org/r/369574f06360ffa44707047e3b58eb4897345fba.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 32 ++++++++++++-------- mm/vma.c | 48 ++++++++++++++++++++++-------- tools/testing/vma/include/custom.h | 5 ---- tools/testing/vma/include/dup.h | 9 ++++-- 4 files changed, 62 insertions(+), 32 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7954a7a2b811..d7e647e31742 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -540,6 +540,7 @@ enum { /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) +#define VMA_ACCESS_FLAGS mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT) /* * Special vmas that are non-mergable, non-mlock()able. @@ -585,27 +586,32 @@ enum { * possesses it but the other does not, the merged VMA should nonetheless have * applied to it: * - * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its - * references cleared via /proc/$pid/clear_refs, any merged VMA - * should be considered soft-dirty also as it operates at a VMA - * granularity. + * VMA_SOFTDIRTY_BIT - if a VMA is marked soft-dirty, that is has not had its + * references cleared via /proc/$pid/clear_refs, any + * merged VMA should be considered soft-dirty also as it + * operates at a VMA granularity. * - * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that - * mapped page tables may contain metadata not described by the - * VMA and thus any merged VMA may also contain this metadata, - * and thus we must make this flag sticky. 
+ * VMA_MAYBE_GUARD_BIT - If a VMA may have guard regions in place it implies + * that mapped page tables may contain metadata not + * described by the VMA and thus any merged VMA may also + * contain this metadata, and thus we must make this flag + * sticky. */ -#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) +#ifdef CONFIG_MEM_SOFT_DIRTY +#define VMA_STICKY_FLAGS mk_vma_flags(VMA_SOFTDIRTY_BIT, VMA_MAYBE_GUARD_BIT) +#else +#define VMA_STICKY_FLAGS mk_vma_flags(VMA_MAYBE_GUARD_BIT) +#endif /* * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one * of these flags and the other not does not preclude a merge. * - * VM_STICKY - When merging VMAs, VMA flags must match, unless they are - * 'sticky'. If any sticky flags exist in either VMA, we simply - * set all of them on the merged VMA. + * VMA_STICKY_FLAGS - When merging VMAs, VMA flags must match, unless they + * are 'sticky'. If any sticky flags exist in either VMA, + * we simply set all of them on the merged VMA. */ -#define VM_IGNORE_MERGE VM_STICKY +#define VMA_IGNORE_MERGE_FLAGS VMA_STICKY_FLAGS /* * Flags which should result in page tables being copied on fork. These are diff --git a/mm/vma.c b/mm/vma.c index 4d21e7d8e93c..6af26619e020 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -86,10 +86,15 @@ static bool vma_is_fork_child(struct vm_area_struct *vma) static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *vma = merge_next ? 
vmg->next : vmg->prev; + vma_flags_t diff; if (!mpol_equal(vmg->policy, vma_policy(vma))) return false; - if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_IGNORE_MERGE) + + diff = vma_flags_diff_pair(&vma->flags, &vmg->vma_flags); + vma_flags_clear_mask(&diff, VMA_IGNORE_MERGE_FLAGS); + + if (!vma_flags_empty(&diff)) return false; if (vma->vm_file != vmg->file) return false; @@ -805,7 +810,8 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma) static __must_check struct vm_area_struct *vma_merge_existing_range( struct vma_merge_struct *vmg) { - vm_flags_t sticky_flags = vmg->vm_flags & VM_STICKY; + vma_flags_t sticky_flags = vma_flags_and_mask(&vmg->vma_flags, + VMA_STICKY_FLAGS); struct vm_area_struct *middle = vmg->middle; struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next; @@ -898,15 +904,22 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( vma_start_write(middle); if (merge_right) { + vma_flags_t next_sticky; + vma_start_write(next); vmg->target = next; - sticky_flags |= (next->vm_flags & VM_STICKY); + next_sticky = vma_flags_and_mask(&next->flags, VMA_STICKY_FLAGS); + vma_flags_set_mask(&sticky_flags, next_sticky); } if (merge_left) { + vma_flags_t prev_sticky; + vma_start_write(prev); vmg->target = prev; - sticky_flags |= (prev->vm_flags & VM_STICKY); + + prev_sticky = vma_flags_and_mask(&prev->flags, VMA_STICKY_FLAGS); + vma_flags_set_mask(&sticky_flags, prev_sticky); } if (merge_both) { @@ -976,7 +989,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( if (err || commit_merge(vmg)) goto abort; - vm_flags_set(vmg->target, sticky_flags); + vma_set_flags_mask(vmg->target, sticky_flags); khugepaged_enter_vma(vmg->target, vmg->vm_flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; @@ -1154,12 +1167,16 @@ int vma_expand(struct vma_merge_struct *vmg) struct vm_area_struct *target = vmg->target; struct vm_area_struct *next = vmg->next; bool remove_next = false; - vm_flags_t sticky_flags; + 
vma_flags_t sticky_flags = + vma_flags_and_mask(&vmg->vma_flags, VMA_STICKY_FLAGS); + vma_flags_t target_sticky; int ret = 0; mmap_assert_write_locked(vmg->mm); vma_start_write(target); + target_sticky = vma_flags_and_mask(&target->flags, VMA_STICKY_FLAGS); + if (next && target != next && vmg->end == next->vm_end) remove_next = true; @@ -1174,10 +1191,7 @@ int vma_expand(struct vma_merge_struct *vmg) VM_WARN_ON_VMG(target->vm_start < vmg->start || target->vm_end > vmg->end, vmg); - sticky_flags = vmg->vm_flags & VM_STICKY; - sticky_flags |= target->vm_flags & VM_STICKY; - if (remove_next) - sticky_flags |= next->vm_flags & VM_STICKY; + vma_flags_set_mask(&sticky_flags, target_sticky); /* * If we are removing the next VMA or copying from a VMA @@ -1194,13 +1208,18 @@ int vma_expand(struct vma_merge_struct *vmg) return ret; if (remove_next) { + vma_flags_t next_sticky; + vma_start_write(next); vmg->__remove_next = true; + + next_sticky = vma_flags_and_mask(&next->flags, VMA_STICKY_FLAGS); + vma_flags_set_mask(&sticky_flags, next_sticky); } if (commit_merge(vmg)) goto nomem; - vm_flags_set(target, sticky_flags); + vma_set_flags_mask(target, sticky_flags); return 0; nomem: @@ -1950,10 +1969,15 @@ out: */ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) { + vma_flags_t diff = vma_flags_diff_pair(&a->flags, &b->flags); + + vma_flags_clear_mask(&diff, VMA_ACCESS_FLAGS); + vma_flags_clear_mask(&diff, VMA_IGNORE_MERGE_FLAGS); + return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && a->vm_file == b->vm_file && - !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_IGNORE_MERGE)) && + vma_flags_empty(&diff) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); } diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 6200f938e586..7cdd0f60600a 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -134,8 +134,3 @@ static 
__always_inline bool vma_flags_same_mask(vma_flags_t *flags, vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) #define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \ VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT) -#ifdef CONFIG_MEM_SOFT_DIRTY -#define VMA_STICKY_FLAGS mk_vma_flags(VMA_SOFTDIRTY_BIT, VMA_MAYBE_GUARD_BIT) -#else -#define VMA_STICKY_FLAGS mk_vma_flags(VMA_MAYBE_GUARD_BIT) -#endif diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 1dee78c34872..65134303b645 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -338,6 +338,7 @@ enum { /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) +#define VMA_ACCESS_FLAGS mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT) /* * Special vmas that are non-mergable, non-mlock()able. @@ -363,9 +364,13 @@ enum { #define CAP_IPC_LOCK 14 -#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) +#ifdef CONFIG_MEM_SOFT_DIRTY +#define VMA_STICKY_FLAGS mk_vma_flags(VMA_SOFTDIRTY_BIT, VMA_MAYBE_GUARD_BIT) +#else +#define VMA_STICKY_FLAGS mk_vma_flags(VMA_MAYBE_GUARD_BIT) +#endif -#define VM_IGNORE_MERGE VM_STICKY +#define VMA_IGNORE_MERGE_FLAGS VMA_STICKY_FLAGS #define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD) From 06531d2bf352e26c620003c8420e2cea91009293 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:23 +0000 Subject: [PATCH 314/369] tools/testing/vma: fix VMA flag tests The VMA tests are incorrectly referencing NUM_VMA_FLAGS, which doesn't exist, rather they should reference NUM_VMA_FLAG_BITS. Additionally, remove the custom-written implementation of __mk_vma_flags() as this means we are not testing the code as present in the kernel, rather add the actual __mk_vma_flags() to dup.h and add #ifdef's to handle declarations differently depending on NUM_VMA_FLAG_BITS. 
Link: https://lkml.kernel.org/r/b19c63af3d5efdfe712bf5d5f89368a5360a60f7.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka (SUSE) Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- tools/testing/vma/include/custom.h | 19 ------- tools/testing/vma/include/dup.h | 21 ++++++- tools/testing/vma/tests/vma.c | 88 +++++++++++++++++++++++++----- 3 files changed, 92 insertions(+), 36 deletions(-) diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 7cdd0f60600a..8f33df02816a 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -29,8 +29,6 @@ extern unsigned long dac_mmap_min_addr; */ #define pr_warn_once pr_err -#define pgtable_supports_soft_dirty() 1 - struct anon_vma { struct anon_vma *root; struct rb_root_cached rb_root; @@ -99,23 +97,6 @@ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) refcount_set(&vma->vm_refcnt, 0); } -static __always_inline vma_flags_t __mk_vma_flags(size_t count, - const vma_flag_t *bits) -{ - vma_flags_t flags; - int i; - - /* - * For testing purposes: allow invalid bit specification so we can - * easily test. 
- */ - vma_flags_clear_all(&flags); - for (i = 0; i < count; i++) - if (bits[i] < NUM_VMA_FLAG_BITS) - vma_flags_set_flag(&flags, bits[i]); - return flags; -} - static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) { return PAGE_SIZE; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 65134303b645..3005e33d1ede 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -854,10 +854,21 @@ static inline void vm_flags_clear(struct vm_area_struct *vma, vma_flags_clear_word(&vma->flags, flags); } -static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits); +static __always_inline vma_flags_t __mk_vma_flags(size_t count, + const vma_flag_t *bits) +{ + vma_flags_t flags; + int i; -#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ - (const vma_flag_t []){__VA_ARGS__}) + vma_flags_clear_all(&flags); + for (i = 0; i < count; i++) + vma_flags_set_flag(&flags, bits[i]); + + return flags; +} + +#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ + (const vma_flag_t []){__VA_ARGS__}) static __always_inline bool vma_flags_test(const vma_flags_t *flags, vma_flag_t bit) @@ -1390,3 +1401,7 @@ static inline int get_sysctl_max_map_count(void) { return READ_ONCE(sysctl_max_map_count); } + +#ifndef pgtable_supports_soft_dirty +#define pgtable_supports_soft_dirty() IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) +#endif diff --git a/tools/testing/vma/tests/vma.c b/tools/testing/vma/tests/vma.c index b2f068c3d6d0..feea6d270233 100644 --- a/tools/testing/vma/tests/vma.c +++ b/tools/testing/vma/tests/vma.c @@ -5,11 +5,11 @@ static bool compare_legacy_flags(vm_flags_t legacy_flags, vma_flags_t flags) const unsigned long legacy_val = legacy_flags; /* The lower word should contain the precise same value. 
*/ const unsigned long flags_lower = flags.__vma_flags[0]; -#if NUM_VMA_FLAGS > BITS_PER_LONG +#if NUM_VMA_FLAG_BITS > BITS_PER_LONG int i; /* All bits in higher flag values should be zero. */ - for (i = 1; i < NUM_VMA_FLAGS / BITS_PER_LONG; i++) { + for (i = 1; i < NUM_VMA_FLAG_BITS / BITS_PER_LONG; i++) { if (flags.__vma_flags[i] != 0) return false; } @@ -116,6 +116,7 @@ static bool test_vma_flags_cleared(void) return true; } +#if NUM_VMA_FLAG_BITS > 64 /* * Assert that VMA flag functions that operate at the system word level function * correctly. @@ -124,10 +125,14 @@ static bool test_vma_flags_word(void) { vma_flags_t flags = EMPTY_VMA_FLAGS; const vma_flags_t comparison = - mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, 64, 65); + mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT + + , 64, 65 + ); /* Set some custom high flags. */ vma_flags_set(&flags, 64, 65); + /* Now overwrite the first word. */ vma_flags_overwrite_word(&flags, VM_READ | VM_WRITE); /* Ensure they are equal. */ @@ -158,12 +163,17 @@ static bool test_vma_flags_word(void) return true; } +#endif /* NUM_VMA_FLAG_BITS > 64 */ /* Ensure that vma_flags_test() and friends works correctly. */ static bool test_vma_flags_test(void) { const vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, - VMA_EXEC_BIT, 64, 65); + VMA_EXEC_BIT +#if NUM_VMA_FLAG_BITS > 64 + , 64, 65 +#endif + ); struct vm_area_desc desc = { .vma_flags = flags, }; @@ -198,7 +208,11 @@ static bool test_vma_flags_test(void) static bool test_vma_flags_test_any(void) { const vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, - VMA_EXEC_BIT, 64, 65); + VMA_EXEC_BIT +#if NUM_VMA_FLAG_BITS > 64 + , 64, 65 +#endif + ); struct vm_area_struct vma; struct vm_area_desc desc; @@ -224,10 +238,12 @@ static bool test_vma_flags_test_any(void) do_test(VMA_READ_BIT, VMA_MAYREAD_BIT, VMA_SEQ_READ_BIT); /* However, the ...test_all() variant should NOT pass. 
*/ do_test_all_false(VMA_READ_BIT, VMA_MAYREAD_BIT, VMA_SEQ_READ_BIT); +#if NUM_VMA_FLAG_BITS > 64 /* But should pass for flags present. */ do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64, 65); /* Also subsets... */ do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64); +#endif do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT); do_test_all_true(VMA_READ_BIT); @@ -291,8 +307,16 @@ static bool test_vma_flags_test_any(void) static bool test_vma_flags_clear(void) { vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, - VMA_EXEC_BIT, 64, 65); - vma_flags_t mask = mk_vma_flags(VMA_EXEC_BIT, 64); + VMA_EXEC_BIT +#if NUM_VMA_FLAG_BITS > 64 + , 64, 65 +#endif + ); + vma_flags_t mask = mk_vma_flags(VMA_EXEC_BIT +#if NUM_VMA_FLAG_BITS > 64 + , 64 +#endif + ); struct vm_area_struct vma; struct vm_area_desc desc; @@ -303,6 +327,7 @@ static bool test_vma_flags_clear(void) vma_flags_clear_mask(&flags, mask); vma_flags_clear_mask(&vma.flags, mask); vma_desc_clear_flags_mask(&desc, mask); +#if NUM_VMA_FLAG_BITS > 64 ASSERT_FALSE(vma_flags_test_any(&flags, VMA_EXEC_BIT, 64)); ASSERT_FALSE(vma_flags_test_any(&vma.flags, VMA_EXEC_BIT, 64)); ASSERT_FALSE(vma_desc_test_any(&desc, VMA_EXEC_BIT, 64)); @@ -310,6 +335,7 @@ static bool test_vma_flags_clear(void) vma_flags_set(&flags, VMA_EXEC_BIT, 64); vma_set_flags(&vma, VMA_EXEC_BIT, 64); vma_desc_set_flags(&desc, VMA_EXEC_BIT, 64); +#endif /* * Clear the flags and assert clear worked, then reset flags back to @@ -330,20 +356,27 @@ static bool test_vma_flags_clear(void) do_test_and_reset(VMA_READ_BIT); do_test_and_reset(VMA_WRITE_BIT); do_test_and_reset(VMA_EXEC_BIT); +#if NUM_VMA_FLAG_BITS > 64 do_test_and_reset(64); do_test_and_reset(65); +#endif /* Two flags, in different orders. 
*/ do_test_and_reset(VMA_READ_BIT, VMA_WRITE_BIT); do_test_and_reset(VMA_READ_BIT, VMA_EXEC_BIT); +#if NUM_VMA_FLAG_BITS > 64 do_test_and_reset(VMA_READ_BIT, 64); do_test_and_reset(VMA_READ_BIT, 65); +#endif do_test_and_reset(VMA_WRITE_BIT, VMA_READ_BIT); do_test_and_reset(VMA_WRITE_BIT, VMA_EXEC_BIT); +#if NUM_VMA_FLAG_BITS > 64 do_test_and_reset(VMA_WRITE_BIT, 64); do_test_and_reset(VMA_WRITE_BIT, 65); +#endif do_test_and_reset(VMA_EXEC_BIT, VMA_READ_BIT); do_test_and_reset(VMA_EXEC_BIT, VMA_WRITE_BIT); +#if NUM_VMA_FLAG_BITS > 64 do_test_and_reset(VMA_EXEC_BIT, 64); do_test_and_reset(VMA_EXEC_BIT, 65); do_test_and_reset(64, VMA_READ_BIT); @@ -354,6 +387,7 @@ static bool test_vma_flags_clear(void) do_test_and_reset(65, VMA_WRITE_BIT); do_test_and_reset(65, VMA_EXEC_BIT); do_test_and_reset(65, 64); +#endif /* Three flags. */ @@ -367,7 +401,11 @@ static bool test_vma_flags_clear(void) static bool test_vma_flags_empty(void) { vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, - VMA_EXEC_BIT, 64, 65); + VMA_EXEC_BIT +#if NUM_VMA_FLAG_BITS > 64 + , 64, 65 +#endif + ); ASSERT_FLAGS_NONEMPTY(&flags); vma_flags_clear(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT); @@ -386,10 +424,19 @@ static bool test_vma_flags_empty(void) static bool test_vma_flags_diff(void) { vma_flags_t flags1 = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, - VMA_EXEC_BIT, 64, 65); + VMA_EXEC_BIT +#if NUM_VMA_FLAG_BITS > 64 + , 64, 65 +#endif + ); + vma_flags_t flags2 = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, VMA_MAYWRITE_BIT, - VMA_MAYEXEC_BIT, 64, 65, 66, 67); + VMA_MAYEXEC_BIT +#if NUM_VMA_FLAG_BITS > 64 + , 64, 65, 66, 67 +#endif + ); vma_flags_t diff = vma_flags_diff_pair(&flags1, &flags2); #if NUM_VMA_FLAG_BITS > 64 @@ -432,12 +479,23 @@ static bool test_vma_flags_diff(void) static bool test_vma_flags_and(void) { vma_flags_t flags1 = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, - VMA_EXEC_BIT, 64, 65); + VMA_EXEC_BIT +#if NUM_VMA_FLAG_BITS > 64 + , 64, 65 +#endif + ); 
vma_flags_t flags2 = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, VMA_MAYWRITE_BIT, - VMA_MAYEXEC_BIT, 64, 65, 66, 67); - vma_flags_t flags3 = mk_vma_flags(VMA_IO_BIT, VMA_MAYBE_GUARD_BIT, - 68, 69); + VMA_MAYEXEC_BIT +#if NUM_VMA_FLAG_BITS > 64 + , 64, 65, 66, 67 +#endif + ); + vma_flags_t flags3 = mk_vma_flags(VMA_IO_BIT, VMA_MAYBE_GUARD_BIT +#if NUM_VMA_FLAG_BITS > 64 + , 68, 69 +#endif + ); vma_flags_t and = vma_flags_and_mask(&flags1, flags2); #if NUM_VMA_FLAG_BITS > 64 @@ -502,7 +560,9 @@ static void run_vma_tests(int *num_tests, int *num_fail) TEST(copy_vma); TEST(vma_flags_unchanged); TEST(vma_flags_cleared); +#if NUM_VMA_FLAG_BITS > 64 TEST(vma_flags_word); +#endif TEST(vma_flags_test); TEST(vma_flags_test_any); TEST(vma_flags_clear); From e8d464f4a94ccbcae8c9d3137ac5621b57ddd8a1 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:24 +0000 Subject: [PATCH 315/369] mm/vma: add append_vma_flags() helper In order to be able to efficiently combine VMA flag masks with additional VMA flag bits we need to extend the concept introduced in mk_vma_flags() and __mk_vma_flags() by allowing the specification of a VMA flag mask to append VMA flag bits to. Update __mk_vma_flags() to allow for this and update mk_vma_flags() accordingly, and also provide append_vma_flags() to allow for the caller to specify which VMA flags mask to append to. Finally, update the VMA flags tests to reflect the change. Link: https://lkml.kernel.org/r/9f928cd4688270002f2c0c3777fcc9b49cc7a8ea.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 20 ++++++++++++++------ tools/testing/vma/include/dup.h | 14 +++++++------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d7e647e31742..26cfb2fbe4db 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1042,13 +1042,11 @@ static __always_inline void vma_flags_set_flag(vma_flags_t *flags, __set_bit((__force int)bit, bitmap); } -static __always_inline vma_flags_t __mk_vma_flags(size_t count, - const vma_flag_t *bits) +static __always_inline vma_flags_t __mk_vma_flags(vma_flags_t flags, + size_t count, const vma_flag_t *bits) { - vma_flags_t flags; int i; - vma_flags_clear_all(&flags); for (i = 0; i < count; i++) vma_flags_set_flag(&flags, bits[i]); return flags; @@ -1064,8 +1062,18 @@ static __always_inline vma_flags_t __mk_vma_flags(size_t count, * The compiler cleverly optimises away all of the work and this ends up being * equivalent to aggregating the values manually. */ -#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ - (const vma_flag_t []){__VA_ARGS__}) +#define mk_vma_flags(...) __mk_vma_flags(EMPTY_VMA_FLAGS, \ + COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) + +/* + * Helper macro which acts like mk_vma_flags, only appending to a copy of the + * specified flags rather than establishing new flags. 
E.g.: + * + * vma_flags_t flags = append_vma_flags(VMA_STACK_DEFAULT_FLAGS, VMA_STACK_BIT, + * VMA_ACCOUNT_BIT); + */ +#define append_vma_flags(flags, ...) __mk_vma_flags(flags, \ + COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) /* * Test whether a specific VMA flag is set, e.g.: diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 3005e33d1ede..a2f311b5ea82 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -854,21 +854,21 @@ static inline void vm_flags_clear(struct vm_area_struct *vma, vma_flags_clear_word(&vma->flags, flags); } -static __always_inline vma_flags_t __mk_vma_flags(size_t count, - const vma_flag_t *bits) +static __always_inline vma_flags_t __mk_vma_flags(vma_flags_t flags, + size_t count, const vma_flag_t *bits) { - vma_flags_t flags; int i; - vma_flags_clear_all(&flags); for (i = 0; i < count; i++) vma_flags_set_flag(&flags, bits[i]); - return flags; } -#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \ - (const vma_flag_t []){__VA_ARGS__}) +#define mk_vma_flags(...) __mk_vma_flags(EMPTY_VMA_FLAGS, \ + COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) + +#define append_vma_flags(flags, ...) __mk_vma_flags(flags, \ + COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) static __always_inline bool vma_flags_test(const vma_flags_t *flags, vma_flag_t bit) From b22a48ec095e4777a0acb7b2f64ee36d3e60ba9b Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:25 +0000 Subject: [PATCH 316/369] tools/testing/vma: add simple test for append_vma_flags() Add a simple test for append_vma_flags() to assert that it behaves as expected. Additionally, include the VMA_REMAP_FLAGS definition in the VMA tests to allow us to use this value in the testing. 
Link: https://lkml.kernel.org/r/eebd946c5325ad7fae93027245a562eb1aeb68a2.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka (SUSE) Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- tools/testing/vma/include/dup.h | 3 +++ tools/testing/vma/tests/vma.c | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index a2f311b5ea82..802b3d97b627 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -345,6 +345,9 @@ enum { */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +#define VMA_REMAP_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, \ + VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT) + #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) #define TASK_SIZE_LOW DEFAULT_MAP_WINDOW #define TASK_SIZE_MAX DEFAULT_MAP_WINDOW diff --git a/tools/testing/vma/tests/vma.c b/tools/testing/vma/tests/vma.c index feea6d270233..98e465fb1bf2 100644 --- a/tools/testing/vma/tests/vma.c +++ b/tools/testing/vma/tests/vma.c @@ -555,6 +555,30 @@ static bool test_vma_flags_and(void) return true; } +/* Ensure append_vma_flags() acts as expected. 
*/ +static bool test_append_vma_flags(void) +{ + vma_flags_t flags = append_vma_flags(VMA_REMAP_FLAGS, VMA_READ_BIT, + VMA_WRITE_BIT +#if NUM_VMA_FLAG_BITS > 64 + , 64, 65 +#endif + ); + + ASSERT_FLAGS_SAME(&flags, VMA_IO_BIT, VMA_PFNMAP_BIT, + VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT, VMA_READ_BIT, + VMA_WRITE_BIT +#if NUM_VMA_FLAG_BITS > 64 + , 64, 65 +#endif + ); + + flags = append_vma_flags(EMPTY_VMA_FLAGS, VMA_READ_BIT, VMA_WRITE_BIT); + ASSERT_FLAGS_SAME(&flags, VMA_READ_BIT, VMA_WRITE_BIT); + + return true; +} + static void run_vma_tests(int *num_tests, int *num_fail) { TEST(copy_vma); @@ -569,4 +593,5 @@ static void run_vma_tests(int *num_tests, int *num_fail) TEST(vma_flags_empty); TEST(vma_flags_diff); TEST(vma_flags_and); + TEST(append_vma_flags); } From 5fb55e951cf591c5e2d45273ceadbdcd0c44932c Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:26 +0000 Subject: [PATCH 317/369] mm: unexport vm_brk_flags() and eliminate vm_flags parameter This function is only used by elf_load(), and that is a static function that doesn't need an exported symbol to invoke an internal function, so un-EXPORT_SYMBOLS() it. Also, the vm_flags parameter is unnecessary, as we only ever set VM_EXEC, so simply make this parameter a boolean. While we're here, clean up the mm.h definitions for the various vm_xxx() helpers so we actually specify parameter names and elide the redundant extern's. Link: https://lkml.kernel.org/r/7bada48ddf3f9dbd3e6c4fc50ec2f4de97706f52.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- fs/binfmt_elf.c | 3 +-- include/linux/mm.h | 12 ++++++------ mm/mmap.c | 8 ++------ 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index fb857faaf0d6..16a56b6b3f6c 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -453,14 +453,13 @@ static unsigned long elf_load(struct file *filep, unsigned long addr, zero_end = ELF_PAGEALIGN(zero_end); error = vm_brk_flags(zero_start, zero_end - zero_start, - prot & PROT_EXEC ? VM_EXEC : 0); + prot & PROT_EXEC); if (error) map_addr = error; } return map_addr; } - static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr) { elf_addr_t min_addr = -1; diff --git a/include/linux/mm.h b/include/linux/mm.h index 26cfb2fbe4db..5b85ffc2760c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3991,12 +3991,12 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {} #endif /* This takes the mm semaphore itself */ -extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long); -extern int vm_munmap(unsigned long, size_t); -extern unsigned long __must_check vm_mmap(struct file *, unsigned long, - unsigned long, unsigned long, - unsigned long, unsigned long); -extern unsigned long __must_check vm_mmap_shadow_stack(unsigned long addr, +int __must_check vm_brk_flags(unsigned long addr, unsigned long request, bool is_exec); +int vm_munmap(unsigned long start, size_t len); +unsigned long __must_check 
vm_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long offset); +unsigned long __must_check vm_mmap_shadow_stack(unsigned long addr, unsigned long len, unsigned long flags); struct vm_unmapped_area_info { diff --git a/mm/mmap.c b/mm/mmap.c index 79544d893411..2d2b814978bf 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1201,8 +1201,9 @@ out: return ret; } -int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags) +int vm_brk_flags(unsigned long addr, unsigned long request, bool is_exec) { + const vm_flags_t vm_flags = is_exec ? VM_EXEC : 0; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; unsigned long len; @@ -1217,10 +1218,6 @@ int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags) if (!len) return 0; - /* Until we need other flags, refuse anything except VM_EXEC. */ - if ((vm_flags & (~VM_EXEC)) != 0) - return -EINVAL; - if (mmap_write_lock_killable(mm)) return -EINTR; @@ -1246,7 +1243,6 @@ limits_failed: mmap_write_unlock(mm); return ret; } -EXPORT_SYMBOL(vm_brk_flags); static unsigned long tear_down_vmas(struct mm_struct *mm, struct vma_iterator *vmi, From 3ee584538259c356c66146ac46f2e4fd2ba28bee Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:27 +0000 Subject: [PATCH 318/369] mm/vma: introduce vma_flags_same[_mask/_pair]() Add helpers to determine if two sets of VMA flags are precisely the same, that is - that every flag set one is set in another, and neither contain any flags not set in the other. We also introduce vma_flags_same_pair() for cases where we want to compare two sets of VMA flags which are both non-const values. Also update the VMA tests to reflect the change, we already implicitly test that this functions correctly having used it for testing purposes previously. 
Link: https://lkml.kernel.org/r/4f764bf619e77205837c7c819b62139ef6337ca3.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 28 ++++++++++++++++++++++++++++ tools/testing/vma/include/custom.h | 11 ----------- tools/testing/vma/include/dup.h | 21 +++++++++++++++++++++ 3 files changed, 49 insertions(+), 11 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5b85ffc2760c..1f3e9100164d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1202,6 +1202,34 @@ static __always_inline vma_flags_t vma_flags_diff_pair(const vma_flags_t *flags, return dst; } +/* Determine if flags and flags_other have precisely the same flags set. */ +static __always_inline bool vma_flags_same_pair(const vma_flags_t *flags, + const vma_flags_t *flags_other) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_other = flags_other->__vma_flags; + + return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); +} + +/* Determine if flags and flags_other have precisely the same flags set. 
*/ +static __always_inline bool vma_flags_same_mask(const vma_flags_t *flags, + vma_flags_t flags_other) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_other = flags_other.__vma_flags; + + return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); +} + +/* + * Helper macro to determine if only the specific flags are set, e.g.: + * + * if (vma_flags_same(&flags, VMA_WRITE_BIT) { ... } + */ +#define vma_flags_same(flags, ...) \ + vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) + /* * Helper to test that ALL specified flags are set in a VMA. * diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 8f33df02816a..2c498e713fbd 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -102,16 +102,5 @@ static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) return PAGE_SIZE; } -/* Place here until needed in the kernel code. */ -static __always_inline bool vma_flags_same_mask(vma_flags_t *flags, - vma_flags_t flags_other) -{ - const unsigned long *bitmap = flags->__vma_flags; - const unsigned long *bitmap_other = flags_other.__vma_flags; - - return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); -} -#define vma_flags_same(flags, ...) 
\ - vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) #define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \ VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT) diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 802b3d97b627..65f630923461 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -954,6 +954,27 @@ static __always_inline vma_flags_t vma_flags_diff_pair(const vma_flags_t *flags, return dst; } +static __always_inline bool vma_flags_same_pair(const vma_flags_t *flags, + const vma_flags_t *flags_other) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_other = flags_other->__vma_flags; + + return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); +} + +static __always_inline bool vma_flags_same_mask(const vma_flags_t *flags, + vma_flags_t flags_other) +{ + const unsigned long *bitmap = flags->__vma_flags; + const unsigned long *bitmap_other = flags_other.__vma_flags; + + return bitmap_equal(bitmap, bitmap_other, NUM_VMA_FLAG_BITS); +} + +#define vma_flags_same(flags, ...) \ + vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) + static inline bool vma_test_all_mask(const struct vm_area_struct *vma, vma_flags_t flags) { From c8555bc95d6222aa729b3a1195e07e566707ec02 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:28 +0000 Subject: [PATCH 319/369] mm/vma: introduce [vma_flags,legacy]_to_[legacy,vma_flags]() helpers While we are still converting VMA flags from vma_flags_t to vm_flags_t, introduce helpers to convert between the two to allow for iterative development without having to 'change the world' in a single commit'. Also update VMA flags tests to reflect the change. 
Finally, refresh vma_flags_overwrite_word(), vma_flag_overwrite_word_once(), vma_flags_set_word() and vma_flags_clear_word() in the VMA tests to reflect current kernel implementations - this should make no functional difference, but keeps the logic consistent between the two. Link: https://lkml.kernel.org/r/d3569470dbb3ae79134ca7c3eb3fc4df7086e874.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 26 ++++++++++++++++++++++++ tools/testing/vma/include/dup.h | 36 +++++++++++++++++++++++++++++---- 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8ef84849953f..1da8fb04133f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1069,6 +1069,18 @@ static __always_inline void vma_flags_clear_all(vma_flags_t *flags) bitmap_zero(flags->__vma_flags, NUM_VMA_FLAG_BITS); } +/* + * Helper function which converts a vma_flags_t value to a legacy vm_flags_t + * value. This is only valid if the input flags value can be expressed in a + * system word. + * + * Will be removed once the conversion to VMA flags is complete. 
+ */ +static __always_inline vm_flags_t vma_flags_to_legacy(vma_flags_t flags) +{ + return (vm_flags_t)flags.__vma_flags[0]; +} + /* * Copy value to the first system word of VMA flags, non-atomically. * @@ -1082,6 +1094,20 @@ static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long va bitmap[0] = value; } +/* + * Helper function which converts a legacy vm_flags_t value to a vma_flags_t + * value. + * + * Will be removed once the conversion to VMA flags is complete. + */ +static __always_inline vma_flags_t legacy_to_vma_flags(vm_flags_t flags) +{ + vma_flags_t ret = EMPTY_VMA_FLAGS; + + vma_flags_overwrite_word(&ret, flags); + return ret; +} + /* * Copy value to the first system word of VMA flags ONCE, non-atomically. * diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 65f630923461..f49af21319ba 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -766,7 +766,9 @@ static inline bool mm_flags_test(int flag, const struct mm_struct *mm) */ static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) { - *ACCESS_PRIVATE(flags, __vma_flags) = value; + unsigned long *bitmap = flags->__vma_flags; + + bitmap[0] = value; } /* @@ -777,7 +779,7 @@ static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long va */ static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + unsigned long *bitmap = flags->__vma_flags; WRITE_ONCE(*bitmap, value); } @@ -785,7 +787,7 @@ static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned lo /* Update the first system word of VMA flags setting bits, non-atomically. 
*/ static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + unsigned long *bitmap = flags->__vma_flags; *bitmap |= value; } @@ -793,7 +795,7 @@ static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) /* Update the first system word of VMA flags clearing bits, non-atomically. */ static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + unsigned long *bitmap = flags->__vma_flags; *bitmap &= ~value; } @@ -803,6 +805,32 @@ static __always_inline void vma_flags_clear_all(vma_flags_t *flags) bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); } +/* + * Helper function which converts a vma_flags_t value to a legacy vm_flags_t + * value. This is only valid if the input flags value can be expressed in a + * system word. + * + * Will be removed once the conversion to VMA flags is complete. + */ +static __always_inline vm_flags_t vma_flags_to_legacy(vma_flags_t flags) +{ + return (vm_flags_t)flags.__vma_flags[0]; +} + +/* + * Helper function which converts a legacy vm_flags_t value to a vma_flags_t + * value. + * + * Will be removed once the conversion to VMA flags is complete. + */ +static __always_inline vma_flags_t legacy_to_vma_flags(vm_flags_t flags) +{ + vma_flags_t ret = EMPTY_VMA_FLAGS; + + vma_flags_overwrite_word(&ret, flags); + return ret; +} + static __always_inline void vma_flags_set_flag(vma_flags_t *flags, vma_flag_t bit) { From a8add93f805b1e36f056b214568aa131fd6e4cbd Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:29 +0000 Subject: [PATCH 320/369] tools/testing/vma: test that legacy flag helpers work correctly Update the existing compare_legacy_flags() predicate function to assert that legacy_to_vma_flags() and vma_flags_to_legacy() behave as expected. 
Link: https://lkml.kernel.org/r/3374e50053adb65818fde948ae3488e1e29ae8b1.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka (SUSE) Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- tools/testing/vma/tests/vma.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/vma/tests/vma.c b/tools/testing/vma/tests/vma.c index 98e465fb1bf2..1fae25170ff7 100644 --- a/tools/testing/vma/tests/vma.c +++ b/tools/testing/vma/tests/vma.c @@ -5,6 +5,7 @@ static bool compare_legacy_flags(vm_flags_t legacy_flags, vma_flags_t flags) const unsigned long legacy_val = legacy_flags; /* The lower word should contain the precise same value. */ const unsigned long flags_lower = flags.__vma_flags[0]; + vma_flags_t converted_flags; #if NUM_VMA_FLAG_BITS > BITS_PER_LONG int i; @@ -17,6 +18,11 @@ static bool compare_legacy_flags(vm_flags_t legacy_flags, vma_flags_t flags) static_assert(sizeof(legacy_flags) == sizeof(unsigned long)); + /* Assert that legacy flag helpers work correctly. 
*/ + converted_flags = legacy_to_vma_flags(legacy_flags); + ASSERT_FLAGS_SAME_MASK(&converted_flags, flags); + ASSERT_EQ(vma_flags_to_legacy(flags), legacy_flags); + return legacy_val == flags_lower; } From fb67bba5d9b8561f433695c8916c097910193561 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:30 +0000 Subject: [PATCH 321/369] mm/vma: introduce vma_test[_any[_mask]](), and make inlining consistent Introduce helper functions and macros to make it convenient to test flags and flag masks for VMAs, specifically: * vma_test() - determine if a single VMA flag is set in a VMA. * vma_test_any_mask() - determine if any flags in a vma_flags_t value are set in a VMA. * vma_test_any() - Helper macro to test if any of specific flags are set. Also, there are a mix of 'inline's and '__always_inline's in VMA helper function declarations, update to consistently use __always_inline. Finally, update the VMA tests to reflect the changes. Link: https://lkml.kernel.org/r/be1d71f08307d747a82232cbd8664a88c0f41419.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 49 +++++++++++++++++++++----- include/linux/mm_types.h | 12 ++++--- tools/testing/vma/include/dup.h | 61 +++++++++++++++++++++------------ 3 files changed, 88 insertions(+), 34 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 1f3e9100164d..f704d7cf2871 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -994,7 +994,8 @@ static inline void vm_flags_mod(struct vm_area_struct *vma, __vm_flags_mod(vma, set, clear); } -static inline bool __vma_atomic_valid_flag(struct vm_area_struct *vma, vma_flag_t bit) +static __always_inline bool __vma_atomic_valid_flag(struct vm_area_struct *vma, + vma_flag_t bit) { const vm_flags_t mask = BIT((__force int)bit); @@ -1009,7 +1010,8 @@ static inline bool __vma_atomic_valid_flag(struct vm_area_struct *vma, vma_flag_ * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific * valid flags are allowed to do this. */ -static inline void vma_set_atomic_flag(struct vm_area_struct *vma, vma_flag_t bit) +static __always_inline void vma_set_atomic_flag(struct vm_area_struct *vma, + vma_flag_t bit) { unsigned long *bitmap = vma->flags.__vma_flags; @@ -1025,7 +1027,8 @@ static inline void vma_set_atomic_flag(struct vm_area_struct *vma, vma_flag_t bi * This is necessarily racey, so callers must ensure that serialisation is * achieved through some other means, or that races are permissible. 
*/ -static inline bool vma_test_atomic_flag(struct vm_area_struct *vma, vma_flag_t bit) +static __always_inline bool vma_test_atomic_flag(struct vm_area_struct *vma, + vma_flag_t bit) { if (__vma_atomic_valid_flag(vma, bit)) return test_bit((__force int)bit, &vma->vm_flags); @@ -1230,13 +1233,41 @@ static __always_inline bool vma_flags_same_mask(const vma_flags_t *flags, #define vma_flags_same(flags, ...) \ vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) +/* + * Test whether a specific flag in the VMA is set, e.g.: + * + * if (vma_test(vma, VMA_READ_BIT)) { ... } + */ +static __always_inline bool vma_test(const struct vm_area_struct *vma, + vma_flag_t bit) +{ + return vma_flags_test(&vma->flags, bit); +} + +/* Helper to test any VMA flags in a VMA . */ +static __always_inline bool vma_test_any_mask(const struct vm_area_struct *vma, + vma_flags_t flags) +{ + return vma_flags_test_any_mask(&vma->flags, flags); +} + +/* + * Helper macro for testing whether any VMA flags are set in a VMA, + * e.g.: + * + * if (vma_test_any(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, + * VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... } + */ +#define vma_test_any(vma, ...) \ + vma_test_any_mask(vma, mk_vma_flags(__VA_ARGS__)) + /* * Helper to test that ALL specified flags are set in a VMA. * * Note: appropriate locks must be held, this function does not acquire them for * you. */ -static inline bool vma_test_all_mask(const struct vm_area_struct *vma, +static __always_inline bool vma_test_all_mask(const struct vm_area_struct *vma, vma_flags_t flags) { return vma_flags_test_all_mask(&vma->flags, flags); @@ -1256,7 +1287,7 @@ static inline bool vma_test_all_mask(const struct vm_area_struct *vma, * Note: appropriate locks must be held, this function does not acquire them for * you. 
*/ -static inline void vma_set_flags_mask(struct vm_area_struct *vma, +static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma, vma_flags_t flags) { vma_flags_set_mask(&vma->flags, flags); @@ -1286,7 +1317,7 @@ static __always_inline bool vma_desc_test(const struct vm_area_desc *desc, } /* Helper to test any VMA flags in a VMA descriptor. */ -static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, +static __always_inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_flags_t flags) { return vma_flags_test_any_mask(&desc->vma_flags, flags); @@ -1303,7 +1334,7 @@ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) /* Helper to test all VMA flags in a VMA descriptor. */ -static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, +static __always_inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, vma_flags_t flags) { return vma_flags_test_all_mask(&desc->vma_flags, flags); @@ -1319,7 +1350,7 @@ static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, vma_desc_test_all_mask(desc, mk_vma_flags(__VA_ARGS__)) /* Helper to set all VMA flags in a VMA descriptor. */ -static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, +static __always_inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) { vma_flags_set_mask(&desc->vma_flags, flags); @@ -1336,7 +1367,7 @@ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) /* Helper to clear all VMA flags in a VMA descriptor. 
*/ -static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, +static __always_inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, vma_flags_t flags) { vma_flags_clear_mask(&desc->vma_flags, flags); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1da8fb04133f..38fe6b915024 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1087,7 +1087,8 @@ static __always_inline vm_flags_t vma_flags_to_legacy(vma_flags_t flags) * IMPORTANT: This does not overwrite bytes past the first system word. The * caller must account for this. */ -static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_overwrite_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -1114,7 +1115,8 @@ static __always_inline vma_flags_t legacy_to_vma_flags(vm_flags_t flags) * IMPORTANT: This does not overwrite bytes past the first system word. The * caller must account for this. */ -static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_overwrite_word_once(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -1122,7 +1124,8 @@ static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned lo } /* Update the first system word of VMA flags setting bits, non-atomically. */ -static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_set_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -1130,7 +1133,8 @@ static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) } /* Update the first system word of VMA flags clearing bits, non-atomically. 
*/ -static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_clear_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index f49af21319ba..f9fe07a8a443 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -764,7 +764,8 @@ static inline bool mm_flags_test(int flag, const struct mm_struct *mm) * IMPORTANT: This does not overwrite bytes past the first system word. The * caller must account for this. */ -static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_overwrite_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -777,7 +778,8 @@ static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long va * IMPORTANT: This does not overwrite bytes past the first system word. The * caller must account for this. */ -static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_overwrite_word_once(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -785,7 +787,8 @@ static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned lo } /* Update the first system word of VMA flags setting bits, non-atomically. */ -static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_set_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -793,7 +796,8 @@ static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) } /* Update the first system word of VMA flags clearing bits, non-atomically. 
*/ -static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) +static __always_inline void vma_flags_clear_word(vma_flags_t *flags, + unsigned long value) { unsigned long *bitmap = flags->__vma_flags; @@ -1003,8 +1007,23 @@ static __always_inline bool vma_flags_same_mask(const vma_flags_t *flags, #define vma_flags_same(flags, ...) \ vma_flags_same_mask(flags, mk_vma_flags(__VA_ARGS__)) -static inline bool vma_test_all_mask(const struct vm_area_struct *vma, - vma_flags_t flags) +static __always_inline bool vma_test(const struct vm_area_struct *vma, + vma_flag_t bit) +{ + return vma_flags_test(&vma->flags, bit); +} + +static __always_inline bool vma_test_any_mask(const struct vm_area_struct *vma, + vma_flags_t flags) +{ + return vma_flags_test_any_mask(&vma->flags, flags); +} + +#define vma_test_any(vma, ...) \ + vma_test_any_mask(vma, mk_vma_flags(__VA_ARGS__)) + +static __always_inline bool vma_test_all_mask(const struct vm_area_struct *vma, + vma_flags_t flags) { return vma_flags_test_all_mask(&vma->flags, flags); } @@ -1012,14 +1031,8 @@ static inline bool vma_test_all_mask(const struct vm_area_struct *vma, #define vma_test_all(vma, ...) 
\ vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) -static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) -{ - return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == - (VM_SHARED | VM_MAYWRITE); -} - -static inline void vma_set_flags_mask(struct vm_area_struct *vma, - vma_flags_t flags) +static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma, + vma_flags_t flags) { vma_flags_set_mask(&vma->flags, flags); } @@ -1033,8 +1046,8 @@ static __always_inline bool vma_desc_test(const struct vm_area_desc *desc, return vma_flags_test(&desc->vma_flags, bit); } -static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, - vma_flags_t flags) +static __always_inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, + vma_flags_t flags) { return vma_flags_test_any_mask(&desc->vma_flags, flags); } @@ -1042,7 +1055,7 @@ static inline bool vma_desc_test_any_mask(const struct vm_area_desc *desc, #define vma_desc_test_any(desc, ...) \ vma_desc_test_any_mask(desc, mk_vma_flags(__VA_ARGS__)) -static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, +static __always_inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, vma_flags_t flags) { return vma_flags_test_all_mask(&desc->vma_flags, flags); @@ -1051,8 +1064,8 @@ static inline bool vma_desc_test_all_mask(const struct vm_area_desc *desc, #define vma_desc_test_all(desc, ...) \ vma_desc_test_all_mask(desc, mk_vma_flags(__VA_ARGS__)) -static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, - vma_flags_t flags) +static __always_inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, + vma_flags_t flags) { vma_flags_set_mask(&desc->vma_flags, flags); } @@ -1060,8 +1073,8 @@ static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, #define vma_desc_set_flags(desc, ...) 
\ vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) -static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, - vma_flags_t flags) +static __always_inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, + vma_flags_t flags) { vma_flags_clear_mask(&desc->vma_flags, flags); } @@ -1069,6 +1082,12 @@ static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, #define vma_desc_clear_flags(desc, ...) \ vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) +static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) +{ + return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == + (VM_SHARED | VM_MAYWRITE); +} + static inline bool is_shared_maywrite(const vma_flags_t *flags) { return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT); From 63cdb667d1ec3683dd6b0358fce7cae7da7fffc7 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:31 +0000 Subject: [PATCH 322/369] tools/testing/vma: update VMA flag tests to test vma_test[_any_mask]() Update the existing test logic to assert that vma_test(), vma_test_any() and vma_test_any_mask() (implicitly tested via vma_test_any()) are functioning correctly. We already have tests for other variants like this, so it's simply a matter of expanding those tests to also include tests for the VMA-specific helpers. Link: https://lkml.kernel.org/r/dea3e97c6c3dd86f1a3f1a0703241b03f6e3a33f.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka (SUSE) Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- tools/testing/vma/tests/vma.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tools/testing/vma/tests/vma.c b/tools/testing/vma/tests/vma.c index 1fae25170ff7..1395d55a1e02 100644 --- a/tools/testing/vma/tests/vma.c +++ b/tools/testing/vma/tests/vma.c @@ -183,13 +183,18 @@ static bool test_vma_flags_test(void) struct vm_area_desc desc = { .vma_flags = flags, }; + struct vm_area_struct vma = { + .flags = flags, + }; #define do_test(_flag) \ ASSERT_TRUE(vma_flags_test(&flags, _flag)); \ + ASSERT_TRUE(vma_test(&vma, _flag)); \ ASSERT_TRUE(vma_desc_test(&desc, _flag)) #define do_test_false(_flag) \ ASSERT_FALSE(vma_flags_test(&flags, _flag)); \ + ASSERT_FALSE(vma_test(&vma, _flag)); \ ASSERT_FALSE(vma_desc_test(&desc, _flag)) do_test(VMA_READ_BIT); @@ -219,15 +224,17 @@ static bool test_vma_flags_test_any(void) , 64, 65 #endif ); - struct vm_area_struct vma; - struct vm_area_desc desc; - - vma.flags = flags; - desc.vma_flags = flags; + struct vm_area_struct vma = { + .flags = flags, + }; + struct vm_area_desc desc = { + .vma_flags = flags, + }; #define do_test(...) \ ASSERT_TRUE(vma_flags_test_any(&flags, __VA_ARGS__)); \ - ASSERT_TRUE(vma_desc_test_any(&desc, __VA_ARGS__)) + ASSERT_TRUE(vma_desc_test_any(&desc, __VA_ARGS__)); \ + ASSERT_TRUE(vma_test_any(&vma, __VA_ARGS__)); #define do_test_all_true(...) 
\ ASSERT_TRUE(vma_flags_test_all(&flags, __VA_ARGS__)); \ From e79d1c500f52506b9eab39e81017e30b76f2864d Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:32 +0000 Subject: [PATCH 323/369] mm: introduce vma_flags_count() and vma[_flags]_test_single_mask() vma_flags_count() determines how many bits are set in VMA flags, using bitmap_weight(). vma_flags_test_single_mask() determines if a vma_flags_t set of flags contains a single flag specified as another vma_flags_t value, or if the sought flag mask is empty, it is defined to return false. This is useful when we want to declare a VMA flag as optionally a single flag in a mask or empty depending on kernel configuration. This allows us to have VM_NONE-like semantics when checking whether the flag is set. In a subsequent patch, we introduce the use of VMA_DROPPABLE of type vma_flags_t using precisely these semantics. It would be actively confusing to use vma_flags_test_any_single_mask() for this (and vma_flags_test_all_mask() is not correct to use here, as it trivially returns true when tested against an empty vma flags mask). We introduce vma_flags_count() to be able to assert that the compared flag mask is singular or empty, checked when CONFIG_DEBUG_VM is enabled. Also update the VMA tests as part of this change. Link: https://lkml.kernel.org/r/cd778dd02b9f2a01eb54d25a49dea8ec2ddf7753.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 46 ++++++++++++++++++++++++++++++ tools/testing/vma/include/custom.h | 6 ---- tools/testing/vma/include/dup.h | 21 ++++++++++++++ tools/testing/vma/vma_internal.h | 6 ++++ 4 files changed, 73 insertions(+), 6 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f704d7cf2871..de72382efac2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1078,6 +1078,14 @@ static __always_inline vma_flags_t __mk_vma_flags(vma_flags_t flags, #define append_vma_flags(flags, ...) __mk_vma_flags(flags, \ COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) +/* Calculates the number of set bits in the specified VMA flags. */ +static __always_inline int vma_flags_count(const vma_flags_t *flags) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return bitmap_weight(bitmap, NUM_VMA_FLAG_BITS); +} + /* * Test whether a specific VMA flag is set, e.g.: * @@ -1153,6 +1161,26 @@ static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, #define vma_flags_test_all(flags, ...) \ vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__)) +/* + * Helper to test that a flag mask of type vma_flags_t has a SINGLE flag set + * (returning false if flagmask has no flags set). + * + * This is defined to make the semantics clearer when testing an optionally + * defined VMA flags mask, e.g.: + * + * if (vma_flags_test_single_mask(&flags, VMA_DROPPABLE)) { ... 
} + * + * When VMA_DROPPABLE is defined if available, or set to EMPTY_VMA_FLAGS + * otherwise. + */ +static __always_inline bool vma_flags_test_single_mask(const vma_flags_t *flags, + vma_flags_t flagmask) +{ + VM_WARN_ON_ONCE(vma_flags_count(&flagmask) > 1); + + return vma_flags_test_any_mask(flags, flagmask); +} + /* Set each of the to_set flags in flags, non-atomically. */ static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set) @@ -1281,6 +1309,24 @@ static __always_inline bool vma_test_all_mask(const struct vm_area_struct *vma, #define vma_test_all(vma, ...) \ vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) +/* + * Helper to test that a flag mask of type vma_flags_t has a SINGLE flag set + * (returning false if flagmask has no flags set). + * + * This is useful when a flag needs to be either defined or not depending upon + * kernel configuration, e.g.: + * + * if (vma_test_single_mask(vma, VMA_DROPPABLE)) { ... } + * + * When VMA_DROPPABLE is defined if available, or set to EMPTY_VMA_FLAGS + * otherwise. + */ +static __always_inline bool +vma_test_single_mask(const struct vm_area_struct *vma, vma_flags_t flagmask) +{ + return vma_flags_test_single_mask(&vma->flags, flagmask); +} + /* * Helper to set all VMA flags in a VMA. 
* diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index 2c498e713fbd..b7d9eb0a44e4 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -15,12 +15,6 @@ extern unsigned long dac_mmap_min_addr; #define dac_mmap_min_addr 0UL #endif -#define VM_WARN_ON(_expr) (WARN_ON(_expr)) -#define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr)) -#define VM_WARN_ON_VMG(_expr, _vmg) (WARN_ON(_expr)) -#define VM_BUG_ON(_expr) (BUG_ON(_expr)) -#define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr)) - #define TASK_SIZE ((1ul << 47)-PAGE_SIZE) /* diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index f9fe07a8a443..244ee02dc21d 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -905,6 +905,13 @@ static __always_inline vma_flags_t __mk_vma_flags(vma_flags_t flags, #define append_vma_flags(flags, ...) __mk_vma_flags(flags, \ COUNT_ARGS(__VA_ARGS__), (const vma_flag_t []){__VA_ARGS__}) +static __always_inline int vma_flags_count(const vma_flags_t *flags) +{ + const unsigned long *bitmap = flags->__vma_flags; + + return bitmap_weight(bitmap, NUM_VMA_FLAG_BITS); +} + static __always_inline bool vma_flags_test(const vma_flags_t *flags, vma_flag_t bit) { @@ -952,6 +959,14 @@ static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags, #define vma_flags_test_all(flags, ...) \ vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__)) +static __always_inline bool vma_flags_test_single_mask(const vma_flags_t *flags, + vma_flags_t flagmask) +{ + VM_WARN_ON_ONCE(vma_flags_count(&flagmask) > 1); + + return vma_flags_test_any_mask(flags, flagmask); +} + static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set) { unsigned long *bitmap = flags->__vma_flags; @@ -1031,6 +1046,12 @@ static __always_inline bool vma_test_all_mask(const struct vm_area_struct *vma, #define vma_test_all(vma, ...) 
\ vma_test_all_mask(vma, mk_vma_flags(__VA_ARGS__)) +static __always_inline bool +vma_test_single_mask(const struct vm_area_struct *vma, vma_flags_t flagmask) +{ + return vma_flags_test_single_mask(&vma->flags, flagmask); +} + static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma, vma_flags_t flags) { diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 0e1121e2ef23..e12ab2c80f95 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -51,6 +51,12 @@ typedef unsigned long pgprotval_t; typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; typedef __bitwise unsigned int vm_fault_t; +#define VM_WARN_ON(_expr) (WARN_ON(_expr)) +#define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr)) +#define VM_WARN_ON_VMG(_expr, _vmg) (WARN_ON(_expr)) +#define VM_BUG_ON(_expr) (BUG_ON(_expr)) +#define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr)) + #include "include/stubs.h" #include "include/dup.h" #include "include/custom.h" From bbbc17cb023018605457f455202e028cc1ce0418 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:33 +0000 Subject: [PATCH 324/369] tools/testing/vma: test vma_flags_count,vma[_flags]_test_single_mask Update the VMA tests to assert that vma_flags_count() behaves as expected, as well as vma_flags_test_single_mask() and vma_test_single_mask(). For the test functions we can simply update the existing vma_test(), et al. test to also test the single_mask variants. We also add some explicit testing of an empty VMA flag to this test to ensure this is handled properly. In order to test vma_flags_count() we simply take an existing set of flags and gradually remove flags ensuring the count remains as expected throughout. We also update the vma[_flags]_test_all() tests to make clear the semantics that we expect vma[_flags]_test_all(..., EMPTY_VMA_FLAGS) to return true, as trivially, all flags of none are always set in VMA flags. 
Link: https://lkml.kernel.org/r/4af95d559cd2af0ba3388de1e1386b9f94c0e009.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka (SUSE) Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- tools/testing/vma/tests/vma.c | 63 ++++++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 9 deletions(-) diff --git a/tools/testing/vma/tests/vma.c b/tools/testing/vma/tests/vma.c index 1395d55a1e02..c73c3a565f1d 100644 --- a/tools/testing/vma/tests/vma.c +++ b/tools/testing/vma/tests/vma.c @@ -174,10 +174,10 @@ static bool test_vma_flags_word(void) /* Ensure that vma_flags_test() and friends works correctly. 
*/ static bool test_vma_flags_test(void) { - const vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, - VMA_EXEC_BIT + vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_EXEC_BIT #if NUM_VMA_FLAG_BITS > 64 - , 64, 65 + , 64, 65 #endif ); struct vm_area_desc desc = { @@ -187,14 +187,18 @@ static bool test_vma_flags_test(void) .flags = flags, }; -#define do_test(_flag) \ - ASSERT_TRUE(vma_flags_test(&flags, _flag)); \ - ASSERT_TRUE(vma_test(&vma, _flag)); \ +#define do_test(_flag) \ + ASSERT_TRUE(vma_flags_test(&flags, _flag)); \ + ASSERT_TRUE(vma_flags_test_single_mask(&flags, mk_vma_flags(_flag))); \ + ASSERT_TRUE(vma_test(&vma, _flag)); \ + ASSERT_TRUE(vma_test_single_mask(&vma, mk_vma_flags(_flag))); \ ASSERT_TRUE(vma_desc_test(&desc, _flag)) -#define do_test_false(_flag) \ - ASSERT_FALSE(vma_flags_test(&flags, _flag)); \ - ASSERT_FALSE(vma_test(&vma, _flag)); \ +#define do_test_false(_flag) \ + ASSERT_FALSE(vma_flags_test(&flags, _flag)); \ + ASSERT_FALSE(vma_flags_test_single_mask(&flags, mk_vma_flags(_flag))); \ + ASSERT_FALSE(vma_test(&vma, _flag)); \ + ASSERT_FALSE(vma_test_single_mask(&vma, mk_vma_flags(_flag))); \ ASSERT_FALSE(vma_desc_test(&desc, _flag)) do_test(VMA_READ_BIT); @@ -212,6 +216,15 @@ static bool test_vma_flags_test(void) #undef do_test #undef do_test_false + /* We define the _single_mask() variants to return false if empty. */ + ASSERT_FALSE(vma_flags_test_single_mask(&flags, EMPTY_VMA_FLAGS)); + ASSERT_FALSE(vma_test_single_mask(&vma, EMPTY_VMA_FLAGS)); + /* Even when both flags and tested flag mask are empty! */ + flags = EMPTY_VMA_FLAGS; + vma.flags = EMPTY_VMA_FLAGS; + ASSERT_FALSE(vma_flags_test_single_mask(&flags, EMPTY_VMA_FLAGS)); + ASSERT_FALSE(vma_test_single_mask(&vma, EMPTY_VMA_FLAGS)); + return true; } @@ -309,6 +322,10 @@ static bool test_vma_flags_test_any(void) do_test(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64, 65); #endif + /* Testing all flags against none trivially succeeds. 
*/ + ASSERT_TRUE(vma_flags_test_all_mask(&flags, EMPTY_VMA_FLAGS)); + ASSERT_TRUE(vma_test_all_mask(&vma, EMPTY_VMA_FLAGS)); + #undef do_test #undef do_test_all_true #undef do_test_all_false @@ -592,6 +609,33 @@ static bool test_append_vma_flags(void) return true; } +/* Assert that vma_flags_count() behaves as expected. */ +static bool test_vma_flags_count(void) +{ + vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_EXEC_BIT +#if NUM_VMA_FLAG_BITS > 64 + , 64, 65 +#endif + ); + +#if NUM_VMA_FLAG_BITS > 64 + ASSERT_EQ(vma_flags_count(&flags), 5); + vma_flags_clear(&flags, 64); + ASSERT_EQ(vma_flags_count(&flags), 4); + vma_flags_clear(&flags, 65); +#endif + ASSERT_EQ(vma_flags_count(&flags), 3); + vma_flags_clear(&flags, VMA_EXEC_BIT); + ASSERT_EQ(vma_flags_count(&flags), 2); + vma_flags_clear(&flags, VMA_WRITE_BIT); + ASSERT_EQ(vma_flags_count(&flags), 1); + vma_flags_clear(&flags, VMA_READ_BIT); + ASSERT_EQ(vma_flags_count(&flags), 0); + + return true; +} + static void run_vma_tests(int *num_tests, int *num_fail) { TEST(copy_vma); @@ -607,4 +651,5 @@ static void run_vma_tests(int *num_tests, int *num_fail) TEST(vma_flags_diff); TEST(vma_flags_and); TEST(append_vma_flags); + TEST(vma_flags_count); } From 3a6455d56bd7c4cfb1ea35ddae052943065e338e Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:34 +0000 Subject: [PATCH 325/369] mm: convert do_brk_flags() to use vma_flags_t In order to be able to do this, we need to change VM_DATA_DEFAULT_FLAGS and friends and update the architecture-specific definitions also. We then have to update some KSM logic to handle VMA flags, and introduce VMA_STACK_FLAGS to define the vma_flags_t equivalent of VM_STACK_FLAGS. We also introduce two helper functions for use during the time we are converting legacy flags to vma_flags_t values - vma_flags_to_legacy() and legacy_to_vma_flags(). This enables us to iteratively make changes to break these changes up into separate parts. 
We use these explicitly here to keep VM_STACK_FLAGS around for certain users which need to maintain the legacy vm_flags_t values for the time being. We are no longer able to rely on the simple VM_xxx being set to zero if the feature is not enabled, so in the case of VM_DROPPABLE we introduce VMA_DROPPABLE as the vma_flags_t equivalent, which is set to EMPTY_VMA_FLAGS if the droppable flag is not available. While we're here, we make the description of do_brk_flags() into a kdoc comment, as it almost was already. We use vma_flags_to_legacy() to not need to update the vm_get_page_prot() logic as this time. Note that in create_init_stack_vma() we have to replace the BUILD_BUG_ON() with a VM_WARN_ON_ONCE() as the tested values are no longer build time available. We also update mprotect_fixup() to use VMA flags where possible, though we have to live with a little duplication between vm_flags_t and vma_flags_t values for the time being until further conversions are made. While we're here, update VM_SPECIAL to be defined in terms of VMA_SPECIAL_FLAGS now we have vma_flags_to_legacy(). Finally, we update the VMA tests to reflect these changes. Link: https://lkml.kernel.org/r/d02e3e45d9a33d7904b149f5604904089fd640ae.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Paul Moore [SELinux] Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- arch/arc/include/asm/page.h | 2 +- arch/arm/include/asm/page.h | 2 +- arch/arm64/include/asm/page.h | 7 ++++- arch/hexagon/include/asm/page.h | 2 +- arch/loongarch/include/asm/page.h | 2 +- arch/mips/include/asm/page.h | 2 +- arch/nios2/include/asm/page.h | 2 +- arch/powerpc/include/asm/page.h | 4 +-- arch/powerpc/include/asm/page_32.h | 2 +- arch/powerpc/include/asm/page_64.h | 12 ++++---- arch/riscv/include/asm/page.h | 2 +- arch/s390/include/asm/page.h | 2 +- arch/x86/include/asm/page_types.h | 2 +- arch/x86/um/asm/vm-flags.h | 4 +-- include/linux/ksm.h | 10 +++---- include/linux/mm.h | 47 ++++++++++++++++++------------ mm/internal.h | 3 ++ mm/ksm.c | 43 ++++++++++++++------------- mm/mmap.c | 13 +++++---- mm/mprotect.c | 46 +++++++++++++++++------------ mm/mremap.c | 6 ++-- mm/vma.c | 34 +++++++++++---------- mm/vma.h | 14 +++++++-- mm/vma_exec.c | 5 ++-- security/selinux/hooks.c | 4 ++- tools/testing/vma/include/custom.h | 3 -- tools/testing/vma/include/dup.h | 40 +++++++++++++------------ tools/testing/vma/include/stubs.h | 9 +++--- tools/testing/vma/tests/merge.c | 3 +- 29 files changed, 189 insertions(+), 138 deletions(-) diff --git a/arch/arc/include/asm/page.h b/arch/arc/include/asm/page.h index 38214e126c6d..facc7a03b250 100644 --- a/arch/arc/include/asm/page.h +++ b/arch/arc/include/asm/page.h @@ -131,7 +131,7 @@ static inline unsigned long virt_to_pfn(const void *kaddr) #define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr)) /* Default 
Permissions for stack/heaps pages (Non Executable) */ -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_NON_EXEC #define WANT_PAGE_VIRTUAL 1 diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index ef11b721230e..fa4c1225dde5 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -184,7 +184,7 @@ extern int pfn_valid(unsigned long); #include -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_TSK_EXEC #include #include diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index b39cc1127e1f..e25d0d18f6d7 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -46,7 +46,12 @@ int pfn_is_map_memory(unsigned long pfn); #endif /* !__ASSEMBLER__ */ -#define VM_DATA_DEFAULT_FLAGS (VM_DATA_FLAGS_TSK_EXEC | VM_MTE_ALLOWED) +#ifdef CONFIG_ARM64_MTE +#define VMA_DATA_DEFAULT_FLAGS append_vma_flags(VMA_DATA_FLAGS_TSK_EXEC, \ + VMA_MTE_ALLOWED_BIT) +#else +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_TSK_EXEC +#endif #include diff --git a/arch/hexagon/include/asm/page.h b/arch/hexagon/include/asm/page.h index f0aed3ed812b..6d82572a7f21 100644 --- a/arch/hexagon/include/asm/page.h +++ b/arch/hexagon/include/asm/page.h @@ -90,7 +90,7 @@ struct page; #define virt_to_page(kaddr) pfn_to_page(PFN_DOWN(__pa(kaddr))) /* Default vm area behavior is non-executable. 
*/ -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_NON_EXEC #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) diff --git a/arch/loongarch/include/asm/page.h b/arch/loongarch/include/asm/page.h index 327bf0bc92bf..79235f4fc399 100644 --- a/arch/loongarch/include/asm/page.h +++ b/arch/loongarch/include/asm/page.h @@ -104,7 +104,7 @@ struct page *tlb_virt_to_page(unsigned long kaddr); extern int __virt_addr_valid(volatile void *kaddr); #define virt_addr_valid(kaddr) __virt_addr_valid((volatile void *)(kaddr)) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_TSK_EXEC #include #include diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index 5ec428fcc887..50a382a0d8f6 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -213,7 +213,7 @@ extern bool __virt_addr_valid(const volatile void *kaddr); #define virt_addr_valid(kaddr) \ __virt_addr_valid((const volatile void *) (kaddr)) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_TSK_EXEC extern unsigned long __kaslr_offset; static inline unsigned long kaslr_offset(void) diff --git a/arch/nios2/include/asm/page.h b/arch/nios2/include/asm/page.h index 722956ac0bf8..71eb7c1b67d4 100644 --- a/arch/nios2/include/asm/page.h +++ b/arch/nios2/include/asm/page.h @@ -85,7 +85,7 @@ extern struct page *mem_map; # define virt_to_page(vaddr) pfn_to_page(PFN_DOWN(virt_to_phys(vaddr))) # define virt_addr_valid(vaddr) pfn_valid(PFN_DOWN(virt_to_phys(vaddr))) -# define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC +# define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_NON_EXEC #include diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index f2bb1f98eebe..281f25e071a3 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -240,8 +240,8 @@ static inline const void 
*pfn_to_kaddr(unsigned long pfn) * and needs to be executable. This means the whole heap ends * up being executable. */ -#define VM_DATA_DEFAULT_FLAGS32 VM_DATA_FLAGS_TSK_EXEC -#define VM_DATA_DEFAULT_FLAGS64 VM_DATA_FLAGS_NON_EXEC +#define VMA_DATA_DEFAULT_FLAGS32 VMA_DATA_FLAGS_TSK_EXEC +#define VMA_DATA_DEFAULT_FLAGS64 VMA_DATA_FLAGS_NON_EXEC #ifdef __powerpc64__ #include diff --git a/arch/powerpc/include/asm/page_32.h b/arch/powerpc/include/asm/page_32.h index 25482405a811..1fd8c21f0a42 100644 --- a/arch/powerpc/include/asm/page_32.h +++ b/arch/powerpc/include/asm/page_32.h @@ -10,7 +10,7 @@ #endif #endif -#define VM_DATA_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS32 +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_DEFAULT_FLAGS32 #if defined(CONFIG_PPC_256K_PAGES) || \ (defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES)) diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index 0f564a06bf68..d96c984d023b 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -84,9 +84,9 @@ extern u64 ppc64_pft_size; #endif /* __ASSEMBLER__ */ -#define VM_DATA_DEFAULT_FLAGS \ +#define VMA_DATA_DEFAULT_FLAGS \ (is_32bit_task() ? \ - VM_DATA_DEFAULT_FLAGS32 : VM_DATA_DEFAULT_FLAGS64) + VMA_DATA_DEFAULT_FLAGS32 : VMA_DATA_DEFAULT_FLAGS64) /* * This is the default if a program doesn't have a PT_GNU_STACK @@ -94,12 +94,12 @@ extern u64 ppc64_pft_size; * stack by default, so in the absence of a PT_GNU_STACK program header * we turn execute permission off. */ -#define VM_STACK_DEFAULT_FLAGS32 VM_DATA_FLAGS_EXEC -#define VM_STACK_DEFAULT_FLAGS64 VM_DATA_FLAGS_NON_EXEC +#define VMA_STACK_DEFAULT_FLAGS32 VMA_DATA_FLAGS_EXEC +#define VMA_STACK_DEFAULT_FLAGS64 VMA_DATA_FLAGS_NON_EXEC -#define VM_STACK_DEFAULT_FLAGS \ +#define VMA_STACK_DEFAULT_FLAGS \ (is_32bit_task() ? 
\ - VM_STACK_DEFAULT_FLAGS32 : VM_STACK_DEFAULT_FLAGS64) + VMA_STACK_DEFAULT_FLAGS32 : VMA_STACK_DEFAULT_FLAGS64) #include diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 187aad0a7b03..c78017061b17 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -204,7 +204,7 @@ static __always_inline void *pfn_to_kaddr(unsigned long pfn) (unsigned long)(_addr) >= PAGE_OFFSET && pfn_valid(virt_to_pfn(_addr)); \ }) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_NON_EXEC #include #include diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index f339258135f7..56da819a79e6 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -277,7 +277,7 @@ static inline unsigned long virt_to_pfn(const void *kaddr) #define virt_addr_valid(kaddr) pfn_valid(phys_to_pfn(__pa_nodebug((unsigned long)(kaddr)))) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_NON_EXEC #endif /* !__ASSEMBLER__ */ diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 018a8d906ca3..3e0801a0f782 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -26,7 +26,7 @@ #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_TSK_EXEC /* Physical address where kernel should be loaded. 
*/ #define LOAD_PHYSICAL_ADDR __ALIGN_KERNEL_MASK(CONFIG_PHYSICAL_START, CONFIG_PHYSICAL_ALIGN - 1) diff --git a/arch/x86/um/asm/vm-flags.h b/arch/x86/um/asm/vm-flags.h index df7a3896f5dd..622d36d6ddff 100644 --- a/arch/x86/um/asm/vm-flags.h +++ b/arch/x86/um/asm/vm-flags.h @@ -9,11 +9,11 @@ #ifdef CONFIG_X86_32 -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_TSK_EXEC #else -#define VM_STACK_DEFAULT_FLAGS (VM_GROWSDOWN | VM_DATA_FLAGS_EXEC) +#define VMA_STACK_DEFAULT_FLAGS append_vma_flags(VMA_DATA_FLAGS_EXEC, VMA_GROWSDOWN_BIT) #endif #endif diff --git a/include/linux/ksm.h b/include/linux/ksm.h index c982694c987b..d39d0d5483a2 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -17,8 +17,8 @@ #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, vm_flags_t *vm_flags); -vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, - vm_flags_t vm_flags); +vma_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, + vma_flags_t vma_flags); int ksm_enable_merge_any(struct mm_struct *mm); int ksm_disable_merge_any(struct mm_struct *mm); int ksm_disable(struct mm_struct *mm); @@ -103,10 +103,10 @@ bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ -static inline vm_flags_t ksm_vma_flags(struct mm_struct *mm, - const struct file *file, vm_flags_t vm_flags) +static inline vma_flags_t ksm_vma_flags(struct mm_struct *mm, + const struct file *file, vma_flags_t vma_flags) { - return vm_flags; + return vma_flags; } static inline int ksm_disable(struct mm_struct *mm) diff --git a/include/linux/mm.h b/include/linux/mm.h index de72382efac2..4042a584671e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -346,9 +346,9 @@ enum { * if KVM does not lock down the memory type. 
*/ DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), -#ifdef CONFIG_PPC32 +#if defined(CONFIG_PPC32) DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), -#else +#elif defined(CONFIG_64BIT) DECLARE_VMA_BIT(DROPPABLE, 40), #endif DECLARE_VMA_BIT(UFFD_MINOR, 41), @@ -503,31 +503,42 @@ enum { #endif #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) #define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) +#define VMA_DROPPABLE mk_vma_flags(VMA_DROPPABLE_BIT) #else #define VM_DROPPABLE VM_NONE +#define VMA_DROPPABLE EMPTY_VMA_FLAGS #endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) -#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) +#define TASK_EXEC_BIT ((current->personality & READ_IMPLIES_EXEC) ? \ + VMA_EXEC_BIT : VMA_READ_BIT) /* Common data flag combinations */ -#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ - VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define VMA_DATA_FLAGS_TSK_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + TASK_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, \ + VMA_MAYEXEC_BIT) +#define VMA_DATA_FLAGS_NON_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, VMA_MAYEXEC_BIT) +#define VMA_DATA_FLAGS_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + VMA_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, \ + VMA_MAYEXEC_BIT) -#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC +#ifndef VMA_DATA_DEFAULT_FLAGS /* arch can override this */ +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_EXEC #endif -#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ -#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +#ifndef VMA_STACK_DEFAULT_FLAGS /* arch can 
override this */ +#define VMA_STACK_DEFAULT_FLAGS VMA_DATA_DEFAULT_FLAGS #endif +#define VMA_STACK_FLAGS append_vma_flags(VMA_STACK_DEFAULT_FLAGS, \ + VMA_STACK_BIT, VMA_ACCOUNT_BIT) + +/* Temporary until VMA flags conversion complete. */ +#define VM_STACK_FLAGS vma_flags_to_legacy(VMA_STACK_FLAGS) + #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) #ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS @@ -536,8 +547,6 @@ enum { #define VM_SEALED_SYSMAP VM_NONE #endif -#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) - /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) #define VMA_ACCESS_FLAGS mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT) @@ -545,7 +554,10 @@ enum { /* * Special vmas that are non-mergable, non-mlock()able. */ -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) + +#define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \ + VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT) +#define VM_SPECIAL vma_flags_to_legacy(VMA_SPECIAL_FLAGS) /* * Physically remapped pages are special. Tell the @@ -1407,7 +1419,7 @@ static __always_inline void vma_desc_set_flags_mask(struct vm_area_desc *desc, * vm_area_desc object describing a proposed VMA, e.g.: * * vma_desc_set_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, - * VMA_DONTDUMP_BIT); + * VMA_DONTDUMP_BIT); */ #define vma_desc_set_flags(desc, ...) 
\ vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) @@ -4045,7 +4057,6 @@ extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); -extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages); extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); extern bool vma_is_special_mapping(const struct vm_area_struct *vma, diff --git a/mm/internal.h b/mm/internal.h index 3d3fa35e5fd1..ce954bab8a37 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1916,4 +1916,7 @@ static inline int get_sysctl_max_map_count(void) return READ_ONCE(sysctl_max_map_count); } +bool may_expand_vm(struct mm_struct *mm, const vma_flags_t *vma_flags, + unsigned long npages); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/ksm.c b/mm/ksm.c index 2a2f2f005fc3..7d5b76478f0b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -735,21 +735,24 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; } -static bool ksm_compatible(const struct file *file, vm_flags_t vm_flags) +static bool ksm_compatible(const struct file *file, vma_flags_t vma_flags) { - if (vm_flags & (VM_SHARED | VM_MAYSHARE | VM_SPECIAL | - VM_HUGETLB | VM_DROPPABLE)) - return false; /* just ignore the advice */ - + /* Just ignore the advice. 
*/ + if (vma_flags_test_any(&vma_flags, VMA_SHARED_BIT, VMA_MAYSHARE_BIT, + VMA_HUGETLB_BIT)) + return false; + if (vma_flags_test_single_mask(&vma_flags, VMA_DROPPABLE)) + return false; + if (vma_flags_test_any_mask(&vma_flags, VMA_SPECIAL_FLAGS)) + return false; if (file_is_dax(file)) return false; - #ifdef VM_SAO - if (vm_flags & VM_SAO) + if (vma_flags_test(&vma_flags, VMA_SAO_BIT)) return false; #endif #ifdef VM_SPARC_ADI - if (vm_flags & VM_SPARC_ADI) + if (vma_flags_test(&vma_flags, VMA_SPARC_ADI_BIT)) return false; #endif @@ -758,7 +761,7 @@ static bool ksm_compatible(const struct file *file, vm_flags_t vm_flags) static bool vma_ksm_compatible(struct vm_area_struct *vma) { - return ksm_compatible(vma->vm_file, vma->vm_flags); + return ksm_compatible(vma->vm_file, vma->flags); } static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, @@ -2825,17 +2828,17 @@ static int ksm_scan_thread(void *nothing) return 0; } -static bool __ksm_should_add_vma(const struct file *file, vm_flags_t vm_flags) +static bool __ksm_should_add_vma(const struct file *file, vma_flags_t vma_flags) { - if (vm_flags & VM_MERGEABLE) + if (vma_flags_test(&vma_flags, VMA_MERGEABLE_BIT)) return false; - return ksm_compatible(file, vm_flags); + return ksm_compatible(file, vma_flags); } static void __ksm_add_vma(struct vm_area_struct *vma) { - if (__ksm_should_add_vma(vma->vm_file, vma->vm_flags)) + if (__ksm_should_add_vma(vma->vm_file, vma->flags)) vm_flags_set(vma, VM_MERGEABLE); } @@ -2860,16 +2863,16 @@ static int __ksm_del_vma(struct vm_area_struct *vma) * * @mm: Proposed VMA's mm_struct * @file: Proposed VMA's file-backed mapping, if any. - * @vm_flags: Proposed VMA"s flags. + * @vma_flags: Proposed VMA's flags. * - * Returns: @vm_flags possibly updated to mark mergeable. + * Returns: @vma_flags possibly updated to mark mergeable. 
*/ -vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, - vm_flags_t vm_flags) +vma_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, + vma_flags_t vma_flags) { if (mm_flags_test(MMF_VM_MERGE_ANY, mm) && - __ksm_should_add_vma(file, vm_flags)) { - vm_flags |= VM_MERGEABLE; + __ksm_should_add_vma(file, vma_flags)) { + vma_flags_set(&vma_flags, VMA_MERGEABLE_BIT); /* * Generally, the flags here always include MMF_VM_MERGEABLE. * However, in rare cases, this flag may be cleared by ksmd who @@ -2879,7 +2882,7 @@ vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, __ksm_enter(mm); } - return vm_flags; + return vma_flags; } static void ksm_add_vmas(struct mm_struct *mm) diff --git a/mm/mmap.c b/mm/mmap.c index 2d2b814978bf..5754d1c36462 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -192,7 +192,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) brkvma = vma_prev_limit(&vmi, mm->start_brk); /* Ok, looks good - let it rip. */ - if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) + if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, + EMPTY_VMA_FLAGS) < 0) goto out; mm->brk = brk; @@ -1203,7 +1204,8 @@ out: int vm_brk_flags(unsigned long addr, unsigned long request, bool is_exec) { - const vm_flags_t vm_flags = is_exec ? VM_EXEC : 0; + const vma_flags_t vma_flags = is_exec ? 
+ mk_vma_flags(VMA_EXEC_BIT) : EMPTY_VMA_FLAGS; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; unsigned long len; @@ -1230,7 +1232,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, bool is_exec) goto munmap_failed; vma = vma_prev(&vmi); - ret = do_brk_flags(&vmi, vma, addr, len, vm_flags); + ret = do_brk_flags(&vmi, vma, addr, len, vma_flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); @@ -1328,12 +1330,13 @@ destroy: * Return true if the calling process may expand its vm space by the passed * number of pages */ -bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) +bool may_expand_vm(struct mm_struct *mm, const vma_flags_t *vma_flags, + unsigned long npages) { if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT) return false; - if (is_data_mapping(flags) && + if (is_data_mapping_vma_flags(vma_flags) && mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { /* Workaround for Valgrind */ if (rlimit(RLIMIT_DATA) == 0 && diff --git a/mm/mprotect.c b/mm/mprotect.c index 9681f055b9fc..eaa724b99908 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -697,7 +697,8 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, unsigned long start, unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; - vm_flags_t oldflags = READ_ONCE(vma->vm_flags); + const vma_flags_t old_vma_flags = READ_ONCE(vma->flags); + vma_flags_t new_vma_flags = legacy_to_vma_flags(newflags); long nrpages = (end - start) >> PAGE_SHIFT; unsigned int mm_cp_flags = 0; unsigned long charged = 0; @@ -706,7 +707,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, if (vma_is_sealed(vma)) return -EPERM; - if (newflags == oldflags) { + if (vma_flags_same_pair(&old_vma_flags, &new_vma_flags)) { *pprev = vma; return 0; } @@ -717,8 +718,9 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * uncommon case, so 
doesn't need to be very optimized. */ if (arch_has_pfn_modify_check() && - (oldflags & (VM_PFNMAP|VM_MIXEDMAP)) && - (newflags & VM_ACCESS_FLAGS) == 0) { + vma_flags_test_any(&old_vma_flags, VMA_PFNMAP_BIT, + VMA_MIXEDMAP_BIT) && + !vma_flags_test_any_mask(&new_vma_flags, VMA_ACCESS_FLAGS)) { pgprot_t new_pgprot = vm_get_page_prot(newflags); error = walk_page_range(current->mm, start, end, @@ -736,28 +738,31 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * hugetlb mapping were accounted for even if read-only so there is * no need to account for them here. */ - if (newflags & VM_WRITE) { + if (vma_flags_test(&new_vma_flags, VMA_WRITE_BIT)) { /* Check space limits when area turns into data. */ - if (!may_expand_vm(mm, newflags, nrpages) && - may_expand_vm(mm, oldflags, nrpages)) + if (!may_expand_vm(mm, &new_vma_flags, nrpages) && + may_expand_vm(mm, &old_vma_flags, nrpages)) return -ENOMEM; - if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| - VM_SHARED|VM_NORESERVE))) { + if (!vma_flags_test_any(&old_vma_flags, + VMA_ACCOUNT_BIT, VMA_WRITE_BIT, VMA_HUGETLB_BIT, + VMA_SHARED_BIT, VMA_NORESERVE_BIT)) { charged = nrpages; if (security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; - newflags |= VM_ACCOUNT; + vma_flags_set(&new_vma_flags, VMA_ACCOUNT_BIT); } - } else if ((oldflags & VM_ACCOUNT) && vma_is_anonymous(vma) && - !vma->anon_vma) { - newflags &= ~VM_ACCOUNT; + } else if (vma_flags_test(&old_vma_flags, VMA_ACCOUNT_BIT) && + vma_is_anonymous(vma) && !vma->anon_vma) { + vma_flags_clear(&new_vma_flags, VMA_ACCOUNT_BIT); } + newflags = vma_flags_to_legacy(new_vma_flags); vma = vma_modify_flags(vmi, *pprev, vma, start, end, &newflags); if (IS_ERR(vma)) { error = PTR_ERR(vma); goto fail; } + new_vma_flags = legacy_to_vma_flags(newflags); *pprev = vma; @@ -773,19 +778,24 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, change_protection(tlb, vma, start, end, mm_cp_flags); - if ((oldflags & VM_ACCOUNT) && !(newflags & 
VM_ACCOUNT)) + if (vma_flags_test(&old_vma_flags, VMA_ACCOUNT_BIT) && + !vma_flags_test(&new_vma_flags, VMA_ACCOUNT_BIT)) vm_unacct_memory(nrpages); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major * fault on access. */ - if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED && - (newflags & VM_WRITE)) { - populate_vma_page_range(vma, start, end, NULL); + if (vma_flags_test(&new_vma_flags, VMA_WRITE_BIT)) { + const vma_flags_t mask = + vma_flags_and(&old_vma_flags, VMA_WRITE_BIT, + VMA_SHARED_BIT, VMA_LOCKED_BIT); + + if (vma_flags_same(&mask, VMA_LOCKED_BIT)) + populate_vma_page_range(vma, start, end, NULL); } - vm_stat_account(mm, oldflags, -nrpages); + vm_stat_account(mm, vma_flags_to_legacy(old_vma_flags), -nrpages); vm_stat_account(mm, newflags, nrpages); perf_event_mmap(vma); return 0; diff --git a/mm/mremap.c b/mm/mremap.c index 36b3f1caebad..e9c8b1d05832 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1472,10 +1472,10 @@ static unsigned long mremap_to(struct vma_remap_struct *vrm) /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */ if (vrm->flags & MREMAP_DONTUNMAP) { - vm_flags_t vm_flags = vrm->vma->vm_flags; + vma_flags_t vma_flags = vrm->vma->flags; unsigned long pages = vrm->old_len >> PAGE_SHIFT; - if (!may_expand_vm(mm, vm_flags, pages)) + if (!may_expand_vm(mm, &vma_flags, pages)) return -ENOMEM; } @@ -1813,7 +1813,7 @@ static int check_prep_vma(struct vma_remap_struct *vrm) if (!mlock_future_ok(mm, vma->vm_flags & VM_LOCKED, vrm->delta)) return -EAGAIN; - if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT)) + if (!may_expand_vm(mm, &vma->flags, vrm->delta >> PAGE_SHIFT)) return -ENOMEM; return 0; diff --git a/mm/vma.c b/mm/vma.c index 6af26619e020..9362860389ae 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2385,7 +2385,7 @@ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, static void update_ksm_flags(struct mmap_state *map) { - map->vm_flags = ksm_vma_flags(map->mm, map->file, 
map->vm_flags); + map->vma_flags = ksm_vma_flags(map->mm, map->file, map->vma_flags); } static void set_desc_from_map(struct vm_area_desc *desc, @@ -2446,7 +2446,7 @@ static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc, } /* Check against address space limit. */ - if (!may_expand_vm(map->mm, map->vm_flags, map->pglen - vms->nr_pages)) + if (!may_expand_vm(map->mm, &map->vma_flags, map->pglen - vms->nr_pages)) return -ENOMEM; /* Private writable mapping: check memory availability. */ @@ -2866,20 +2866,22 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return ret; } -/* +/** * do_brk_flags() - Increase the brk vma if the flags match. * @vmi: The vma iterator * @addr: The start address * @len: The length of the increase * @vma: The vma, - * @vm_flags: The VMA Flags + * @vma_flags: The VMA Flags * * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags * do not match then create a new anonymous VMA. Eventually we may be able to * do some brk-specific accounting here. + * + * Returns: %0 on success, or otherwise an error. */ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, unsigned long len, vm_flags_t vm_flags) + unsigned long addr, unsigned long len, vma_flags_t vma_flags) { struct mm_struct *mm = current->mm; @@ -2887,9 +2889,12 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. 
*/ - vm_flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - vm_flags = ksm_vma_flags(mm, NULL, vm_flags); - if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) + vma_flags_set_mask(&vma_flags, VMA_DATA_DEFAULT_FLAGS); + vma_flags_set(&vma_flags, VMA_ACCOUNT_BIT); + vma_flags_set_mask(&vma_flags, mm->def_vma_flags); + + vma_flags = ksm_vma_flags(mm, NULL, vma_flags); + if (!may_expand_vm(mm, &vma_flags, len >> PAGE_SHIFT)) return -ENOMEM; if (mm->map_count > get_sysctl_max_map_count()) @@ -2903,7 +2908,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, * occur after forking, so the expand will only happen on new VMAs. */ if (vma && vma->vm_end == addr) { - VMG_STATE(vmg, mm, vmi, addr, addr + len, vm_flags, PHYS_PFN(addr)); + VMG_STATE(vmg, mm, vmi, addr, addr + len, vma_flags, PHYS_PFN(addr)); vmg.prev = vma; /* vmi is positioned at prev, which this mode expects. */ @@ -2924,8 +2929,8 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_set_anonymous(vma); vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); - vm_flags_init(vma, vm_flags); - vma->vm_page_prot = vm_get_page_prot(vm_flags); + vma->flags = vma_flags; + vma->vm_page_prot = vm_get_page_prot(vma_flags_to_legacy(vma_flags)); vma_start_write(vma); if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; @@ -2936,10 +2941,10 @@ out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; mm->data_vm += len >> PAGE_SHIFT; - if (vm_flags & VM_LOCKED) + if (vma_flags_test(&vma_flags, VMA_LOCKED_BIT)) mm->locked_vm += (len >> PAGE_SHIFT); if (pgtable_supports_soft_dirty()) - vm_flags_set(vma, VM_SOFTDIRTY); + vma_set_flags(vma, VMA_SOFTDIRTY_BIT); return 0; mas_store_fail: @@ -3070,7 +3075,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long new_start; /* address space limit tests */ - if (!may_expand_vm(mm, vma->vm_flags, grow)) + if (!may_expand_vm(mm, &vma->flags, grow)) return -ENOMEM; /* Stack limit test */ 
@@ -3289,7 +3294,6 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { unsigned long charged = vma_pages(vma); - if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; diff --git a/mm/vma.h b/mm/vma.h index cf8926558bf6..1f2de6cb3b97 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -237,13 +237,13 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start); } -#define VMG_STATE(name, mm_, vmi_, start_, end_, vm_flags_, pgoff_) \ +#define VMG_STATE(name, mm_, vmi_, start_, end_, vma_flags_, pgoff_) \ struct vma_merge_struct name = { \ .mm = mm_, \ .vmi = vmi_, \ .start = start_, \ .end = end_, \ - .vm_flags = vm_flags_, \ + .vma_flags = vma_flags_, \ .pgoff = pgoff_, \ .state = VMA_MERGE_START, \ } @@ -465,7 +465,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, struct list_head *uf); int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, - unsigned long addr, unsigned long request, unsigned long flags); + unsigned long addr, unsigned long request, + vma_flags_t vma_flags); unsigned long unmapped_area(struct vm_unmapped_area_info *info); unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); @@ -527,6 +528,13 @@ static inline bool is_data_mapping(vm_flags_t flags) return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; } +static inline bool is_data_mapping_vma_flags(const vma_flags_t *vma_flags) +{ + const vma_flags_t mask = vma_flags_and(vma_flags, + VMA_WRITE_BIT, VMA_SHARED_BIT, VMA_STACK_BIT); + + return vma_flags_same(&mask, VMA_WRITE_BIT); +} static inline void vma_iter_config(struct vma_iterator *vmi, unsigned long index, unsigned long last) diff --git a/mm/vma_exec.c b/mm/vma_exec.c index 8134e1afca68..5cee8b7efa0f 100644 --- a/mm/vma_exec.c +++ b/mm/vma_exec.c @@ -36,7 +36,8 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) unsigned long new_start = old_start - shift; 
unsigned long new_end = old_end - shift; VMA_ITERATOR(vmi, mm, new_start); - VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff); + VMG_STATE(vmg, mm, &vmi, new_start, old_end, EMPTY_VMA_FLAGS, + vma->vm_pgoff); struct vm_area_struct *next; struct mmu_gather tlb; PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length); @@ -135,7 +136,7 @@ int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ - BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); + VM_WARN_ON_ONCE(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; if (pgtable_supports_soft_dirty()) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index d8224ea113d1..903303e084c2 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -7713,6 +7713,8 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = { static __init int selinux_init(void) { + vma_flags_t data_default_flags = VMA_DATA_DEFAULT_FLAGS; + pr_info("SELinux: Initializing.\n"); memset(&selinux_state, 0, sizeof(selinux_state)); @@ -7729,7 +7731,7 @@ static __init int selinux_init(void) AUDIT_CFG_LSM_SECCTX_SUBJECT | AUDIT_CFG_LSM_SECCTX_OBJECT); - default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC); + default_noexec = !vma_flags_test(&data_default_flags, VMA_EXEC_BIT); if (!default_noexec) pr_notice("SELinux: virtual memory is executable by default\n"); diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h index b7d9eb0a44e4..744fe874c168 100644 --- a/tools/testing/vma/include/custom.h +++ b/tools/testing/vma/include/custom.h @@ -95,6 +95,3 @@ static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) { return PAGE_SIZE; } - -#define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \ - VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT) diff --git 
a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 244ee02dc21d..36373b81ad24 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -314,28 +314,34 @@ enum { /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) -#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) +#define TASK_EXEC_BIT ((current->personality & READ_IMPLIES_EXEC) ? \ + VM_EXEC_BIT : VM_READ_BIT) /* Common data flag combinations */ -#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ - VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define VMA_DATA_FLAGS_TSK_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + TASK_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, \ + VMA_MAYEXEC_BIT) +#define VMA_DATA_FLAGS_NON_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, VMA_MAYEXEC_BIT) +#define VMA_DATA_FLAGS_EXEC mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, \ + VMA_EXEC_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT, \ + VMA_MAYEXEC_BIT) -#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC +#ifndef VMA_DATA_DEFAULT_FLAGS /* arch can override this */ +#define VMA_DATA_DEFAULT_FLAGS VMA_DATA_FLAGS_EXEC #endif -#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ -#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +#ifndef VMA_STACK_DEFAULT_FLAGS /* arch can override this */ +#define VMA_STACK_DEFAULT_FLAGS VMA_DATA_DEFAULT_FLAGS #endif +#define VMA_STACK_FLAGS append_vma_flags(VMA_STACK_DEFAULT_FLAGS, \ + VMA_STACK_BIT, VMA_ACCOUNT_BIT) +/* Temporary until VMA flags conversion complete. 
*/ +#define VM_STACK_FLAGS vma_flags_to_legacy(VMA_STACK_FLAGS) + #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) -#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) - /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) #define VMA_ACCESS_FLAGS mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT) @@ -345,6 +351,9 @@ enum { */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +#define VMA_SPECIAL_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_DONTEXPAND_BIT, \ + VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT) + #define VMA_REMAP_FLAGS mk_vma_flags(VMA_IO_BIT, VMA_PFNMAP_BIT, \ VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT) @@ -357,11 +366,6 @@ enum { /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) -#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) - -#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - #define RLIMIT_STACK 3 /* max stack size */ #define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h index 416bb93f5005..b5dced3b0bd4 100644 --- a/tools/testing/vma/include/stubs.h +++ b/tools/testing/vma/include/stubs.h @@ -101,10 +101,10 @@ static inline bool shmem_file(struct file *file) return false; } -static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, - const struct file *file, vm_flags_t vm_flags) +static inline vma_flags_t ksm_vma_flags(struct mm_struct *mm, + const struct file *file, vma_flags_t vma_flags) { - return vm_flags; + return vma_flags; } static inline void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) @@ -239,7 +239,8 @@ static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) return 0; } -static inline bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, +static inline bool 
may_expand_vm(struct mm_struct *mm, + const vma_flags_t *vma_flags, unsigned long npages) { return true; diff --git a/tools/testing/vma/tests/merge.c b/tools/testing/vma/tests/merge.c index d3e725dc0000..44e3977e3fc0 100644 --- a/tools/testing/vma/tests/merge.c +++ b/tools/testing/vma/tests/merge.c @@ -1429,11 +1429,10 @@ static bool test_expand_only_mode(void) { vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); - vm_flags_t legacy_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vm_area_struct *vma_prev, *vma; - VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, legacy_flags, 5); + VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, vma_flags, 5); /* * Place a VMA prior to the one we're expanding so we assert that we do From 7eb19a87d9c0e9b18fb069fa03ba365291922242 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:35 +0000 Subject: [PATCH 326/369] mm: update vma_supports_mlock() to use new VMA flags We now have the ability to test all of this using the new vma_flags_t approach, so let's do so. Link: https://lkml.kernel.org/r/49cc166dbafe0a81abc4581a9f5c84630b02fcb8.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- mm/internal.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/internal.h b/mm/internal.h index ce954bab8a37..9c690f8635da 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1298,7 +1298,9 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, static inline bool vma_supports_mlock(const struct vm_area_struct *vma) { - if (vma->vm_flags & (VM_SPECIAL | VM_DROPPABLE)) + if (vma_test_any_mask(vma, VMA_SPECIAL_FLAGS)) + return false; + if (vma_test_single_mask(vma, VMA_DROPPABLE)) return false; if (vma_is_dax(vma) || is_vm_hugetlb_page(vma)) return false; From d720b81d01b137dfc23e07461b05b76f822af6ab Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:36 +0000 Subject: [PATCH 327/369] mm/vma: introduce vma_clear_flags[_mask]() Introduce a helper function and helper macro to easily clear a VMA's flags using the new vma_flags_t vma->flags field: * vma_clear_flags_mask() - Clears all of the flags in a specified mask in the VMA's flags field. * vma_clear_flags() - Clears all of the specified individual VMA flag bits in a VMA's flags field. Also update the VMA tests to reflect the change. 
Link: https://lkml.kernel.org/r/9bd15da35c2c90e7441265adf01b5c2d3b5c6d41.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 ++++++++++++++++ tools/testing/vma/include/dup.h | 9 +++++++++ 2 files changed, 25 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 4042a584671e..6b614f8af045 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1363,6 +1363,22 @@ static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) \ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +/* Helper to clear all VMA flags in a VMA. */ +static __always_inline void vma_clear_flags_mask(struct vm_area_struct *vma, + vma_flags_t flags) +{ + vma_flags_clear_mask(&vma->flags, flags); +} + +/* + * Helper macro for clearing VMA flags, e.g.: + * + * vma_clear_flags(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT, + * VMA_DONTDUMP_BIT); + */ +#define vma_clear_flags(vma, ...) 
\ + vma_clear_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) + /* * Test whether a specific VMA flag is set in a VMA descriptor, e.g.: * diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 36373b81ad24..93ea600d0895 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -1065,6 +1065,15 @@ static __always_inline void vma_set_flags_mask(struct vm_area_struct *vma, #define vma_set_flags(vma, ...) \ vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) +static __always_inline void vma_clear_flags_mask(struct vm_area_struct *vma, + vma_flags_t flags) +{ + vma_flags_clear_mask(&vma->flags, flags); +} + +#define vma_clear_flags(vma, ...) \ + vma_clear_flags_mask(vma, mk_vma_flags(__VA_ARGS__)) + static __always_inline bool vma_desc_test(const struct vm_area_desc *desc, vma_flag_t bit) { From a6f14fb59337ec3b507856430e0636b1b10980ec Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:37 +0000 Subject: [PATCH 328/369] tools/testing/vma: update VMA tests to test vma_clear_flags[_mask]() The tests have existing flag clearing logic, so simply expand this to use the new VMA-specific flag clearing helpers. Also correct some trivial formatting issue in a macro define. Link: https://lkml.kernel.org/r/f5da681d3c33039dd4a838188385796eb8d58373.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka (SUSE) Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- tools/testing/vma/tests/vma.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/tools/testing/vma/tests/vma.c b/tools/testing/vma/tests/vma.c index c73c3a565f1d..754a2da06321 100644 --- a/tools/testing/vma/tests/vma.c +++ b/tools/testing/vma/tests/vma.c @@ -347,19 +347,20 @@ static bool test_vma_flags_clear(void) , 64 #endif ); - struct vm_area_struct vma; - struct vm_area_desc desc; - - vma.flags = flags; - desc.vma_flags = flags; + struct vm_area_struct vma = { + .flags = flags, + }; + struct vm_area_desc desc = { + .vma_flags = flags, + }; /* Cursory check of _mask() variant, as the helper macros imply. */ vma_flags_clear_mask(&flags, mask); - vma_flags_clear_mask(&vma.flags, mask); + vma_clear_flags_mask(&vma, mask); vma_desc_clear_flags_mask(&desc, mask); #if NUM_VMA_FLAG_BITS > 64 ASSERT_FALSE(vma_flags_test_any(&flags, VMA_EXEC_BIT, 64)); - ASSERT_FALSE(vma_flags_test_any(&vma.flags, VMA_EXEC_BIT, 64)); + ASSERT_FALSE(vma_test_any(&vma, VMA_EXEC_BIT, 64)); ASSERT_FALSE(vma_desc_test_any(&desc, VMA_EXEC_BIT, 64)); /* Reset. */ vma_flags_set(&flags, VMA_EXEC_BIT, 64); @@ -371,15 +372,15 @@ static bool test_vma_flags_clear(void) * Clear the flags and assert clear worked, then reset flags back to * include specified flags. */ -#define do_test_and_reset(...) 
\ - vma_flags_clear(&flags, __VA_ARGS__); \ - vma_flags_clear(&vma.flags, __VA_ARGS__); \ - vma_desc_clear_flags(&desc, __VA_ARGS__); \ - ASSERT_FALSE(vma_flags_test_any(&flags, __VA_ARGS__)); \ - ASSERT_FALSE(vma_flags_test_any(&vma.flags, __VA_ARGS__)); \ - ASSERT_FALSE(vma_desc_test_any(&desc, __VA_ARGS__)); \ - vma_flags_set(&flags, __VA_ARGS__); \ - vma_set_flags(&vma, __VA_ARGS__); \ +#define do_test_and_reset(...) \ + vma_flags_clear(&flags, __VA_ARGS__); \ + vma_clear_flags(&vma, __VA_ARGS__); \ + vma_desc_clear_flags(&desc, __VA_ARGS__); \ + ASSERT_FALSE(vma_flags_test_any(&flags, __VA_ARGS__)); \ + ASSERT_FALSE(vma_test_any(&vma, __VA_ARGS__)); \ + ASSERT_FALSE(vma_desc_test_any(&desc, __VA_ARGS__)); \ + vma_flags_set(&flags, __VA_ARGS__); \ + vma_set_flags(&vma, __VA_ARGS__); \ vma_desc_set_flags(&desc, __VA_ARGS__) /* Single flags. */ From 769669bd9ca4cbae2562d57fe753efdcf17a196d Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:38 +0000 Subject: [PATCH 329/369] mm/vma: convert as much as we can in mm/vma.c to vma_flags_t Now we have established a good foundation for vm_flags_t to vma_flags_t changes, update mm/vma.c to utilise vma_flags_t wherever possible. We are able to convert VM_STARTGAP_FLAGS entirely as this is only used in mm/vma.c, and to account for the fact we can't use VM_NONE to make life easier, place the definition of this within existing #ifdef's to be cleaner. Generally the remaining changes are mechanical. Also update the VMA tests to reflect the changes. Link: https://lkml.kernel.org/r/5fdeaf8af9a12c2a5d68497495f52fa627d05a5b.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++- mm/vma.c | 89 +++++++++++++++++-------------- tools/testing/vma/include/dup.h | 4 ++ tools/testing/vma/include/stubs.h | 2 +- 4 files changed, 59 insertions(+), 42 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6b614f8af045..c6b40dc88918 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -463,8 +463,10 @@ enum { #if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) || \ defined(CONFIG_RISCV_USER_CFI) #define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT, VMA_SHADOW_STACK_BIT) #else #define VM_SHADOW_STACK VM_NONE +#define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT) #endif #if defined(CONFIG_PPC64) #define VM_SAO INIT_VM_FLAG(SAO) @@ -539,8 +541,6 @@ enum { /* Temporary until VMA flags conversion complete. */ #define VM_STACK_FLAGS vma_flags_to_legacy(VMA_STACK_FLAGS) -#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) - #ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS #define VM_SEALED_SYSMAP VM_SEALED #else @@ -584,6 +584,8 @@ enum { /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) +#define VMA_LOCKED_MASK mk_vma_flags(VMA_LOCKED_BIT, VMA_LOCKONFAULT_BIT) + /* These flags can be updated atomically via VMA/mmap read lock. 
*/ #define VM_ATOMIC_SET_ALLOWED VM_MAYBE_GUARD diff --git a/mm/vma.c b/mm/vma.c index 9362860389ae..9d194f8e7acb 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -185,7 +185,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp, } /* - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * Return true if we can merge this (vma_flags,anon_vma,file,vm_pgoff) * in front of (at a lower virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) @@ -211,7 +211,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg) } /* - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * Return true if we can merge this (vma_flags,anon_vma,file,vm_pgoff) * beyond (at a higher virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) @@ -850,7 +850,8 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( * furthermost left or right side of the VMA, then we have no chance of * merging and should abort. */ - if (vmg->vm_flags & VM_SPECIAL || (!left_side && !right_side)) + if (vma_flags_test_any_mask(&vmg->vma_flags, VMA_SPECIAL_FLAGS) || + (!left_side && !right_side)) return NULL; if (left_side) @@ -1072,7 +1073,8 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) vmg->state = VMA_MERGE_NOMERGE; /* Special VMAs are unmergeable, also if no prev/next. 
*/ - if ((vmg->vm_flags & VM_SPECIAL) || (!prev && !next)) + if (vma_flags_test_any_mask(&vmg->vma_flags, VMA_SPECIAL_FLAGS) || + (!prev && !next)) return NULL; can_merge_left = can_vma_merge_left(vmg); @@ -1459,17 +1461,17 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, nrpages = vma_pages(next); vms->nr_pages += nrpages; - if (next->vm_flags & VM_LOCKED) + if (vma_test(next, VMA_LOCKED_BIT)) vms->locked_vm += nrpages; - if (next->vm_flags & VM_ACCOUNT) + if (vma_test(next, VMA_ACCOUNT_BIT)) vms->nr_accounted += nrpages; if (is_exec_mapping(next->vm_flags)) vms->exec_vm += nrpages; else if (is_stack_mapping(next->vm_flags)) vms->stack_vm += nrpages; - else if (is_data_mapping(next->vm_flags)) + else if (is_data_mapping_vma_flags(&next->flags)) vms->data_vm += nrpages; if (vms->uf) { @@ -2065,14 +2067,13 @@ static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) static bool vma_is_shared_writable(struct vm_area_struct *vma) { - return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == - (VM_WRITE | VM_SHARED); + return vma_test_all(vma, VMA_WRITE_BIT, VMA_SHARED_BIT); } static bool vma_fs_can_writeback(struct vm_area_struct *vma) { /* No managed pages to writeback. */ - if (vma->vm_flags & VM_PFNMAP) + if (vma_test(vma, VMA_PFNMAP_BIT)) return false; return vma->vm_file && vma->vm_file->f_mapping && @@ -2338,8 +2339,11 @@ void mm_drop_all_locks(struct mm_struct *mm) * We account for memory if it's a private writeable mapping, * not hugepages and VM_NORESERVE wasn't set. */ -static bool accountable_mapping(struct file *file, vm_flags_t vm_flags) +static bool accountable_mapping(struct mmap_state *map) { + const struct file *file = map->file; + vma_flags_t mask; + /* * hugetlb has its own accounting separate from the core VM * VM_HUGETLB may not be set yet so we cannot check for that flag. 
@@ -2347,7 +2351,9 @@ static bool accountable_mapping(struct file *file, vm_flags_t vm_flags) if (file && is_file_hugepages(file)) return false; - return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; + mask = vma_flags_and(&map->vma_flags, VMA_NORESERVE_BIT, VMA_SHARED_BIT, + VMA_WRITE_BIT); + return vma_flags_same(&mask, VMA_WRITE_BIT); } /* @@ -2450,7 +2456,7 @@ static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc, return -ENOMEM; /* Private writable mapping: check memory availability. */ - if (accountable_mapping(map->file, map->vm_flags)) { + if (accountable_mapping(map)) { map->charged = map->pglen; map->charged -= vms->nr_accounted; if (map->charged) { @@ -2460,7 +2466,7 @@ static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc, } vms->nr_accounted = 0; - map->vm_flags |= VM_ACCOUNT; + vma_flags_set(&map->vma_flags, VMA_ACCOUNT_BIT); } /* @@ -2508,12 +2514,12 @@ static int __mmap_new_file_vma(struct mmap_state *map, * Drivers should not permit writability when previously it was * disallowed. 
*/ - VM_WARN_ON_ONCE(map->vm_flags != vma->vm_flags && - !(map->vm_flags & VM_MAYWRITE) && - (vma->vm_flags & VM_MAYWRITE)); + VM_WARN_ON_ONCE(!vma_flags_same_pair(&map->vma_flags, &vma->flags) && + !vma_flags_test(&map->vma_flags, VMA_MAYWRITE_BIT) && + vma_test(vma, VMA_MAYWRITE_BIT)); map->file = vma->vm_file; - map->vm_flags = vma->vm_flags; + map->vma_flags = vma->flags; return 0; } @@ -2544,7 +2550,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) vma_iter_config(vmi, map->addr, map->end); vma_set_range(vma, map->addr, map->end, map->pgoff); - vm_flags_init(vma, map->vm_flags); + vma->flags = map->vma_flags; vma->vm_page_prot = map->page_prot; if (vma_iter_prealloc(vmi, vma)) { @@ -2554,7 +2560,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) if (map->file) error = __mmap_new_file_vma(map, vma); - else if (map->vm_flags & VM_SHARED) + else if (vma_flags_test(&map->vma_flags, VMA_SHARED_BIT)) error = shmem_zero_setup(vma); else vma_set_anonymous(vma); @@ -2564,7 +2570,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) if (!map->check_ksm_early) { update_ksm_flags(map); - vm_flags_init(vma, map->vm_flags); + vma->flags = map->vma_flags; } #ifdef CONFIG_SPARC64 @@ -2604,7 +2610,6 @@ free_vma: static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) { struct mm_struct *mm = map->mm; - vm_flags_t vm_flags = vma->vm_flags; perf_event_mmap(vma); @@ -2612,9 +2617,9 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vms_complete_munmap_vmas(&map->vms, &map->mas_detach); vm_stat_account(mm, vma->vm_flags, map->pglen); - if (vm_flags & VM_LOCKED) { + if (vma_test(vma, VMA_LOCKED_BIT)) { if (!vma_supports_mlock(vma)) - vm_flags_clear(vma, VM_LOCKED_MASK); + vma_clear_flags_mask(vma, VMA_LOCKED_MASK); else mm->locked_vm += map->pglen; } @@ -2630,7 +2635,7 @@ static void __mmap_complete(struct mmap_state *map, 
struct vm_area_struct *vma) * a completely new data area). */ if (pgtable_supports_soft_dirty()) - vm_flags_set(vma, VM_SOFTDIRTY); + vma_set_flags(vma, VMA_SOFTDIRTY_BIT); vma_set_page_prot(vma); } @@ -2993,7 +2998,8 @@ retry: gap = vma_iter_addr(&vmi) + info->start_gap; gap += (info->align_offset - gap) & info->align_mask; tmp = vma_next(&vmi); - if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ + /* Avoid prev check if possible */ + if (tmp && vma_test_any_mask(tmp, VMA_STARTGAP_FLAGS)) { if (vm_start_gap(tmp) < gap + length - 1) { low_limit = tmp->vm_end; vma_iter_reset(&vmi); @@ -3045,7 +3051,8 @@ retry: gap -= (gap - info->align_offset) & info->align_mask; gap_end = vma_iter_end(&vmi); tmp = vma_next(&vmi); - if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ + /* Avoid prev check if possible */ + if (tmp && vma_test_any_mask(tmp, VMA_STARTGAP_FLAGS)) { if (vm_start_gap(tmp) < gap_end) { high_limit = vm_start_gap(tmp); vma_iter_reset(&vmi); @@ -3083,12 +3090,16 @@ static int acct_stack_growth(struct vm_area_struct *vma, return -ENOMEM; /* mlock limit tests */ - if (!mlock_future_ok(mm, vma->vm_flags & VM_LOCKED, grow << PAGE_SHIFT)) + if (!mlock_future_ok(mm, vma_test(vma, VMA_LOCKED_BIT), + grow << PAGE_SHIFT)) return -ENOMEM; /* Check to ensure the stack will not grow into a hugetlb-only region */ - new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : - vma->vm_end - size; + new_start = vma->vm_end - size; +#ifdef CONFIG_STACK_GROWSUP + if (vma_test(vma, VMA_GROWSUP_BIT)) + new_start = vma->vm_start; +#endif if (is_hugepage_only_range(vma->vm_mm, new_start, size)) return -EFAULT; @@ -3102,7 +3113,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, return 0; } -#if defined(CONFIG_STACK_GROWSUP) +#ifdef CONFIG_STACK_GROWSUP /* * PA-RISC uses this for its stack. * vma is the last one with address > vma->vm_end. Have to extend vma. 
@@ -3115,7 +3126,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); - if (!(vma->vm_flags & VM_GROWSUP)) + if (!vma_test(vma, VMA_GROWSUP_BIT)) return -EFAULT; mmap_assert_write_locked(mm); @@ -3135,7 +3146,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) next = find_vma_intersection(mm, vma->vm_end, gap_addr); if (next && vma_is_accessible(next)) { - if (!(next->vm_flags & VM_GROWSUP)) + if (!vma_test(next, VMA_GROWSUP_BIT)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ } @@ -3169,7 +3180,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { - if (vma->vm_flags & VM_LOCKED) + if (vma_test(vma, VMA_LOCKED_BIT)) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); @@ -3200,7 +3211,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); - if (!(vma->vm_flags & VM_GROWSDOWN)) + if (!vma_test(vma, VMA_GROWSDOWN_BIT)) return -EFAULT; mmap_assert_write_locked(mm); @@ -3213,7 +3224,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) prev = vma_prev(&vmi); /* Check that both stack segments have the same anon_vma? 
*/ if (prev) { - if (!(prev->vm_flags & VM_GROWSDOWN) && + if (!vma_test(prev, VMA_GROWSDOWN_BIT) && vma_is_accessible(prev) && (address - prev->vm_end < stack_guard_gap)) return -ENOMEM; @@ -3248,7 +3259,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) if (grow <= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { - if (vma->vm_flags & VM_LOCKED) + if (vma_test(vma, VMA_LOCKED_BIT)) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); @@ -3297,7 +3308,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; - if ((vma->vm_flags & VM_ACCOUNT) && + if (vma_test(vma, VMA_ACCOUNT_BIT) && security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; @@ -3319,7 +3330,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) } if (vma_link(mm, vma)) { - if (vma->vm_flags & VM_ACCOUNT) + if (vma_test(vma, VMA_ACCOUNT_BIT)) vm_unacct_memory(charged); return -ENOMEM; } diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 93ea600d0895..58a621ec389f 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -267,8 +267,10 @@ enum { #endif /* CONFIG_ARCH_HAS_PKEYS */ #if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) #define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT, VMA_SHADOW_STACK_BIT) #else #define VM_SHADOW_STACK VM_NONE +#define VMA_STARTGAP_FLAGS mk_vma_flags(VMA_GROWSDOWN_BIT) #endif #if defined(CONFIG_PPC64) #define VM_SAO INIT_VM_FLAG(SAO) @@ -366,6 +368,8 @@ enum { /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) +#define VMA_LOCKED_MASK mk_vma_flags(VMA_LOCKED_BIT, VMA_LOCKONFAULT_BIT) + #define RLIMIT_STACK 3 /* max stack size */ #define 
RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h index b5dced3b0bd4..5afb0afe2d48 100644 --- a/tools/testing/vma/include/stubs.h +++ b/tools/testing/vma/include/stubs.h @@ -229,7 +229,7 @@ static inline bool signal_pending(void *p) return false; } -static inline bool is_file_hugepages(struct file *file) +static inline bool is_file_hugepages(const struct file *file) { return false; } From e2963f639fde9f71a759bdfee02697a610ae4819 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:39 +0000 Subject: [PATCH 330/369] tools: bitmap: add missing bitmap_copy() implementation I need this for changes I am making to keep the VMA tests running correctly. Link: https://lkml.kernel.org/r/4dcb2fb959137e9fe58a23e21cebcea97de41a1f.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka (SUSE) Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- tools/include/linux/bitmap.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 845eda759f67..5cb4f3942fd3 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -55,6 +55,17 @@ static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits); } +static __always_inline +void bitmap_copy(unsigned long *dst, const unsigned long *src, unsigned int nbits) +{ + unsigned int len = bitmap_size(nbits); + + if (small_const_nbits(nbits)) + *dst = *src; + else + memcpy(dst, src, len); +} + static inline bool bitmap_empty(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) From a06eb2f8279e0b2b42799d42041f144377f5a086 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:40 +0000 Subject: [PATCH 331/369] mm/vma: convert vma_modify_flags[_uffd]() to use vma_flags_t Update the vma_modify_flags() and vma_modify_flags_uffd() functions to accept a vma_flags_t parameter rather than a vm_flags_t one, and propagate the changes as needed to implement this change. Also add vma_flags_reset_once() in replacement of vm_flags_reset_once(). 
We still need to be careful here because we need to avoid tearing, so maintain the assumption that the first system word set of flags are the only ones that require protection from tearing, and retain this functionality. We can copy the remainder of VMA flags above 64 bits normally. But hopefully by the time that happens, we will have replaced the logic that requires these WRITE_ONCE()'s with something else. We also replace instances of vm_flags_reset() with a simple write of VMA flags. We no longer perform a number of checks, most notable of all the VMA flags asserts because: 1. We might be operating on a VMA that is not yet added to the tree. 2. We might be operating on a VMA that is now detached. 3. Really in all but core code, you should be using vma_desc_xxx(). 4. Other VMA fields are manipulated with no such checks. 5. It'd be egregious to have to add variants of flag functions just to account for cases such as the above, especially when we don't do so for other VMA fields. Drivers are the problematic cases and why it was especially important (and also for debug as VMA locks were introduced), the mmap_prepare work is solving this generally. Additionally, we can fairly safely assume by this point the soft dirty flags are being set correctly, so it's reasonable to drop this also. Finally, update the VMA tests to reflect this. Link: https://lkml.kernel.org/r/51afbb2b8c3681003cc7926647e37335d793836e.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 +++++++++---------- include/linux/userfaultfd_k.h | 3 +++ mm/madvise.c | 10 +++++---- mm/mlock.c | 38 ++++++++++++++++++--------------- mm/mprotect.c | 7 +++--- mm/mseal.c | 11 ++++++---- mm/userfaultfd.c | 21 ++++++++++++------ mm/vma.c | 15 +++++++------ mm/vma.h | 15 ++++++------- tools/testing/vma/include/dup.h | 22 +++++++++++-------- tools/testing/vma/tests/merge.c | 3 +-- 11 files changed, 93 insertions(+), 74 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index c6b40dc88918..72bc5016094b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -954,22 +954,20 @@ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_init(vma, flags); } -static inline void vm_flags_reset_once(struct vm_area_struct *vma, - vm_flags_t flags) +static inline void vma_flags_reset_once(struct vm_area_struct *vma, + vma_flags_t *flags) { - vma_assert_write_locked(vma); - /* - * If VMA flags exist beyond the first system word, also clear these. It - * is assumed the write once behaviour is required only for the first - * system word. - */ + const unsigned long word = flags->__vma_flags[0]; + + /* It is assumed only the first system word must be written once. */ + vma_flags_overwrite_word_once(&vma->flags, word); + /* The remainder can be copied normally. 
*/ if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) { - unsigned long *bitmap = vma->flags.__vma_flags; + unsigned long *dst = &vma->flags.__vma_flags[1]; + const unsigned long *src = &flags->__vma_flags[1]; - bitmap_zero(&bitmap[1], NUM_VMA_FLAG_BITS - BITS_PER_LONG); + bitmap_copy(dst, src, NUM_VMA_FLAG_BITS - BITS_PER_LONG); } - - vma_flags_overwrite_word_once(&vma->flags, flags); } static inline void vm_flags_set(struct vm_area_struct *vma, diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index fd5f42765497..d83e349900a3 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -23,6 +23,9 @@ /* The set of all possible UFFD-related VM flags. */ #define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) +#define __VMA_UFFD_FLAGS mk_vma_flags(VMA_UFFD_MISSING_BIT, VMA_UFFD_WP_BIT, \ + VMA_UFFD_MINOR_BIT) + /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining * new flags, since they might collide with O_* ones. We want diff --git a/mm/madvise.c b/mm/madvise.c index afe0f01765c4..69708e953cf5 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -151,13 +151,15 @@ static int madvise_update_vma(vm_flags_t new_flags, struct madvise_behavior *madv_behavior) { struct vm_area_struct *vma = madv_behavior->vma; + vma_flags_t new_vma_flags = legacy_to_vma_flags(new_flags); struct madvise_behavior_range *range = &madv_behavior->range; struct anon_vma_name *anon_name = madv_behavior->anon_name; bool set_new_anon_name = madv_behavior->behavior == __MADV_SET_ANON_VMA_NAME; VMA_ITERATOR(vmi, madv_behavior->mm, range->start); - if (new_flags == vma->vm_flags && (!set_new_anon_name || - anon_vma_name_eq(anon_vma_name(vma), anon_name))) + if (vma_flags_same_mask(&vma->flags, new_vma_flags) && + (!set_new_anon_name || + anon_vma_name_eq(anon_vma_name(vma), anon_name))) return 0; if (set_new_anon_name) @@ -165,7 +167,7 @@ static int madvise_update_vma(vm_flags_t new_flags, range->start, range->end, anon_name); else vma = 
vma_modify_flags(&vmi, madv_behavior->prev, vma, - range->start, range->end, &new_flags); + range->start, range->end, &new_vma_flags); if (IS_ERR(vma)) return PTR_ERR(vma); @@ -174,7 +176,7 @@ static int madvise_update_vma(vm_flags_t new_flags, /* vm_flags is protected by the mmap_lock held in write mode. */ vma_start_write(vma); - vm_flags_reset(vma, new_flags); + vma->flags = new_vma_flags; if (set_new_anon_name) return replace_anon_vma_name(vma, anon_name); diff --git a/mm/mlock.c b/mm/mlock.c index fd648138bc72..fdbd1434a35f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -415,13 +415,14 @@ out: * @vma - vma containing range to be mlock()ed or munlock()ed * @start - start address in @vma of the range * @end - end of range in @vma - * @newflags - the new set of flags for @vma. + * @new_vma_flags - the new set of flags for @vma. * * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED; * called for munlock() and munlockall(), to clear VM_LOCKED from @vma. */ static void mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, vm_flags_t newflags) + unsigned long start, unsigned long end, + vma_flags_t *new_vma_flags) { static const struct mm_walk_ops mlock_walk_ops = { .pmd_entry = mlock_pte_range, @@ -439,18 +440,18 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, * combination should not be visible to other mmap_lock users; * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED. 
*/ - if (newflags & VM_LOCKED) - newflags |= VM_IO; + if (vma_flags_test(new_vma_flags, VMA_LOCKED_BIT)) + vma_flags_set(new_vma_flags, VMA_IO_BIT); vma_start_write(vma); - vm_flags_reset_once(vma, newflags); + vma_flags_reset_once(vma, new_vma_flags); lru_add_drain(); walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); lru_add_drain(); - if (newflags & VM_IO) { - newflags &= ~VM_IO; - vm_flags_reset_once(vma, newflags); + if (vma_flags_test(new_vma_flags, VMA_IO_BIT)) { + vma_flags_clear(new_vma_flags, VMA_IO_BIT); + vma_flags_reset_once(vma, new_vma_flags); } } @@ -467,20 +468,22 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, vm_flags_t newflags) { + vma_flags_t new_vma_flags = legacy_to_vma_flags(newflags); + const vma_flags_t old_vma_flags = vma->flags; struct mm_struct *mm = vma->vm_mm; int nr_pages; int ret = 0; - vm_flags_t oldflags = vma->vm_flags; - if (newflags == oldflags || vma_is_secretmem(vma) || - !vma_supports_mlock(vma)) + if (vma_flags_same_pair(&old_vma_flags, &new_vma_flags) || + vma_is_secretmem(vma) || !vma_supports_mlock(vma)) { /* * Don't set VM_LOCKED or VM_LOCKONFAULT and don't count. * For secretmem, don't allow the memory to be unlocked. */ goto out; + } - vma = vma_modify_flags(vmi, *prev, vma, start, end, &newflags); + vma = vma_modify_flags(vmi, *prev, vma, start, end, &new_vma_flags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; @@ -490,9 +493,9 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, * Keep track of amount of locked VM. 
*/ nr_pages = (end - start) >> PAGE_SHIFT; - if (!(newflags & VM_LOCKED)) + if (!vma_flags_test(&new_vma_flags, VMA_LOCKED_BIT)) nr_pages = -nr_pages; - else if (oldflags & VM_LOCKED) + else if (vma_flags_test(&old_vma_flags, VMA_LOCKED_BIT)) nr_pages = 0; mm->locked_vm += nr_pages; @@ -501,12 +504,13 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, * It's okay if try_to_unmap_one unmaps a page just after we * set VM_LOCKED, populate_vma_page_range will bring it back. */ - if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { + if (vma_flags_test(&new_vma_flags, VMA_LOCKED_BIT) && + vma_flags_test(&old_vma_flags, VMA_LOCKED_BIT)) { /* No work to do, and mlocking twice would be wrong */ vma_start_write(vma); - vm_flags_reset(vma, newflags); + vma->flags = new_vma_flags; } else { - mlock_vma_pages_range(vma, start, end, newflags); + mlock_vma_pages_range(vma, start, end, &new_vma_flags); } out: *prev = vma; diff --git a/mm/mprotect.c b/mm/mprotect.c index eaa724b99908..941f1211da0d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -756,13 +756,11 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, vma_flags_clear(&new_vma_flags, VMA_ACCOUNT_BIT); } - newflags = vma_flags_to_legacy(new_vma_flags); - vma = vma_modify_flags(vmi, *pprev, vma, start, end, &newflags); + vma = vma_modify_flags(vmi, *pprev, vma, start, end, &new_vma_flags); if (IS_ERR(vma)) { error = PTR_ERR(vma); goto fail; } - new_vma_flags = legacy_to_vma_flags(newflags); *pprev = vma; @@ -771,7 +769,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * held in write mode. 
*/ vma_start_write(vma); - vm_flags_reset_once(vma, newflags); + vma_flags_reset_once(vma, &new_vma_flags); if (vma_wants_manual_pte_write_upgrade(vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; vma_set_page_prot(vma); @@ -796,6 +794,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, } vm_stat_account(mm, vma_flags_to_legacy(old_vma_flags), -nrpages); + newflags = vma_flags_to_legacy(new_vma_flags); vm_stat_account(mm, newflags, nrpages); perf_event_mmap(vma); return 0; diff --git a/mm/mseal.c b/mm/mseal.c index ac58643181f7..e2093ae3d25c 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -68,14 +68,17 @@ static int mseal_apply(struct mm_struct *mm, const unsigned long curr_start = MAX(vma->vm_start, start); const unsigned long curr_end = MIN(vma->vm_end, end); - if (!(vma->vm_flags & VM_SEALED)) { - vm_flags_t vm_flags = vma->vm_flags | VM_SEALED; + if (!vma_test(vma, VMA_SEALED_BIT)) { + vma_flags_t vma_flags = vma->flags; + + vma_flags_set(&vma_flags, VMA_SEALED_BIT); vma = vma_modify_flags(&vmi, prev, vma, curr_start, - curr_end, &vm_flags); + curr_end, &vma_flags); if (IS_ERR(vma)) return PTR_ERR(vma); - vm_flags_set(vma, VM_SEALED); + vma_start_write(vma); + vma_set_flags(vma, VMA_SEALED_BIT); } prev = vma; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 2c565c7134b6..89879c3ba344 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1976,6 +1976,9 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, { struct vm_area_struct *ret; bool give_up_on_oom = false; + vma_flags_t new_vma_flags = vma->flags; + + vma_flags_clear_mask(&new_vma_flags, __VMA_UFFD_FLAGS); /* * If we are modifying only and not splitting, just give up on the merge @@ -1989,8 +1992,8 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, uffd_wp_range(vma, start, end - start, false); ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, - vma->vm_flags & ~__VM_UFFD_FLAGS, - NULL_VM_UFFD_CTX, give_up_on_oom); + &new_vma_flags, 
NULL_VM_UFFD_CTX, + give_up_on_oom); /* * In the vma_merge() successful mprotect-like case 8: @@ -2010,10 +2013,11 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long end, bool wp_async) { + vma_flags_t vma_flags = legacy_to_vma_flags(vm_flags); VMA_ITERATOR(vmi, ctx->mm, start); struct vm_area_struct *prev = vma_prev(&vmi); unsigned long vma_end; - vm_flags_t new_flags; + vma_flags_t new_vma_flags; if (vma->vm_start < start) prev = vma; @@ -2024,23 +2028,26 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx, VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async)); VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); - VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)); + VM_WARN_ON_ONCE(!vma_test(vma, VMA_MAYWRITE_BIT)); /* * Nothing to do: this vma is already registered into this * userfaultfd and with the right tracking mode too. */ if (vma->vm_userfaultfd_ctx.ctx == ctx && - (vma->vm_flags & vm_flags) == vm_flags) + vma_test_all_mask(vma, vma_flags)) goto skip; if (vma->vm_start > start) start = vma->vm_start; vma_end = min(end, vma->vm_end); - new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; + new_vma_flags = vma->flags; + vma_flags_clear_mask(&new_vma_flags, __VMA_UFFD_FLAGS); + vma_flags_set_mask(&new_vma_flags, vma_flags); + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, - new_flags, + &new_vma_flags, (struct vm_userfaultfd_ctx){ctx}, /* give_up_on_oom = */false); if (IS_ERR(vma)) diff --git a/mm/vma.c b/mm/vma.c index 9d194f8e7acb..16a1d708c978 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -1710,13 +1710,13 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - vm_flags_t *vm_flags_ptr) + vma_flags_t *vma_flags_ptr) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); - 
const vm_flags_t vm_flags = *vm_flags_ptr; + const vma_flags_t vma_flags = *vma_flags_ptr; struct vm_area_struct *ret; - vmg.vm_flags = vm_flags; + vmg.vma_flags = vma_flags; ret = vma_modify(&vmg); if (IS_ERR(ret)) @@ -1728,7 +1728,7 @@ struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, * them to the caller. */ if (vmg.state == VMA_MERGE_SUCCESS) - *vm_flags_ptr = ret->vm_flags; + *vma_flags_ptr = ret->flags; return ret; } @@ -1758,12 +1758,13 @@ struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, - unsigned long start, unsigned long end, vm_flags_t vm_flags, - struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom) + unsigned long start, unsigned long end, + const vma_flags_t *vma_flags, struct vm_userfaultfd_ctx new_ctx, + bool give_up_on_oom) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); - vmg.vm_flags = vm_flags; + vmg.vma_flags = *vma_flags; vmg.uffd_ctx = new_ctx; if (give_up_on_oom) vmg.give_up_on_oom = true; diff --git a/mm/vma.h b/mm/vma.h index 1f2de6cb3b97..270008e5babc 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -342,24 +342,23 @@ void unmap_region(struct unmap_desc *unmap); * @vma: The VMA containing the range @start to @end to be updated. * @start: The start of the range to update. May be offset within @vma. * @end: The exclusive end of the range to update, may be offset within @vma. - * @vm_flags_ptr: A pointer to the VMA flags that the @start to @end range is + * @vma_flags_ptr: A pointer to the VMA flags that the @start to @end range is * about to be set to. On merge, this will be updated to include sticky flags. * * IMPORTANT: The actual modification being requested here is NOT applied, * rather the VMA is perhaps split, perhaps merged to accommodate the change, * and the caller is expected to perform the actual modification. 
* - * In order to account for sticky VMA flags, the @vm_flags_ptr parameter points + * In order to account for sticky VMA flags, the @vma_flags_ptr parameter points * to the requested flags which are then updated so the caller, should they * overwrite any existing flags, correctly retains these. * * Returns: A VMA which contains the range @start to @end ready to have its - * flags altered to *@vm_flags. + * flags altered to *@vma_flags. */ __must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, - unsigned long start, unsigned long end, - vm_flags_t *vm_flags_ptr); + unsigned long start, unsigned long end, vma_flags_t *vma_flags_ptr); /** * vma_modify_name() - Perform any necessary split/merge in preparation for @@ -418,7 +417,7 @@ __must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, * @vma: The VMA containing the range @start to @end to be updated. * @start: The start of the range to update. May be offset within @vma. * @end: The exclusive end of the range to update, may be offset within @vma. - * @vm_flags: The VMA flags that the @start to @end range is about to be set to. + * @vma_flags: The VMA flags that the @start to @end range is about to be set to. * @new_ctx: The userfaultfd context that the @start to @end range is about to * be set to. * @give_up_on_oom: If an out of memory condition occurs on merge, simply give @@ -429,11 +428,11 @@ __must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, * and the caller is expected to perform the actual modification. * * Returns: A VMA which contains the range @start to @end ready to have its VMA - * flags changed to @vm_flags and its userfaultfd context changed to @new_ctx. + * flags changed to @vma_flags and its userfaultfd context changed to @new_ctx. 
*/ __must_check struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, - unsigned long start, unsigned long end, vm_flags_t vm_flags, + unsigned long start, unsigned long end, const vma_flags_t *vma_flags, struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom); __must_check struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 58a621ec389f..9dd57f50ea6d 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -871,16 +871,20 @@ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_init(vma, flags); } -static inline void vm_flags_reset_once(struct vm_area_struct *vma, - vm_flags_t flags) +static inline void vma_flags_reset_once(struct vm_area_struct *vma, + vma_flags_t *flags) { - vma_assert_write_locked(vma); - /* - * The user should only be interested in avoiding reordering of - * assignment to the first word. - */ - vma_flags_clear_all(&vma->flags); - vma_flags_overwrite_word_once(&vma->flags, flags); + const unsigned long word = flags->__vma_flags[0]; + + /* It is assumed only the first system word must be written once. */ + vma_flags_overwrite_word_once(&vma->flags, word); + /* The remainder can be copied normally. 
*/ + if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) { + unsigned long *dst = &vma->flags.__vma_flags[1]; + const unsigned long *src = &flags->__vma_flags[1]; + + bitmap_copy(dst, src, NUM_VMA_FLAG_BITS - BITS_PER_LONG); + } } static inline void vm_flags_set(struct vm_area_struct *vma, diff --git a/tools/testing/vma/tests/merge.c b/tools/testing/vma/tests/merge.c index 44e3977e3fc0..03b6f9820e0a 100644 --- a/tools/testing/vma/tests/merge.c +++ b/tools/testing/vma/tests/merge.c @@ -132,7 +132,6 @@ static bool test_simple_modify(void) struct vm_area_struct *vma; vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); - vm_flags_t legacy_flags = VM_READ | VM_WRITE; struct mm_struct mm = {}; struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, vma_flags); VMA_ITERATOR(vmi, &mm, 0x1000); @@ -144,7 +143,7 @@ static bool test_simple_modify(void) * performs the merge/split only. */ vma = vma_modify_flags(&vmi, init_vma, init_vma, - 0x1000, 0x2000, &legacy_flags); + 0x1000, 0x2000, &vma_flags); ASSERT_NE(vma, NULL); /* We modify the provided VMA, and on split allocate new VMAs. */ ASSERT_EQ(vma, init_vma); From 90cb921c4d7bf92854344d3e76561f48784c613e Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:41 +0000 Subject: [PATCH 332/369] mm/vma: convert __mmap_region() to use vma_flags_t Update the mmap() implementation logic implemented in __mmap_region() and functions invoked by it. The mmap_region() function converts its input vm_flags_t parameter to a vma_flags_t value which it then passes to __mmap_region() which uses the vma_flags_t value consistently from then on. As part of the change, we convert map_deny_write_exec() to using vma_flags_t (it was incorrectly using unsigned long before), and place it in vma.h, as it is only used internal to mm. With this change, we eliminate the legacy is_shared_maywrite_vm_flags() helper function which is now no longer required. 
We are also able to update the MMAP_STATE() and VMG_MMAP_STATE() macros to use the vma_flags_t value. Finally, we update the VMA tests to reflect the change. Link: https://lkml.kernel.org/r/1fc33a404c962f02da778da100387cc19bd62153.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- include/linux/mm.h | 18 ++++++++---- include/linux/mman.h | 49 ------------------------------- mm/mprotect.c | 4 ++- mm/vma.c | 25 ++++++++-------- mm/vma.h | 51 +++++++++++++++++++++++++++++++++ tools/testing/vma/include/dup.h | 34 +++++----------------- tools/testing/vma/tests/mmap.c | 18 ++++-------- 7 files changed, 92 insertions(+), 107 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 72bc5016094b..9472b3c9a22b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1522,12 +1522,6 @@ static inline bool vma_is_accessible(const struct vm_area_struct *vma) return vma->vm_flags & VM_ACCESS_FLAGS; } -static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) -{ - return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == - (VM_SHARED | VM_MAYWRITE); -} - static inline bool is_shared_maywrite(const vma_flags_t *flags) { return 
vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT); @@ -4335,12 +4329,24 @@ static inline bool range_in_vma(const struct vm_area_struct *vma, #ifdef CONFIG_MMU pgprot_t vm_get_page_prot(vm_flags_t vm_flags); + +static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags) +{ + const vm_flags_t vm_flags = vma_flags_to_legacy(vma_flags); + + return vm_get_page_prot(vm_flags); +} + void vma_set_page_prot(struct vm_area_struct *vma); #else static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags) { return __pgprot(0); } +static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags) +{ + return __pgprot(0); +} static inline void vma_set_page_prot(struct vm_area_struct *vma) { vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); diff --git a/include/linux/mman.h b/include/linux/mman.h index 0ba8a7e8b90a..389521594c69 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -170,53 +170,4 @@ static inline bool arch_memory_deny_write_exec_supported(void) } #define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported #endif - -/* - * Denies creating a writable executable mapping or gaining executable permissions. - * - * This denies the following: - * - * a) mmap(PROT_WRITE | PROT_EXEC) - * - * b) mmap(PROT_WRITE) - * mprotect(PROT_EXEC) - * - * c) mmap(PROT_WRITE) - * mprotect(PROT_READ) - * mprotect(PROT_EXEC) - * - * But allows the following: - * - * d) mmap(PROT_READ | PROT_EXEC) - * mmap(PROT_READ | PROT_EXEC | PROT_BTI) - * - * This is only applicable if the user has set the Memory-Deny-Write-Execute - * (MDWE) protection mask for the current process. - * - * @old specifies the VMA flags the VMA originally possessed, and @new the ones - * we propose to set. - * - * Return: false if proposed change is OK, true if not ok and should be denied. - */ -static inline bool map_deny_write_exec(unsigned long old, unsigned long new) -{ - /* If MDWE is disabled, we have nothing to deny. 
*/ - if (!mm_flags_test(MMF_HAS_MDWE, current->mm)) - return false; - - /* If the new VMA is not executable, we have nothing to deny. */ - if (!(new & VM_EXEC)) - return false; - - /* Under MDWE we do not accept newly writably executable VMAs... */ - if (new & VM_WRITE) - return true; - - /* ...nor previously non-executable VMAs becoming executable. */ - if (!(old & VM_EXEC)) - return true; - - return false; -} - #endif /* _LINUX_MMAN_H */ diff --git a/mm/mprotect.c b/mm/mprotect.c index 941f1211da0d..007d9a72b2f0 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -882,6 +882,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, tmp = vma->vm_start; for_each_vma_range(vmi, vma, end) { vm_flags_t mask_off_old_flags; + vma_flags_t new_vma_flags; vm_flags_t newflags; int new_vma_pkey; @@ -904,6 +905,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey); newflags = calc_vm_prot_bits(prot, new_vma_pkey); newflags |= (vma->vm_flags & ~mask_off_old_flags); + new_vma_flags = legacy_to_vma_flags(newflags); /* newflags >> 4 shift VM_MAY% in place of VM_% */ if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) { @@ -911,7 +913,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, break; } - if (map_deny_write_exec(vma->vm_flags, newflags)) { + if (map_deny_write_exec(&vma->flags, &new_vma_flags)) { error = -EACCES; break; } diff --git a/mm/vma.c b/mm/vma.c index 16a1d708c978..c335f989586f 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -44,7 +44,7 @@ struct mmap_state { bool file_doesnt_need_get :1; }; -#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \ +#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vma_flags_, file_) \ struct mmap_state name = { \ .mm = mm_, \ .vmi = vmi_, \ @@ -52,9 +52,9 @@ struct mmap_state { .end = (addr_) + (len_), \ .pgoff = pgoff_, \ .pglen = PHYS_PFN(len_), \ - .vm_flags = vm_flags_, \ + .vma_flags = vma_flags_, \ .file = file_, 
\ - .page_prot = vm_get_page_prot(vm_flags_), \ + .page_prot = vma_get_page_prot(vma_flags_), \ } #define VMG_MMAP_STATE(name, map_, vma_) \ @@ -63,7 +63,7 @@ struct mmap_state { .vmi = (map_)->vmi, \ .start = (map_)->addr, \ .end = (map_)->end, \ - .vm_flags = (map_)->vm_flags, \ + .vma_flags = (map_)->vma_flags, \ .pgoff = (map_)->pgoff, \ .file = (map_)->file, \ .prev = (map_)->prev, \ @@ -2746,14 +2746,14 @@ static int call_action_complete(struct mmap_state *map, } static unsigned long __mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) + unsigned long len, vma_flags_t vma_flags, + unsigned long pgoff, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; bool have_mmap_prepare = file && file->f_op->mmap_prepare; VMA_ITERATOR(vmi, mm, addr); - MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); + MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vma_flags, file); struct vm_area_desc desc = { .mm = mm, .file = file, @@ -2837,16 +2837,17 @@ abort_munmap: * been performed. */ unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) + unsigned long len, vm_flags_t vm_flags, + unsigned long pgoff, struct list_head *uf) { unsigned long ret; bool writable_file_mapping = false; + const vma_flags_t vma_flags = legacy_to_vma_flags(vm_flags); mmap_assert_write_locked(current->mm); /* Check to see if MDWE is applicable. */ - if (map_deny_write_exec(vm_flags, vm_flags)) + if (map_deny_write_exec(&vma_flags, &vma_flags)) return -EACCES; /* Allow architectures to sanity-check the vm_flags. */ @@ -2854,7 +2855,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return -EINVAL; /* Map writable and ensure this isn't a sealed memfd. 
*/ - if (file && is_shared_maywrite_vm_flags(vm_flags)) { + if (file && is_shared_maywrite(&vma_flags)) { int error = mapping_map_writable(file->f_mapping); if (error) @@ -2862,7 +2863,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, writable_file_mapping = true; } - ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); + ret = __mmap_region(file, addr, len, vma_flags, pgoff, uf); /* Clear our write mapping regardless of error. */ if (writable_file_mapping) diff --git a/mm/vma.h b/mm/vma.h index 270008e5babc..adc18f7dd9f1 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -704,4 +704,55 @@ int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); #endif +#ifdef CONFIG_MMU +/* + * Denies creating a writable executable mapping or gaining executable permissions. + * + * This denies the following: + * + * a) mmap(PROT_WRITE | PROT_EXEC) + * + * b) mmap(PROT_WRITE) + * mprotect(PROT_EXEC) + * + * c) mmap(PROT_WRITE) + * mprotect(PROT_READ) + * mprotect(PROT_EXEC) + * + * But allows the following: + * + * d) mmap(PROT_READ | PROT_EXEC) + * mmap(PROT_READ | PROT_EXEC | PROT_BTI) + * + * This is only applicable if the user has set the Memory-Deny-Write-Execute + * (MDWE) protection mask for the current process. + * + * @old specifies the VMA flags the VMA originally possessed, and @new the ones + * we propose to set. + * + * Return: false if proposed change is OK, true if not ok and should be denied. + */ +static inline bool map_deny_write_exec(const vma_flags_t *old, + const vma_flags_t *new) +{ + /* If MDWE is disabled, we have nothing to deny. */ + if (!mm_flags_test(MMF_HAS_MDWE, current->mm)) + return false; + + /* If the new VMA is not executable, we have nothing to deny. */ + if (!vma_flags_test(new, VMA_EXEC_BIT)) + return false; + + /* Under MDWE we do not accept newly writably executable VMAs... 
*/ + if (vma_flags_test(new, VMA_WRITE_BIT)) + return true; + + /* ...nor previously non-executable VMAs becoming executable. */ + if (!vma_flags_test(old, VMA_EXEC_BIT)) + return true; + + return false; +} +#endif + #endif /* __MM_VMA_H */ diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 9dd57f50ea6d..ab92358b082c 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -1124,12 +1124,6 @@ static __always_inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc, #define vma_desc_clear_flags(desc, ...) \ vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__)) -static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags) -{ - return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == - (VM_SHARED | VM_MAYWRITE); -} - static inline bool is_shared_maywrite(const vma_flags_t *flags) { return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT); @@ -1446,27 +1440,6 @@ static inline bool mlock_future_ok(const struct mm_struct *mm, return locked_pages <= limit_pages; } -static inline bool map_deny_write_exec(unsigned long old, unsigned long new) -{ - /* If MDWE is disabled, we have nothing to deny. */ - if (mm_flags_test(MMF_HAS_MDWE, current->mm)) - return false; - - /* If the new VMA is not executable, we have nothing to deny. */ - if (!(new & VM_EXEC)) - return false; - - /* Under MDWE we do not accept newly writably executable VMAs... */ - if (new & VM_WRITE) - return true; - - /* ...nor previously non-executable VMAs becoming executable. */ - if (!(old & VM_EXEC)) - return true; - - return false; -} - static inline int mapping_map_writable(struct address_space *mapping) { return atomic_inc_unless_negative(&mapping->i_mmap_writable) ? 
@@ -1518,3 +1491,10 @@ static inline int get_sysctl_max_map_count(void) #ifndef pgtable_supports_soft_dirty #define pgtable_supports_soft_dirty() IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) #endif + +static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags) +{ + const vm_flags_t vm_flags = vma_flags_to_legacy(vma_flags); + + return vm_get_page_prot(vm_flags); +} diff --git a/tools/testing/vma/tests/mmap.c b/tools/testing/vma/tests/mmap.c index bded4ecbe5db..c85bc000d1cb 100644 --- a/tools/testing/vma/tests/mmap.c +++ b/tools/testing/vma/tests/mmap.c @@ -2,6 +2,8 @@ static bool test_mmap_region_basic(void) { + const vma_flags_t vma_flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, + VMA_MAYREAD_BIT, VMA_MAYWRITE_BIT); struct mm_struct mm = {}; unsigned long addr; struct vm_area_struct *vma; @@ -10,27 +12,19 @@ static bool test_mmap_region_basic(void) current->mm = &mm; /* Map at 0x300000, length 0x3000. */ - addr = __mmap_region(NULL, 0x300000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x300, NULL); + addr = __mmap_region(NULL, 0x300000, 0x3000, vma_flags, 0x300, NULL); ASSERT_EQ(addr, 0x300000); /* Map at 0x250000, length 0x3000. */ - addr = __mmap_region(NULL, 0x250000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x250, NULL); + addr = __mmap_region(NULL, 0x250000, 0x3000, vma_flags, 0x250, NULL); ASSERT_EQ(addr, 0x250000); /* Map at 0x303000, merging to 0x300000 of length 0x6000. */ - addr = __mmap_region(NULL, 0x303000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x303, NULL); + addr = __mmap_region(NULL, 0x303000, 0x3000, vma_flags, 0x303, NULL); ASSERT_EQ(addr, 0x303000); /* Map at 0x24d000, merging to 0x250000 of length 0x6000. 
*/ - addr = __mmap_region(NULL, 0x24d000, 0x3000, - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE, - 0x24d, NULL); + addr = __mmap_region(NULL, 0x24d000, 0x3000, vma_flags, 0x24d, NULL); ASSERT_EQ(addr, 0x24d000); ASSERT_EQ(mm.map_count, 2); From 71fdcf9283536370235b5f0a0e3a79b715bdb078 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 19:38:42 +0000 Subject: [PATCH 333/369] mm: simplify VMA flag tests of excluded flags We have implemented flag mask comparisons of the form: if ((vm_flags & (VM_FOO|VM_BAR|VM_BAZ)) == VM_FOO) { ... } Like-for-like in the code using a bitwise-and mask via vma_flags_and() and using vma_flags_same() to ensure the final result equals only the required flag value. This is fine but confusing, make things clearer by instead explicitly excluding undesired flags and including the desired one via tests of the form: if (vma_flags_test(&flags, VMA_FOO_BIT) && !vma_flags_test_any(&flags, VMA_BAR_BIT, VMA_BAZ_BIT)) { ... } Which makes it easier to understand what is going on. No functional change intended. Link: https://lkml.kernel.org/r/d395c5dd837a9864f5efcec42175910afbe3ce73.1774034900.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Suggested-by: Vlastimil Babka (SUSE) Acked-by: Vlastimil Babka (SUSE) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Al Viro Cc: Anton Ivanov Cc: "Borislav Petkov (AMD)" Cc: Catalin Marinas Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: Dinh Nguyen Cc: Heiko Carstens Cc: "H.
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Johannes Berg Cc: Kees Cook Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Ondrej Mosnacek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pedro Falcato Cc: Richard Weinberger Cc: Russell King Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: xu xin Signed-off-by: Andrew Morton --- mm/mprotect.c | 12 ++++-------- mm/vma.c | 7 +++---- mm/vma.h | 6 ++---- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 007d9a72b2f0..110d47a36d4b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -784,14 +784,10 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major * fault on access. */ - if (vma_flags_test(&new_vma_flags, VMA_WRITE_BIT)) { - const vma_flags_t mask = - vma_flags_and(&old_vma_flags, VMA_WRITE_BIT, - VMA_SHARED_BIT, VMA_LOCKED_BIT); - - if (vma_flags_same(&mask, VMA_LOCKED_BIT)) - populate_vma_page_range(vma, start, end, NULL); - } + if (vma_flags_test(&new_vma_flags, VMA_WRITE_BIT) && + vma_flags_test(&old_vma_flags, VMA_LOCKED_BIT) && + !vma_flags_test_any(&old_vma_flags, VMA_WRITE_BIT, VMA_SHARED_BIT)) + populate_vma_page_range(vma, start, end, NULL); vm_stat_account(mm, vma_flags_to_legacy(old_vma_flags), -nrpages); newflags = vma_flags_to_legacy(new_vma_flags); diff --git a/mm/vma.c b/mm/vma.c index c335f989586f..a4b30a069153 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2343,7 +2343,6 @@ void mm_drop_all_locks(struct mm_struct *mm) static bool accountable_mapping(struct mmap_state *map) { const struct file *file = map->file; - vma_flags_t mask; /* * hugetlb has its own accounting separate from the core VM @@ -2352,9 +2351,9 @@ static bool accountable_mapping(struct mmap_state *map) if (file && 
is_file_hugepages(file)) return false; - mask = vma_flags_and(&map->vma_flags, VMA_NORESERVE_BIT, VMA_SHARED_BIT, - VMA_WRITE_BIT); - return vma_flags_same(&mask, VMA_WRITE_BIT); + return vma_flags_test(&map->vma_flags, VMA_WRITE_BIT) && + !vma_flags_test_any(&map->vma_flags, VMA_NORESERVE_BIT, + VMA_SHARED_BIT); } /* diff --git a/mm/vma.h b/mm/vma.h index adc18f7dd9f1..1bfe7e47f6be 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -529,10 +529,8 @@ static inline bool is_data_mapping(vm_flags_t flags) static inline bool is_data_mapping_vma_flags(const vma_flags_t *vma_flags) { - const vma_flags_t mask = vma_flags_and(vma_flags, - VMA_WRITE_BIT, VMA_SHARED_BIT, VMA_STACK_BIT); - - return vma_flags_same(&mask, VMA_WRITE_BIT); + return vma_flags_test(vma_flags, VMA_WRITE_BIT) && + !vma_flags_test_any(vma_flags, VMA_SHARED_BIT, VMA_STACK_BIT); } static inline void vma_iter_config(struct vma_iterator *vmi, From 3e4bb2706817710d9461394da8b75be79981586b Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:27 +0000 Subject: [PATCH 334/369] mm: various small mmap_prepare cleanups Patch series "mm: expand mmap_prepare functionality and usage", v4. This series expands the mmap_prepare functionality, which is intended to replace the deprecated f_op->mmap hook which has been the source of bugs and security issues for some time. This series starts with some cleanup of existing mmap_prepare logic, then adds documentation for the mmap_prepare call to make it easier for filesystem and driver writers to understand how it works. It then importantly adds a vm_ops->mapped hook, a key feature that was missing from mmap_prepare previously - this is invoked when a driver which specifies mmap_prepare has successfully been mapped but not merged with another VMA. mmap_prepare is invoked prior to a merge being attempted, so you cannot manipulate state such as reference counts as if it were a new mapping. 
The vm_ops->mapped hook allows a driver to perform tasks required at this stage, and provides symmetry against subsequent vm_ops->open,close calls. The series uses this to correct the afs implementation which wrongly manipulated reference count at mmap_prepare time. It then adds an mmap_prepare equivalent of vm_iomap_memory() - mmap_action_simple_ioremap(), then uses this to update a number of drivers. It then splits out the mmap_prepare compatibility layer (which allows for invocation of mmap_prepare hooks in an mmap() hook) in such a way as to allow for more incremental implementation of mmap_prepare hooks. It then uses this to extend mmap_prepare usage in drivers. Finally it adds an mmap_prepare equivalent of vm_map_pages(), which lays the foundation for future work which will extend mmap_prepare to DMA coherent mappings. This patch (of 21): Rather than passing arbitrary fields, pass a vm_area_desc pointer to mmap prepare functions to mmap prepare, and an action and vma pointer to mmap complete in order to put all the action-specific logic in the function actually doing the work. Additionally, allow mmap prepare functions to return an error so we can error out as soon as possible if there is something logically incorrect in the input. Update remap_pfn_range_prepare() to properly check the input range for the CoW case. Also remove io_remap_pfn_range_complete(), as we can simply set up the fields correctly in io_remap_pfn_range_prepare() and use remap_pfn_range_complete() for this. While we're here, make remap_pfn_range_prepare_vma() a little neater, and pass mmap_action directly to call_action_complete(). Then, update compat_vma_mmap() to perform its logic directly, as __compat_vma_map() is not used by anything so we don't need to export it. Also update compat_vma_mmap() to use vfs_mmap_prepare() rather than calling the mmap_prepare op directly. Finally, update the VMA userland tests to reflect the changes. 
Link: https://lkml.kernel.org/r/cover.1774045440.git.ljs@kernel.org Link: https://lkml.kernel.org/r/99f408e4694f44ab12bdc55fe0bd9685d3bd1117.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- include/linux/fs.h | 2 - include/linux/mm.h | 7 +- mm/internal.h | 32 ++++---- mm/memory.c | 53 ++++++++----- mm/util.c | 121 +++++++++++++----------------- mm/vma.c | 24 +++--- tools/testing/vma/include/dup.h | 7 +- tools/testing/vma/include/stubs.h | 8 +- 8 files changed, 130 insertions(+), 124 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 8b3dd145b25e..a2628a12bd2b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2058,8 +2058,6 @@ static inline bool can_mmap_file(struct file *file) return true; } -int __compat_vma_mmap(const struct file_operations *f_op, - struct file *file, struct vm_area_struct *vma); int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9472b3c9a22b..6ca2fc5ae83f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4304,10 +4304,9 @@ static inline void mmap_action_ioremap_full(struct vm_area_desc *desc, mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc)); } -void mmap_action_prepare(struct mmap_action *action, - 
struct vm_area_desc *desc); -int mmap_action_complete(struct mmap_action *action, - struct vm_area_struct *vma); +int mmap_action_prepare(struct vm_area_desc *desc); +int mmap_action_complete(struct vm_area_struct *vma, + struct mmap_action *action); /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, diff --git a/mm/internal.h b/mm/internal.h index 9c690f8635da..4dddd89153d4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1839,26 +1839,28 @@ int walk_page_range_debug(struct mm_struct *mm, unsigned long start, void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm); int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); -void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn); -int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t pgprot); +int remap_pfn_range_prepare(struct vm_area_desc *desc); +int remap_pfn_range_complete(struct vm_area_struct *vma, + struct mmap_action *action); -static inline void io_remap_pfn_range_prepare(struct vm_area_desc *desc, - unsigned long orig_pfn, unsigned long size) +static inline int io_remap_pfn_range_prepare(struct vm_area_desc *desc) { + struct mmap_action *action = &desc->action; + const unsigned long orig_pfn = action->remap.start_pfn; + const pgprot_t orig_pgprot = action->remap.pgprot; + const unsigned long size = action->remap.size; const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); + int err; - return remap_pfn_range_prepare(desc, pfn); -} + action->remap.start_pfn = pfn; + action->remap.pgprot = pgprot_decrypted(orig_pgprot); + err = remap_pfn_range_prepare(desc); + if (err) + return err; -static inline int io_remap_pfn_range_complete(struct vm_area_struct *vma, - unsigned long addr, unsigned long orig_pfn, unsigned long size, - pgprot_t orig_prot) -{ - const unsigned long pfn = 
io_remap_pfn_range_pfn(orig_pfn, size); - const pgprot_t prot = pgprot_decrypted(orig_prot); - - return remap_pfn_range_complete(vma, addr, pfn, size, prot); + /* Remap does the actual work. */ + action->type = MMAP_REMAP_PFN; + return 0; } #ifdef CONFIG_MMU_NOTIFIER diff --git a/mm/memory.c b/mm/memory.c index 425e852a2eb7..10a61dd81f97 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3099,26 +3099,34 @@ static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } #endif -void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +int remap_pfn_range_prepare(struct vm_area_desc *desc) { - /* - * We set addr=VMA start, end=VMA end here, so this won't fail, but we - * check it again on complete and will fail there if specified addr is - * invalid. - */ - get_remap_pgoff(vma_desc_is_cow_mapping(desc), desc->start, desc->end, - desc->start, desc->end, pfn, &desc->pgoff); - vma_desc_set_flags_mask(desc, VMA_REMAP_FLAGS); -} - -static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size) -{ - unsigned long end = addr + PAGE_ALIGN(size); + const struct mmap_action *action = &desc->action; + const unsigned long start = action->remap.start; + const unsigned long end = start + action->remap.size; + const unsigned long pfn = action->remap.start_pfn; + const bool is_cow = vma_desc_is_cow_mapping(desc); int err; - err = get_remap_pgoff(is_cow_mapping(vma->vm_flags), addr, end, - vma->vm_start, vma->vm_end, pfn, &vma->vm_pgoff); + err = get_remap_pgoff(is_cow, start, end, desc->start, desc->end, pfn, + &desc->pgoff); + if (err) + return err; + + vma_desc_set_flags_mask(desc, VMA_REMAP_FLAGS); + return 0; +} + +static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, + unsigned long size) +{ + const unsigned long end = addr + PAGE_ALIGN(size); + const bool is_cow = is_cow_mapping(vma->vm_flags); + int err; + + err = 
get_remap_pgoff(is_cow, addr, end, vma->vm_start, vma->vm_end, + pfn, &vma->vm_pgoff); if (err) return err; @@ -3151,10 +3159,15 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(remap_pfn_range); -int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) +int remap_pfn_range_complete(struct vm_area_struct *vma, + struct mmap_action *action) { - return do_remap_pfn_range(vma, addr, pfn, size, prot); + const unsigned long start = action->remap.start; + const unsigned long pfn = action->remap.start_pfn; + const unsigned long size = action->remap.size; + const pgprot_t prot = action->remap.pgprot; + + return do_remap_pfn_range(vma, start, pfn, size, prot); } /** diff --git a/mm/util.c b/mm/util.c index ce7ae80047cf..73c97a748d8e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1163,43 +1163,6 @@ void flush_dcache_folio(struct folio *folio) EXPORT_SYMBOL(flush_dcache_folio); #endif -/** - * __compat_vma_mmap() - See description for compat_vma_mmap() - * for details. This is the same operation, only with a specific file operations - * struct which may or may not be the same as vma->vm_file->f_op. - * @f_op: The file operations whose .mmap_prepare() hook is specified. - * @file: The file which backs or will back the mapping. - * @vma: The VMA to apply the .mmap_prepare() hook to. - * Returns: 0 on success or error. 
- */ -int __compat_vma_mmap(const struct file_operations *f_op, - struct file *file, struct vm_area_struct *vma) -{ - struct vm_area_desc desc = { - .mm = vma->vm_mm, - .file = file, - .start = vma->vm_start, - .end = vma->vm_end, - - .pgoff = vma->vm_pgoff, - .vm_file = vma->vm_file, - .vma_flags = vma->flags, - .page_prot = vma->vm_page_prot, - - .action.type = MMAP_NOTHING, /* Default */ - }; - int err; - - err = f_op->mmap_prepare(&desc); - if (err) - return err; - - mmap_action_prepare(&desc.action, &desc); - set_vma_from_desc(vma, &desc); - return mmap_action_complete(&desc.action, vma); -} -EXPORT_SYMBOL(__compat_vma_mmap); - /** * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an * existing VMA and execute any requested actions. @@ -1228,7 +1191,31 @@ EXPORT_SYMBOL(__compat_vma_mmap); */ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { - return __compat_vma_mmap(file->f_op, file, vma); + struct vm_area_desc desc = { + .mm = vma->vm_mm, + .file = file, + .start = vma->vm_start, + .end = vma->vm_end, + + .pgoff = vma->vm_pgoff, + .vm_file = vma->vm_file, + .vma_flags = vma->flags, + .page_prot = vma->vm_page_prot, + + .action.type = MMAP_NOTHING, /* Default */ + }; + int err; + + err = vfs_mmap_prepare(file, &desc); + if (err) + return err; + + err = mmap_action_prepare(&desc); + if (err) + return err; + + set_vma_from_desc(vma, &desc); + return mmap_action_complete(vma, &desc.action); } EXPORT_SYMBOL(compat_vma_mmap); @@ -1320,8 +1307,8 @@ again: } } -static int mmap_action_finish(struct mmap_action *action, - const struct vm_area_struct *vma, int err) +static int mmap_action_finish(struct vm_area_struct *vma, + struct mmap_action *action, int err) { /* * If an error occurs, unmap the VMA altogether and return an error. We @@ -1353,37 +1340,38 @@ static int mmap_action_finish(struct mmap_action *action, /** * mmap_action_prepare - Perform preparatory setup for an VMA descriptor * action which need to be performed. 
- * @desc: The VMA descriptor to prepare for @action. - * @action: The action to perform. + * @desc: The VMA descriptor to prepare for its @desc->action. + * + * Returns: %0 on success, otherwise error. */ -void mmap_action_prepare(struct mmap_action *action, - struct vm_area_desc *desc) +int mmap_action_prepare(struct vm_area_desc *desc) { - switch (action->type) { + switch (desc->action.type) { case MMAP_NOTHING: - break; + return 0; case MMAP_REMAP_PFN: - remap_pfn_range_prepare(desc, action->remap.start_pfn); - break; + return remap_pfn_range_prepare(desc); case MMAP_IO_REMAP_PFN: - io_remap_pfn_range_prepare(desc, action->remap.start_pfn, - action->remap.size); - break; + return io_remap_pfn_range_prepare(desc); } + + WARN_ON_ONCE(1); + return -EINVAL; } EXPORT_SYMBOL(mmap_action_prepare); /** * mmap_action_complete - Execute VMA descriptor action. - * @action: The action to perform. * @vma: The VMA to perform the action upon. + * @action: The action to perform. * * Similar to mmap_action_prepare(). * * Return: 0 on success, or error, at which point the VMA will be unmapped. */ -int mmap_action_complete(struct mmap_action *action, - struct vm_area_struct *vma) +int mmap_action_complete(struct vm_area_struct *vma, + struct mmap_action *action) + { int err = 0; @@ -1391,25 +1379,22 @@ int mmap_action_complete(struct mmap_action *action, case MMAP_NOTHING: break; case MMAP_REMAP_PFN: - err = remap_pfn_range_complete(vma, action->remap.start, - action->remap.start_pfn, action->remap.size, - action->remap.pgprot); + err = remap_pfn_range_complete(vma, action); break; case MMAP_IO_REMAP_PFN: - err = io_remap_pfn_range_complete(vma, action->remap.start, - action->remap.start_pfn, action->remap.size, - action->remap.pgprot); + /* Should have been delegated. 
*/ + WARN_ON_ONCE(1); + err = -EINVAL; break; } - return mmap_action_finish(action, vma, err); + return mmap_action_finish(vma, action, err); } EXPORT_SYMBOL(mmap_action_complete); #else -void mmap_action_prepare(struct mmap_action *action, - struct vm_area_desc *desc) +int mmap_action_prepare(struct vm_area_desc *desc) { - switch (action->type) { + switch (desc->action.type) { case MMAP_NOTHING: break; case MMAP_REMAP_PFN: @@ -1417,11 +1402,13 @@ void mmap_action_prepare(struct mmap_action *action, WARN_ON_ONCE(1); /* nommu cannot handle these. */ break; } + + return 0; } EXPORT_SYMBOL(mmap_action_prepare); -int mmap_action_complete(struct mmap_action *action, - struct vm_area_struct *vma) +int mmap_action_complete(struct vm_area_struct *vma, + struct mmap_action *action) { int err = 0; @@ -1436,7 +1423,7 @@ int mmap_action_complete(struct mmap_action *action, break; } - return mmap_action_finish(action, vma, err); + return mmap_action_finish(vma, action, err); } EXPORT_SYMBOL(mmap_action_complete); #endif diff --git a/mm/vma.c b/mm/vma.c index a4b30a069153..1e2996a12d7f 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2640,15 +2640,18 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vma_set_page_prot(vma); } -static void call_action_prepare(struct mmap_state *map, - struct vm_area_desc *desc) +static int call_action_prepare(struct mmap_state *map, + struct vm_area_desc *desc) { - struct mmap_action *action = &desc->action; + int err; - mmap_action_prepare(action, desc); + err = mmap_action_prepare(desc); + if (err) + return err; - if (action->hide_from_rmap_until_complete) + if (desc->action.hide_from_rmap_until_complete) map->hold_file_rmap_lock = true; + return 0; } /* @@ -2672,7 +2675,9 @@ static int call_mmap_prepare(struct mmap_state *map, if (err) return err; - call_action_prepare(map, desc); + err = call_action_prepare(map, desc); + if (err) + return err; /* Update fields permitted to be changed. 
*/ map->pgoff = desc->pgoff; @@ -2727,13 +2732,12 @@ static bool can_set_ksm_flags_early(struct mmap_state *map) } static int call_action_complete(struct mmap_state *map, - struct vm_area_desc *desc, + struct mmap_action *action, struct vm_area_struct *vma) { - struct mmap_action *action = &desc->action; int ret; - ret = mmap_action_complete(action, vma); + ret = mmap_action_complete(vma, action); /* If we held the file rmap we need to release it. */ if (map->hold_file_rmap_lock) { @@ -2795,7 +2799,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, __mmap_complete(&map, vma); if (have_mmap_prepare && allocated_new) { - error = call_action_complete(&map, &desc, vma); + error = call_action_complete(&map, &desc.action, vma); if (error) return error; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index ab92358b082c..e7581efaf470 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -1277,9 +1277,12 @@ static inline int __compat_vma_mmap(const struct file_operations *f_op, if (err) return err; - mmap_action_prepare(&desc.action, &desc); + err = mmap_action_prepare(&desc); + if (err) + return err; + set_vma_from_desc(vma, &desc); - return mmap_action_complete(&desc.action, vma); + return mmap_action_complete(vma, &desc.action); } static inline int compat_vma_mmap(struct file *file, diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h index 5afb0afe2d48..a30b8bc84955 100644 --- a/tools/testing/vma/include/stubs.h +++ b/tools/testing/vma/include/stubs.h @@ -81,13 +81,13 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma) { } -static inline void mmap_action_prepare(struct mmap_action *action, - struct vm_area_desc *desc) +static inline int mmap_action_prepare(struct vm_area_desc *desc) { + return 0; } -static inline int mmap_action_complete(struct mmap_action *action, - struct vm_area_struct *vma) +static inline int 
mmap_action_complete(struct vm_area_struct *vma, + struct mmap_action *action) { return 0; } From fdd247841380d86c9e38027cb519b6fc45930a83 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:28 +0000 Subject: [PATCH 335/369] mm: add documentation for the mmap_prepare file operation callback This documentation makes it easier for a driver/file system implementer to correctly use this callback. It covers the fundamentals, whilst intentionally leaving the less lovely possible actions one might take undocumented (for instance - the success_hook, error_hook fields in mmap_action). The document also covers the new VMA flags implementation which is the only one which will work correctly with mmap_prepare. Link: https://lkml.kernel.org/r/3aebf918c213fa2aecf00a31a444119b5bdd7801.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- Documentation/filesystems/index.rst | 1 + Documentation/filesystems/mmap_prepare.rst | 142 +++++++++++++++++++++ 2 files changed, 143 insertions(+) create mode 100644 Documentation/filesystems/mmap_prepare.rst diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index f4873197587d..6cbc3e0292ae 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -29,6 +29,7 @@ algorithms work. 
fiemap files locks + mmap_prepare multigrain-ts mount_api quota diff --git a/Documentation/filesystems/mmap_prepare.rst b/Documentation/filesystems/mmap_prepare.rst new file mode 100644 index 000000000000..ae484d371861 --- /dev/null +++ b/Documentation/filesystems/mmap_prepare.rst @@ -0,0 +1,142 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================== +mmap_prepare callback HOWTO +=========================== + +Introduction +============ + +The ``struct file->f_op->mmap()`` callback has been deprecated as it is both a +stability and security risk, and doesn't always permit the merging of adjacent +mappings resulting in unnecessary memory fragmentation. + +It has been replaced with the ``file->f_op->mmap_prepare()`` callback which +solves these problems. + +This hook is called right at the beginning of setting up the mapping, and +importantly it is invoked *before* any merging of adjacent mappings has taken +place. + +If an error arises upon mapping, it might arise after this callback has been +invoked, therefore it should be treated as effectively stateless. + +That is - no resources should be allocated nor state updated to reflect that a +mapping has been established, as the mapping may either be merged, or fail to be +mapped after the callback is complete. + +How To Use +========== + +In your driver's struct file_operations struct, specify an ``mmap_prepare`` +callback rather than an ``mmap`` one, e.g. for ext4: + +.. code-block:: C + + const struct file_operations ext4_file_operations = { + ... + .mmap_prepare = ext4_file_mmap_prepare, + }; + +This has a signature of ``int (*mmap_prepare)(struct vm_area_desc *)``. + +Examining the struct vm_area_desc type: + +.. code-block:: C + + struct vm_area_desc { + /* Immutable state. */ + const struct mm_struct *const mm; + struct file *const file; /* May vary from vm_file in stacked callers. */ + unsigned long start; + unsigned long end; + + /* Mutable fields. Populated with initial state. 
*/ + pgoff_t pgoff; + struct file *vm_file; + vma_flags_t vma_flags; + pgprot_t page_prot; + + /* Write-only fields. */ + const struct vm_operations_struct *vm_ops; + void *private_data; + + /* Take further action? */ + struct mmap_action action; + }; + +This is straightforward - you have all the fields you need to set up the +mapping, and you can update the mutable and writable fields, for instance: + +.. code-block:: C + + static int ext4_file_mmap_prepare(struct vm_area_desc *desc) + { + int ret; + struct file *file = desc->file; + struct inode *inode = file->f_mapping->host; + + ... + + file_accessed(file); + if (IS_DAX(file_inode(file))) { + desc->vm_ops = &ext4_dax_vm_ops; + vma_desc_set_flags(desc, VMA_HUGEPAGE_BIT); + } else { + desc->vm_ops = &ext4_file_vm_ops; + } + return 0; + } + +Importantly, you no longer have to dance around with reference counts or locks +when updating these fields - **you can simply go ahead and change them**. + +Everything is taken care of by the mapping code. + +VMA Flags +--------- + +Along with ``mmap_prepare``, VMA flags have undergone an overhaul. Where before +you would invoke one of vm_flags_init(), vm_flags_reset(), vm_flags_set(), +vm_flags_clear(), and vm_flags_mod() to modify flags (and to have the +locking done correctly for you), this is no longer necessary. + +Also, the legacy approach of specifying VMA flags via ``VM_READ``, ``VM_WRITE``, +etc. - i.e. using a ``VM_xxx`` macro - has changed too. + +When implementing mmap_prepare(), reference flags by their bit number, defined +as a ``VMA_xxx_BIT`` macro, e.g. ``VMA_READ_BIT``, ``VMA_WRITE_BIT`` etc., +and use one of (where ``desc`` is a pointer to struct vm_area_desc): + +* ``vma_desc_test_any(desc, ...)`` - Specify a comma-separated list of flags + you wish to test for (whether _any_ are set), e.g. - ``vma_desc_test_any( + desc, VMA_WRITE_BIT, VMA_MAYWRITE_BIT)`` - returns ``true`` if either are set, + otherwise ``false``.
+* ``vma_desc_set_flags(desc, ...)`` - Update the VMA descriptor flags to set + additional flags specified by a comma-separated list, + e.g. - ``vma_desc_set_flags(desc, VMA_PFNMAP_BIT, VMA_IO_BIT)``. +* ``vma_desc_clear_flags(desc, ...)`` - Update the VMA descriptor flags to clear + flags specified by a comma-separated list, e.g. - ``vma_desc_clear_flags( + desc, VMA_WRITE_BIT, VMA_MAYWRITE_BIT)``. + +Actions +======= + +You can now very easily have actions be performed upon a mapping once set up by +utilising simple helper functions invoked upon the struct vm_area_desc +pointer. These are: + +* mmap_action_remap() - Remaps a range consisting only of PFNs for a specific + range starting a virtual address and PFN number of a set size. + +* mmap_action_remap_full() - Same as mmap_action_remap(), only remaps the + entire mapping from ``start_pfn`` onward. + +* mmap_action_ioremap() - Same as mmap_action_remap(), only performs an I/O + remap. + +* mmap_action_ioremap_full() - Same as mmap_action_ioremap(), only remaps + the entire mapping from ``start_pfn`` onward. + +**NOTE:** The ``action`` field should never normally be manipulated directly, +rather you ought to use one of these helpers. From 827e97cf4bf59e9a72bcec37944bcebb3139a457 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:29 +0000 Subject: [PATCH 336/369] mm: document vm_operations_struct->open the same as close() Describe when the operation is invoked and the context in which it is invoked, matching the description already added for vm_op->close(). While we're here, update all outdated references to an 'area' field for VMAs to the more consistent 'vma'. 
Link: https://lkml.kernel.org/r/7d0ca833c12014320f0fa00f816f95e6e10076f2.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- include/linux/mm.h | 15 ++++++++++----- tools/testing/vma/include/dup.h | 15 ++++++++++----- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6ca2fc5ae83f..21a2eef5f8fe 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -764,15 +764,20 @@ struct vm_fault { * to the functions called when a no-page or a wp-page exception occurs. */ struct vm_operations_struct { - void (*open)(struct vm_area_struct * area); + /** + * @open: Called when a VMA is remapped, split or forked. Not called + * upon first mapping a VMA. + * Context: User context. May sleep. Caller holds mmap_lock. + */ + void (*open)(struct vm_area_struct *vma); /** * @close: Called when the VMA is being removed from the MM. * Context: User context. May sleep. Caller holds mmap_lock. 
*/ - void (*close)(struct vm_area_struct * area); + void (*close)(struct vm_area_struct *vma); /* Called any time before splitting to check if it's allowed */ - int (*may_split)(struct vm_area_struct *area, unsigned long addr); - int (*mremap)(struct vm_area_struct *area); + int (*may_split)(struct vm_area_struct *vma, unsigned long addr); + int (*mremap)(struct vm_area_struct *vma); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not @@ -784,7 +789,7 @@ struct vm_operations_struct { vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); - unsigned long (*pagesize)(struct vm_area_struct * area); + unsigned long (*pagesize)(struct vm_area_struct *vma); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index e7581efaf470..5bc04c801504 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -632,15 +632,20 @@ struct vm_area_struct { } __randomize_layout; struct vm_operations_struct { - void (*open)(struct vm_area_struct * area); + /** + * @open: Called when a VMA is remapped, split or forked. Not called + * upon first mapping a VMA. + * Context: User context. May sleep. Caller holds mmap_lock. + */ + void (*open)(struct vm_area_struct *vma); /** * @close: Called when the VMA is being removed from the MM. * Context: User context. May sleep. Caller holds mmap_lock. 
*/ - void (*close)(struct vm_area_struct * area); + void (*close)(struct vm_area_struct *vma); /* Called any time before splitting to check if it's allowed */ - int (*may_split)(struct vm_area_struct *area, unsigned long addr); - int (*mremap)(struct vm_area_struct *area); + int (*may_split)(struct vm_area_struct *vma, unsigned long addr); + int (*mremap)(struct vm_area_struct *vma); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not @@ -652,7 +657,7 @@ struct vm_operations_struct { vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); - unsigned long (*pagesize)(struct vm_area_struct * area); + unsigned long (*pagesize)(struct vm_area_struct *vma); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ From f96e1d5f15b7c854a6a9ec1225d68a12fe7dcda6 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:30 +0000 Subject: [PATCH 337/369] mm: avoid deadlock when holding rmap on mmap_prepare error Commit ac0a3fc9c07d ("mm: add ability to take further action in vm_area_desc") added the ability for drivers to instruct mm to take actions after the .mmap_prepare callback is complete. To make life simpler and safer, this is done before the VMA/mmap write lock is dropped but when the VMA is completely established. So on error, we simply munmap() the VMA. As part of this implementation, unfortunately a horrible hack had to be implemented to support some questionable behaviour hugetlb relies upon - that is that the file rmap lock is held until the operation is complete. The implementation, for convenience, did this in mmap_action_finish() so both the VMA and mmap_prepare compatibility layer paths would have this correctly handled. 
However, it turns out there is a mistake here - the rmap lock cannot be held on munmap, as free_pgtables() -> unlink_file_vma_batch_add() -> unlink_file_vma_batch_process() takes the file rmap lock. We therefore currently have a deadlock issue that might arise. Resolve this by leaving it to callers to handle the unmap. The compatibility layer does not support this rmap behaviour, so we simply have it unmap on error after calling mmap_action_complete(). In the VMA implementation, we only perform the unmap after the rmap lock is dropped. This resolves the issue by ensuring the rmap lock is always dropped when the unmap occurs. Link: https://lkml.kernel.org/r/d44248be9da68258b07c2c59d4e73485ee0ca943.1774045440.git.ljs@kernel.org Fixes: ac0a3fc9c07d ("mm: add ability to take further action in vm_area_desc") Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. 
Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Vlastimil Babka (SUSE) Cc: Wei Liu Cc: Signed-off-by: Andrew Morton --- mm/util.c | 12 +++++++----- mm/vma.c | 13 ++++++++++--- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/mm/util.c b/mm/util.c index 73c97a748d8e..a2cfa0d77c35 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1215,7 +1215,13 @@ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) return err; set_vma_from_desc(vma, &desc); - return mmap_action_complete(vma, &desc.action); + err = mmap_action_complete(vma, &desc.action); + if (err) { + const size_t len = vma_pages(vma) << PAGE_SHIFT; + + do_munmap(current->mm, vma->vm_start, len, NULL); + } + return err; } EXPORT_SYMBOL(compat_vma_mmap); @@ -1316,10 +1322,6 @@ static int mmap_action_finish(struct vm_area_struct *vma, * invoked if we do NOT merge, so we only clean up the VMA we created. */ if (err) { - const size_t len = vma_pages(vma) << PAGE_SHIFT; - - do_munmap(current->mm, vma->vm_start, len, NULL); - if (action->error_hook) { /* We may want to filter the error. */ err = action->error_hook(err); diff --git a/mm/vma.c b/mm/vma.c index 1e2996a12d7f..4095834dce09 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2735,9 +2735,9 @@ static int call_action_complete(struct mmap_state *map, struct mmap_action *action, struct vm_area_struct *vma) { - int ret; + int err; - ret = mmap_action_complete(vma, action); + err = mmap_action_complete(vma, action); /* If we held the file rmap we need to release it. 
*/ if (map->hold_file_rmap_lock) { @@ -2745,7 +2745,14 @@ static int call_action_complete(struct mmap_state *map, i_mmap_unlock_write(file->f_mapping); } - return ret; + + if (err) { + const size_t len = vma_pages(vma) << PAGE_SHIFT; + + do_munmap(current->mm, vma->vm_start, len, NULL); + } + + return err; } static unsigned long __mmap_region(struct file *file, unsigned long addr, From 33506d4bae9516beb9ad22ad3ce18c1a8e47aeb0 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:31 +0000 Subject: [PATCH 338/369] mm: switch the rmap lock held option off in compat layer In the mmap_prepare compatibility layer, we don't need to hold the rmap lock, as we are being called from an .mmap handler. The .mmap_prepare hook, when invoked in the VMA logic, is called prior to the VMA being instantiated, but the completion hook is called after the VMA is linked into the maple tree, meaning rmap walkers can reach it. The mmap hook does not link the VMA into the tree, so this cannot happen. Therefore it's safe to simply disable this in the mmap_prepare compatibility layer. Also update VMA tests code to reflect current compatibility layer state. [akpm@linux-foundation.org: fix comment typo, per Vlastimil] Link: https://lkml.kernel.org/r/dda74230d26a1fcd79a3efab61fa4101dd1cac64.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. 
Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- mm/util.c | 6 ++++- tools/testing/vma/include/dup.h | 42 +++++++++++++++++---------------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/mm/util.c b/mm/util.c index a2cfa0d77c35..54eab29adb56 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1204,6 +1204,7 @@ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) .action.type = MMAP_NOTHING, /* Default */ }; + struct mmap_action *action = &desc.action; int err; err = vfs_mmap_prepare(file, &desc); @@ -1214,8 +1215,11 @@ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) if (err) return err; + /* being invoked from .mmap means we don't have to enforce this. */ + action->hide_from_rmap_until_complete = false; + set_vma_from_desc(vma, &desc); - err = mmap_action_complete(vma, &desc.action); + err = mmap_action_complete(vma, action); if (err) { const size_t len = vma_pages(vma) << PAGE_SHIFT; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 5bc04c801504..64bb56980b9c 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -1260,8 +1260,17 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma) static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc); -static inline int __compat_vma_mmap(const struct file_operations *f_op, - struct file *file, struct vm_area_struct *vma) +static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc) +{ + return file->f_op->mmap_prepare(desc); +} + +static inline unsigned long vma_pages(struct vm_area_struct *vma) +{ + return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; +} + +static inline int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { .mm = 
vma->vm_mm, @@ -1276,9 +1285,10 @@ static inline int __compat_vma_mmap(const struct file_operations *f_op, .action.type = MMAP_NOTHING, /* Default */ }; + struct mmap_action *action = &desc.action; int err; - err = f_op->mmap_prepare(&desc); + err = vfs_mmap_prepare(file, &desc); if (err) return err; @@ -1286,28 +1296,25 @@ static inline int __compat_vma_mmap(const struct file_operations *f_op, if (err) return err; + /* being invoked from .mmmap means we don't have to enforce this. */ + action->hide_from_rmap_until_complete = false; + set_vma_from_desc(vma, &desc); - return mmap_action_complete(vma, &desc.action); -} + err = mmap_action_complete(vma, action); + if (err) { + const size_t len = vma_pages(vma) << PAGE_SHIFT; -static inline int compat_vma_mmap(struct file *file, - struct vm_area_struct *vma) -{ - return __compat_vma_mmap(file->f_op, file, vma); + do_munmap(current->mm, vma->vm_start, len, NULL); + } + return err; } - static inline void vma_iter_init(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long addr) { mas_init(&vmi->mas, &mm->mm_mt, addr); } -static inline unsigned long vma_pages(struct vm_area_struct *vma) -{ - return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; -} - static inline void mmap_assert_locked(struct mm_struct *); static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, unsigned long start_addr, @@ -1477,11 +1484,6 @@ static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) return file->f_op->mmap(file, vma); } -static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc) -{ - return file->f_op->mmap_prepare(desc); -} - static inline void vma_set_file(struct vm_area_struct *vma, struct file *file) { /* Changing an anonymous vma with this is illegal */ From 04501e759e789288ba8359e04f83bcc00269611e Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:32 +0000 Subject: [PATCH 339/369] mm/vma: remove superfluous 
map->hold_file_rmap_lock We don't need to reference this field, it's confusing as it duplicates mmap_action->hide_from_rmap_until_complete, so thread the mmap_action through to __mmap_new_vma() instead and use the same field consistently. Link: https://lkml.kernel.org/r/42c3fbb701e361a17193ecda0d2dabcc326288a5.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- mm/vma.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/mm/vma.c b/mm/vma.c index 4095834dce09..8ad24be1654e 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -38,8 +38,6 @@ struct mmap_state { /* Determine if we can check KSM flags early in mmap() logic. */ bool check_ksm_early :1; - /* If we map new, hold the file rmap lock on mapping. */ - bool hold_file_rmap_lock :1; /* If .mmap_prepare changed the file, we don't need to pin. */ bool file_doesnt_need_get :1; }; @@ -2530,10 +2528,12 @@ static int __mmap_new_file_vma(struct mmap_state *map, * * @map: Mapping state. * @vmap: Output pointer for the new VMA. + * @action: Any mmap_prepare action that is still to complete. * * Returns: Zero on success, or an error. 
*/ -static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) +static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap, + struct mmap_action *action) { struct vma_iterator *vmi = map->vmi; int error = 0; @@ -2582,7 +2582,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) vma_start_write(vma); vma_iter_store_new(vmi, vma); map->mm->map_count++; - vma_link_file(vma, map->hold_file_rmap_lock); + vma_link_file(vma, action->hide_from_rmap_until_complete); /* * vma_merge_new_range() calls khugepaged_enter_vma() too, the below @@ -2649,8 +2649,6 @@ static int call_action_prepare(struct mmap_state *map, if (err) return err; - if (desc->action.hide_from_rmap_until_complete) - map->hold_file_rmap_lock = true; return 0; } @@ -2740,7 +2738,7 @@ static int call_action_complete(struct mmap_state *map, err = mmap_action_complete(vma, action); /* If we held the file rmap we need to release it. */ - if (map->hold_file_rmap_lock) { + if (action->hide_from_rmap_until_complete) { struct file *file = vma->vm_file; i_mmap_unlock_write(file->f_mapping); @@ -2794,7 +2792,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, /* ...but if we can't, allocate a new VMA. */ if (!vma) { - error = __mmap_new_vma(&map, &vma); + error = __mmap_new_vma(&map, &vma, &desc.action); if (error) goto unacct_error; allocated_new = true; From 382c0f2895d2f16252941fe739227e081eb50f1f Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:33 +0000 Subject: [PATCH 340/369] mm: have mmap_action_complete() handle the rmap lock and unmap Rather than have the callers handle this both the rmap lock release and unmapping the VMA on error, handle it within the mmap_action_complete() logic where it makes sense to, being careful not to unlock twice. 
This simplifies the logic and makes it harder to make mistake with this, while retaining correct behaviour with regard to avoiding deadlocks. Also replace the call_action_complete() function with a direct invocation of mmap_action_complete() as the abstraction is no longer required. Also update the VMA tests to reflect this change. Link: https://lkml.kernel.org/r/8d1ee8ebd3542d006a47e8382fb80cf5b57ecf10.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- mm/internal.h | 19 +++++++++++++++ mm/util.c | 41 +++++++++++++++------------------ mm/vma.c | 26 +-------------------- tools/testing/vma/include/dup.h | 8 +------ 4 files changed, 40 insertions(+), 54 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 4dddd89153d4..241510e21f4b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1863,6 +1863,25 @@ static inline int io_remap_pfn_range_prepare(struct vm_area_desc *desc) return 0; } +/* + * When we succeed an mmap action or just before we unmap a VMA on error, we + * need to ensure any rmap lock held is released. On unmap it's required to + * avoid a deadlock. 
+ */ +static inline void maybe_rmap_unlock_action(struct vm_area_struct *vma, + struct mmap_action *action) +{ + struct file *file; + + if (!action->hide_from_rmap_until_complete) + return; + + VM_WARN_ON_ONCE(vma_is_anonymous(vma)); + file = vma->vm_file; + i_mmap_unlock_write(file->f_mapping); + action->hide_from_rmap_until_complete = false; +} + #ifdef CONFIG_MMU_NOTIFIER static inline bool clear_flush_young_ptes_notify(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) diff --git a/mm/util.c b/mm/util.c index 54eab29adb56..e272efca8c0e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1219,13 +1219,7 @@ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) action->hide_from_rmap_until_complete = false; set_vma_from_desc(vma, &desc); - err = mmap_action_complete(vma, action); - if (err) { - const size_t len = vma_pages(vma) << PAGE_SHIFT; - - do_munmap(current->mm, vma->vm_start, len, NULL); - } - return err; + return mmap_action_complete(vma, action); } EXPORT_SYMBOL(compat_vma_mmap); @@ -1320,26 +1314,30 @@ again: static int mmap_action_finish(struct vm_area_struct *vma, struct mmap_action *action, int err) { + size_t len; + + if (!err && action->success_hook) + err = action->success_hook(vma); + + /* do_munmap() might take rmap lock, so release if held. */ + maybe_rmap_unlock_action(vma, action); + if (!err) + return 0; + /* * If an error occurs, unmap the VMA altogether and return an error. We * only clear the newly allocated VMA, since this function is only * invoked if we do NOT merge, so we only clean up the VMA we created. */ - if (err) { - if (action->error_hook) { - /* We may want to filter the error. */ - err = action->error_hook(err); - - /* The caller should not clear the error. */ - VM_WARN_ON_ONCE(!err); - } - return err; + len = vma_pages(vma) << PAGE_SHIFT; + do_munmap(current->mm, vma->vm_start, len, NULL); + if (action->error_hook) { + /* We may want to filter the error. 
*/ + err = action->error_hook(err); + /* The caller should not clear the error. */ + VM_WARN_ON_ONCE(!err); } - - if (action->success_hook) - return action->success_hook(vma); - - return 0; + return err; } #ifdef CONFIG_MMU @@ -1377,7 +1375,6 @@ EXPORT_SYMBOL(mmap_action_prepare); */ int mmap_action_complete(struct vm_area_struct *vma, struct mmap_action *action) - { int err = 0; diff --git a/mm/vma.c b/mm/vma.c index 8ad24be1654e..e1950ae048e2 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2729,30 +2729,6 @@ static bool can_set_ksm_flags_early(struct mmap_state *map) return false; } -static int call_action_complete(struct mmap_state *map, - struct mmap_action *action, - struct vm_area_struct *vma) -{ - int err; - - err = mmap_action_complete(vma, action); - - /* If we held the file rmap we need to release it. */ - if (action->hide_from_rmap_until_complete) { - struct file *file = vma->vm_file; - - i_mmap_unlock_write(file->f_mapping); - } - - if (err) { - const size_t len = vma_pages(vma) << PAGE_SHIFT; - - do_munmap(current->mm, vma->vm_start, len, NULL); - } - - return err; -} - static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vma_flags_t vma_flags, unsigned long pgoff, struct list_head *uf) @@ -2804,7 +2780,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, __mmap_complete(&map, vma); if (have_mmap_prepare && allocated_new) { - error = call_action_complete(&map, &desc.action, vma); + error = mmap_action_complete(vma, &desc.action); if (error) return error; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 64bb56980b9c..a95a4b07f68b 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -1300,13 +1300,7 @@ static inline int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) action->hide_from_rmap_until_complete = false; set_vma_from_desc(vma, &desc); - err = mmap_action_complete(vma, action); - if (err) { - const size_t 
len = vma_pages(vma) << PAGE_SHIFT; - - do_munmap(current->mm, vma->vm_start, len, NULL); - } - return err; + return mmap_action_complete(vma, action); } static inline void vma_iter_init(struct vma_iterator *vmi, From c50ca15dd4962bdf834945c2fa29b904042f366a Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:34 +0000 Subject: [PATCH 341/369] mm: add vm_ops->mapped hook Previously, when a driver needed to do something like establish a reference count, it could do so in the mmap hook in the knowledge that the mapping would succeed. With the introduction of f_op->mmap_prepare this is no longer the case, as it is invoked prior to actually establishing the mapping. mmap_prepare is not appropriate for this kind of thing as it is called before any merge might take place, and after which an error might occur meaning resources could be leaked. To take this into account, introduce a new vm_ops->mapped callback which is invoked when the VMA is first mapped (though notably - not when it is merged - which is correct and mirrors existing mmap/open/close behaviour). We do better that vm_ops->open() here, as this callback can return an error, at which point the VMA will be unmapped. Note that vm_ops->mapped() is invoked after any mmap action is complete (such as I/O remapping). We intentionally do not expose the VMA at this point, exposing only the fields that could be used, and an output parameter in case the operation needs to update the vma->vm_private_data field. In order to deal with stacked filesystems which invoke inner filesystem's mmap() invocations, add __compat_vma_mapped() and invoke it on vfs_mmap() (via compat_vma_mmap()) to ensure that the mapped callback is handled when an mmap() caller invokes a nested filesystem's mmap_prepare() callback. Update the mmap_prepare documentation to describe the mapped hook and make it clear what its intended use is. 
The vm_ops->mapped() call is handled by the mmap complete logic to ensure the same code paths are handled by both the compatibility and VMA layers. Additionally, update VMA userland test headers to reflect the change. Link: https://lkml.kernel.org/r/4c5e98297eb0aae9565c564e1c296a112702f144.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- Documentation/filesystems/mmap_prepare.rst | 15 ++++ include/linux/fs.h | 9 ++- include/linux/mm.h | 17 ++++ mm/util.c | 90 +++++++++++++++------- mm/vma.c | 1 - tools/testing/vma/include/dup.h | 17 ++++ 6 files changed, 120 insertions(+), 29 deletions(-) diff --git a/Documentation/filesystems/mmap_prepare.rst b/Documentation/filesystems/mmap_prepare.rst index ae484d371861..f14b35ee11d5 100644 --- a/Documentation/filesystems/mmap_prepare.rst +++ b/Documentation/filesystems/mmap_prepare.rst @@ -25,6 +25,21 @@ That is - no resources should be allocated nor state updated to reflect that a mapping has been established, as the mapping may either be merged, or fail to be mapped after the callback is complete. +Mapped callback +--------------- + +If resources need to be allocated per-mapping, or state such as a reference +count needs to be manipulated, this should be done using the ``vm_ops->mapped`` +hook, which itself should be set by the >mmap_prepare hook. 
+ +This callback is only invoked if a new mapping has been established and was not +merged with any other, and is invoked at a point where no error may occur before +the mapping is established. + +You may return an error to the callback itself, which will cause the mapping to +become unmapped and an error returned to the mmap() caller. This is useful if +resources need to be allocated, and that allocation might fail. + How To Use ========== diff --git a/include/linux/fs.h b/include/linux/fs.h index a2628a12bd2b..c390f5c667e3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2059,13 +2059,20 @@ static inline bool can_mmap_file(struct file *file) } int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); +int __vma_check_mmap_hook(struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { + int err; + if (file->f_op->mmap_prepare) return compat_vma_mmap(file, vma); - return file->f_op->mmap(file, vma); + err = file->f_op->mmap(file, vma); + if (err) + return err; + + return __vma_check_mmap_hook(vma); } static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc) diff --git a/include/linux/mm.h b/include/linux/mm.h index 21a2eef5f8fe..81fbcfed44dd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -775,6 +775,23 @@ struct vm_operations_struct { * Context: User context. May sleep. Caller holds mmap_lock. */ void (*close)(struct vm_area_struct *vma); + /** + * @mapped: Called when the VMA is first mapped in the MM. Not called if + * the new VMA is merged with an adjacent VMA. + * + * The @vm_private_data field is an output field allowing the user to + * modify vma->vm_private_data as necessary. + * + * ONLY valid if set from f_op->mmap_prepare. Will result in an error if + * set from f_op->mmap. + * + * Returns %0 on success, or an error otherwise. On error, the VMA will + * be unmapped. + * + * Context: User context. May sleep. Caller holds mmap_lock. 
+ */ + int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff, + const struct file *file, void **vm_private_data); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *vma, unsigned long addr); int (*mremap)(struct vm_area_struct *vma); diff --git a/mm/util.c b/mm/util.c index e272efca8c0e..98fe67e59ec3 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1163,33 +1163,7 @@ void flush_dcache_folio(struct folio *folio) EXPORT_SYMBOL(flush_dcache_folio); #endif -/** - * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an - * existing VMA and execute any requested actions. - * @file: The file which possesss an f_op->mmap_prepare() hook. - * @vma: The VMA to apply the .mmap_prepare() hook to. - * - * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain - * stacked filesystems invoke a nested mmap hook of an underlying file. - * - * Until all filesystems are converted to use .mmap_prepare(), we must be - * conservative and continue to invoke these stacked filesystems using the - * deprecated .mmap() hook. - * - * However we have a problem if the underlying file system possesses an - * .mmap_prepare() hook, as we are in a different context when we invoke the - * .mmap() hook, already having a VMA to deal with. - * - * compat_vma_mmap() is a compatibility function that takes VMA state, - * establishes a struct vm_area_desc descriptor, passes to the underlying - * .mmap_prepare() hook and applies any changes performed by it. - * - * Once the conversion of filesystems is complete this function will no longer - * be required and will be removed. - * - * Returns: 0 on success or error. 
- */ -int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) +static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { .mm = vma->vm_mm, @@ -1221,8 +1195,49 @@ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) set_vma_from_desc(vma, &desc); return mmap_action_complete(vma, action); } + +/** + * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an + * existing VMA and execute any requested actions. + * @file: The file which possesss an f_op->mmap_prepare() hook. + * @vma: The VMA to apply the .mmap_prepare() hook to. + * + * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain + * stacked filesystems invoke a nested mmap hook of an underlying file. + * + * Until all filesystems are converted to use .mmap_prepare(), we must be + * conservative and continue to invoke these stacked filesystems using the + * deprecated .mmap() hook. + * + * However we have a problem if the underlying file system possesses an + * .mmap_prepare() hook, as we are in a different context when we invoke the + * .mmap() hook, already having a VMA to deal with. + * + * compat_vma_mmap() is a compatibility function that takes VMA state, + * establishes a struct vm_area_desc descriptor, passes to the underlying + * .mmap_prepare() hook and applies any changes performed by it. + * + * Once the conversion of filesystems is complete this function will no longer + * be required and will be removed. + * + * Returns: 0 on success or error. + */ +int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) +{ + return __compat_vma_mmap(file, vma); +} EXPORT_SYMBOL(compat_vma_mmap); +int __vma_check_mmap_hook(struct vm_area_struct *vma) +{ + /* vm_ops->mapped is not valid if mmap() is specified. 
*/ + if (vma->vm_ops && WARN_ON_ONCE(vma->vm_ops->mapped)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(__vma_check_mmap_hook); + static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, const struct page *page) { @@ -1311,11 +1326,32 @@ again: } } +static int call_vma_mapped(struct vm_area_struct *vma) +{ + const struct vm_operations_struct *vm_ops = vma->vm_ops; + void *vm_private_data = vma->vm_private_data; + int err; + + if (!vm_ops || !vm_ops->mapped) + return 0; + + err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_file, &vm_private_data); + if (err) + return err; + + if (vm_private_data != vma->vm_private_data) + vma->vm_private_data = vm_private_data; + return 0; +} + static int mmap_action_finish(struct vm_area_struct *vma, struct mmap_action *action, int err) { size_t len; + if (!err) + err = call_vma_mapped(vma); if (!err && action->success_hook) err = action->success_hook(vma); diff --git a/mm/vma.c b/mm/vma.c index e1950ae048e2..a43f3c5d4b3d 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2781,7 +2781,6 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, if (have_mmap_prepare && allocated_new) { error = mmap_action_complete(vma, &desc.action); - if (error) return error; } diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index a95a4b07f68b..1fb7bcae4f31 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -643,6 +643,23 @@ struct vm_operations_struct { * Context: User context. May sleep. Caller holds mmap_lock. */ void (*close)(struct vm_area_struct *vma); + /** + * @mapped: Called when the VMA is first mapped in the MM. Not called if + * the new VMA is merged with an adjacent VMA. + * + * The @vm_private_data field is an output field allowing the user to + * modify vma->vm_private_data as necessary. + * + * ONLY valid if set from f_op->mmap_prepare. Will result in an error if + * set from f_op->mmap. 
+ * + * Returns %0 on success, or an error otherwise. On error, the VMA will + * be unmapped. + * + * Context: User context. May sleep. Caller holds mmap_lock. + */ + int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff, + const struct file *file, void **vm_private_data); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *vma, unsigned long addr); int (*mremap)(struct vm_area_struct *vma); From fbfc6578eaca12daa0c09df1e9ba7f2c657b49da Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:35 +0000 Subject: [PATCH 342/369] fs: afs: revert mmap_prepare() change Partially reverts commit 9d5403b1036c ("fs: convert most other generic_file_*mmap() users to .mmap_prepare()"). This is because the .mmap invocation establishes a refcount, but .mmap_prepare is called at a point where a merge or an allocation failure might happen after the call, which would leak the refcount increment. Functionality is being added to permit the use of .mmap_prepare in this case, but in the interim, we need to fix this. Link: https://lkml.kernel.org/r/08804c94e39d9102a3a8fbd12385e8aa079ba1d3.1774045440.git.ljs@kernel.org Fixes: 9d5403b1036c ("fs: convert most other generic_file_*mmap() users to .mmap_prepare()") Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. 
Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Vlastimil Babka (SUSE) Cc: Wei Liu Cc: Signed-off-by: Andrew Morton --- fs/afs/file.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/afs/file.c b/fs/afs/file.c index f609366fd2ac..74d04af51ff4 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -19,7 +19,7 @@ #include #include "internal.h" -static int afs_file_mmap_prepare(struct vm_area_desc *desc); +static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, @@ -35,7 +35,7 @@ const struct file_operations afs_file_operations = { .llseek = generic_file_llseek, .read_iter = afs_file_read_iter, .write_iter = netfs_file_write_iter, - .mmap_prepare = afs_file_mmap_prepare, + .mmap = afs_file_mmap, .splice_read = afs_file_splice_read, .splice_write = iter_file_splice_write, .fsync = afs_fsync, @@ -492,16 +492,16 @@ static void afs_drop_open_mmap(struct afs_vnode *vnode) /* * Handle setting up a memory mapping on an AFS file. 
*/ -static int afs_file_mmap_prepare(struct vm_area_desc *desc) +static int afs_file_mmap(struct file *file, struct vm_area_struct *vma) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(desc->file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); int ret; afs_add_open_mmap(vnode); - ret = generic_file_mmap_prepare(desc); + ret = generic_file_mmap(file, vma); if (ret == 0) - desc->vm_ops = &afs_vm_ops; + vma->vm_ops = &afs_vm_ops; else afs_drop_open_mmap(vnode); return ret; From 4995c67d4ed32b78415d9ed9237808fdf6a40c6d Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:36 +0000 Subject: [PATCH 343/369] fs: afs: restore mmap_prepare implementation Commit 9d5403b1036c ("fs: convert most other generic_file_*mmap() users to .mmap_prepare()") updated AFS to use the mmap_prepare callback in favour of the deprecated mmap callback. However, it did not account for the fact that mmap_prepare is called pre-merge, and may then be merged, nor that mmap_prepare can fail to map due to an out of memory error. This change was therefore since reverted. Both of those are cases in which we should not be incrementing a reference count. With the newly added vm_ops->mapped callback available, we can simply defer this operation to that callback which is only invoked once the mapping is successfully in place (but not yet visible to userspace as the mmap and VMA write locks are held). This allows us to once again reimplement the .mmap_prepare implementation for this file system. Therefore add afs_mapped() to implement this callback for AFS, and remove the code doing so in afs_mmap_prepare(). Also update afs_vm_open(), afs_vm_close() and afs_vm_map_pages() to be consistent in how the vnode is accessed. 
Link: https://lkml.kernel.org/r/ad9a94350a9c7d2bdab79fc397ef0f64d3412d71.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- fs/afs/file.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/fs/afs/file.c b/fs/afs/file.c index 74d04af51ff4..85696ac984cc 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -19,7 +19,7 @@ #include #include "internal.h" -static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); +static int afs_file_mmap_prepare(struct vm_area_desc *desc); static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, @@ -28,6 +28,8 @@ static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, static void afs_vm_open(struct vm_area_struct *area); static void afs_vm_close(struct vm_area_struct *area); static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); +static int afs_mapped(unsigned long start, unsigned long end, pgoff_t pgoff, + const struct file *file, void **vm_private_data); const struct file_operations afs_file_operations = { .open = afs_open, @@ -35,7 +37,7 @@ const struct file_operations afs_file_operations = { .llseek = generic_file_llseek, .read_iter = afs_file_read_iter, .write_iter = netfs_file_write_iter, - 
.mmap = afs_file_mmap, + .mmap_prepare = afs_file_mmap_prepare, .splice_read = afs_file_splice_read, .splice_write = iter_file_splice_write, .fsync = afs_fsync, @@ -61,6 +63,7 @@ const struct address_space_operations afs_file_aops = { }; static const struct vm_operations_struct afs_vm_ops = { + .mapped = afs_mapped, .open = afs_vm_open, .close = afs_vm_close, .fault = filemap_fault, @@ -492,34 +495,47 @@ static void afs_drop_open_mmap(struct afs_vnode *vnode) /* * Handle setting up a memory mapping on an AFS file. */ -static int afs_file_mmap(struct file *file, struct vm_area_struct *vma) +static int afs_file_mmap_prepare(struct vm_area_desc *desc) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); int ret; - afs_add_open_mmap(vnode); + ret = generic_file_mmap_prepare(desc); + if (ret) + return ret; - ret = generic_file_mmap(file, vma); - if (ret == 0) - vma->vm_ops = &afs_vm_ops; - else - afs_drop_open_mmap(vnode); + desc->vm_ops = &afs_vm_ops; return ret; } +static int afs_mapped(unsigned long start, unsigned long end, pgoff_t pgoff, + const struct file *file, void **vm_private_data) +{ + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + + afs_add_open_mmap(vnode); + return 0; +} + static void afs_vm_open(struct vm_area_struct *vma) { - afs_add_open_mmap(AFS_FS_I(file_inode(vma->vm_file))); + struct file *file = vma->vm_file; + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + + afs_add_open_mmap(vnode); } static void afs_vm_close(struct vm_area_struct *vma) { - afs_drop_open_mmap(AFS_FS_I(file_inode(vma->vm_file))); + struct file *file = vma->vm_file; + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + + afs_drop_open_mmap(vnode); } static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file)); + struct file *file = vmf->vma->vm_file; + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); if (afs_check_validity(vnode)) return 
filemap_map_pages(vmf, start_pgoff, end_pgoff); From a1b7fb40cb71a33c68a609fcee0946425d698415 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:37 +0000 Subject: [PATCH 344/369] mm: add mmap_action_simple_ioremap() Currently drivers use vm_iomap_memory() as a simple helper function for I/O remapping memory over a range starting at a specified physical address over a specified length. In order to utilise this from mmap_prepare, separate out the core logic into __simple_ioremap_prep(), update vm_iomap_memory() to use it, and add simple_ioremap_prepare() to do the same with a VMA descriptor object. We also add MMAP_SIMPLE_IO_REMAP and relevant fields to the struct mmap_action type to permit this operation also. We use mmap_action_ioremap() to set up the actual I/O remap operation once we have checked and figured out the parameters, which makes simple_ioremap_prepare() easy to implement. We then add mmap_action_simple_ioremap() to allow drivers to make use of this mode. We update the mmap_prepare documentation to describe this mode. Finally, we update the VMA tests to reflect this change. Link: https://lkml.kernel.org/r/a08ef1c4542202684da63bb37f459d5dbbeddd91.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. 
Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- Documentation/filesystems/mmap_prepare.rst | 3 + include/linux/mm.h | 24 +++++- include/linux/mm_types.h | 6 +- mm/internal.h | 1 + mm/memory.c | 85 +++++++++++++++------- mm/util.c | 5 ++ tools/testing/vma/include/dup.h | 6 +- 7 files changed, 102 insertions(+), 28 deletions(-) diff --git a/Documentation/filesystems/mmap_prepare.rst b/Documentation/filesystems/mmap_prepare.rst index f14b35ee11d5..14bb057be564 100644 --- a/Documentation/filesystems/mmap_prepare.rst +++ b/Documentation/filesystems/mmap_prepare.rst @@ -153,5 +153,8 @@ pointer. These are: * mmap_action_ioremap_full() - Same as mmap_action_ioremap(), only remaps the entire mapping from ``start_pfn`` onward. +* mmap_action_simple_ioremap() - Sets up an I/O remap from a specified + physical address and over a specified length. + **NOTE:** The ``action`` field should never normally be manipulated directly, rather you ought to use one of these helpers. diff --git a/include/linux/mm.h b/include/linux/mm.h index 81fbcfed44dd..53b21de40f87 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4321,11 +4321,33 @@ static inline void mmap_action_ioremap(struct vm_area_desc *desc, * @start_pfn: The first PFN in the range to remap. */ static inline void mmap_action_ioremap_full(struct vm_area_desc *desc, - unsigned long start_pfn) + unsigned long start_pfn) { mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc)); } +/** + * mmap_action_simple_ioremap - helper for mmap_prepare hook to specify that the + * physical range in [start_phys_addr, start_phys_addr + size) should be I/O + * remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_phys_addr: Start of the physical memory to be mapped. + * @size: Size of the area to map. 
+ * + * NOTE: Some drivers might want to tweak desc->page_prot for purposes of + * write-combine or similar. + */ +static inline void mmap_action_simple_ioremap(struct vm_area_desc *desc, + phys_addr_t start_phys_addr, + unsigned long size) +{ + struct mmap_action *action = &desc->action; + + action->simple_ioremap.start_phys_addr = start_phys_addr; + action->simple_ioremap.size = size; + action->type = MMAP_SIMPLE_IO_REMAP; +} + int mmap_action_prepare(struct vm_area_desc *desc); int mmap_action_complete(struct vm_area_struct *vma, struct mmap_action *action); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 38fe6b915024..91a3db174d78 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -814,6 +814,7 @@ enum mmap_action_type { MMAP_NOTHING, /* Mapping is complete, no further action. */ MMAP_REMAP_PFN, /* Remap PFN range. */ MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ + MMAP_SIMPLE_IO_REMAP, /* I/O remap with guardrails. */ }; /* @@ -822,13 +823,16 @@ enum mmap_action_type { */ struct mmap_action { union { - /* Remap range. 
*/ struct { unsigned long start; unsigned long start_pfn; unsigned long size; pgprot_t pgprot; } remap; + struct { + phys_addr_t start_phys_addr; + unsigned long size; + } simple_ioremap; }; enum mmap_action_type type; diff --git a/mm/internal.h b/mm/internal.h index 241510e21f4b..c693646e5b3f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1842,6 +1842,7 @@ int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); int remap_pfn_range_prepare(struct vm_area_desc *desc); int remap_pfn_range_complete(struct vm_area_struct *vma, struct mmap_action *action); +int simple_ioremap_prepare(struct vm_area_desc *desc); static inline int io_remap_pfn_range_prepare(struct vm_area_desc *desc) { diff --git a/mm/memory.c b/mm/memory.c index 10a61dd81f97..c1c323512939 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3170,6 +3170,58 @@ int remap_pfn_range_complete(struct vm_area_struct *vma, return do_remap_pfn_range(vma, start, pfn, size, prot); } +static int __simple_ioremap_prep(unsigned long vm_len, pgoff_t vm_pgoff, + phys_addr_t start_phys, unsigned long size, + unsigned long *pfnp) +{ + unsigned long pfn, pages; + + /* Check that the physical memory area passed in looks valid */ + if (start_phys + size < start_phys) + return -EINVAL; + /* + * You *really* shouldn't map things that aren't page-aligned, + * but we've historically allowed it because IO memory might + * just have smaller alignment. + */ + size += start_phys & ~PAGE_MASK; + pfn = start_phys >> PAGE_SHIFT; + pages = (size + ~PAGE_MASK) >> PAGE_SHIFT; + if (pfn + pages < pfn) + return -EINVAL; + + /* We start the mapping 'vm_pgoff' pages into the area */ + if (vm_pgoff > pages) + return -EINVAL; + pfn += vm_pgoff; + pages -= vm_pgoff; + + /* Can we fit all of the mapping? 
*/ + if ((vm_len >> PAGE_SHIFT) > pages) + return -EINVAL; + + *pfnp = pfn; + return 0; +} + +int simple_ioremap_prepare(struct vm_area_desc *desc) +{ + struct mmap_action *action = &desc->action; + const phys_addr_t start = action->simple_ioremap.start_phys_addr; + const unsigned long size = action->simple_ioremap.size; + unsigned long pfn; + int err; + + err = __simple_ioremap_prep(vma_desc_size(desc), desc->pgoff, + start, size, &pfn); + if (err) + return err; + + /* The I/O remap logic does the heavy lifting. */ + mmap_action_ioremap_full(desc, pfn); + return io_remap_pfn_range_prepare(desc); +} + /** * vm_iomap_memory - remap memory to userspace * @vma: user vma to map to @@ -3187,32 +3239,15 @@ int remap_pfn_range_complete(struct vm_area_struct *vma, */ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) { - unsigned long vm_len, pfn, pages; + const unsigned long vm_start = vma->vm_start; + const unsigned long vm_end = vma->vm_end; + const unsigned long vm_len = vm_end - vm_start; + unsigned long pfn; + int err; - /* Check that the physical memory area passed in looks valid */ - if (start + len < start) - return -EINVAL; - /* - * You *really* shouldn't map things that aren't page-aligned, - * but we've historically allowed it because IO memory might - * just have smaller alignment. - */ - len += start & ~PAGE_MASK; - pfn = start >> PAGE_SHIFT; - pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; - if (pfn + pages < pfn) - return -EINVAL; - - /* We start the mapping 'vm_pgoff' pages into the area */ - if (vma->vm_pgoff > pages) - return -EINVAL; - pfn += vma->vm_pgoff; - pages -= vma->vm_pgoff; - - /* Can we fit all of the mapping? 
*/ - vm_len = vma->vm_end - vma->vm_start; - if (vm_len >> PAGE_SHIFT > pages) - return -EINVAL; + err = __simple_ioremap_prep(vm_len, vma->vm_pgoff, start, len, &pfn); + if (err) + return err; /* Ok, let it rip */ return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); diff --git a/mm/util.c b/mm/util.c index 98fe67e59ec3..9a27d33273fd 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1393,6 +1393,8 @@ int mmap_action_prepare(struct vm_area_desc *desc) return remap_pfn_range_prepare(desc); case MMAP_IO_REMAP_PFN: return io_remap_pfn_range_prepare(desc); + case MMAP_SIMPLE_IO_REMAP: + return simple_ioremap_prepare(desc); } WARN_ON_ONCE(1); @@ -1421,6 +1423,7 @@ int mmap_action_complete(struct vm_area_struct *vma, err = remap_pfn_range_complete(vma, action); break; case MMAP_IO_REMAP_PFN: + case MMAP_SIMPLE_IO_REMAP: /* Should have been delegated. */ WARN_ON_ONCE(1); err = -EINVAL; @@ -1438,6 +1441,7 @@ int mmap_action_prepare(struct vm_area_desc *desc) break; case MMAP_REMAP_PFN: case MMAP_IO_REMAP_PFN: + case MMAP_SIMPLE_IO_REMAP: WARN_ON_ONCE(1); /* nommu cannot handle these. */ break; } @@ -1456,6 +1460,7 @@ int mmap_action_complete(struct vm_area_struct *vma, break; case MMAP_REMAP_PFN: case MMAP_IO_REMAP_PFN: + case MMAP_SIMPLE_IO_REMAP: WARN_ON_ONCE(1); /* nommu cannot handle this. */ err = -EINVAL; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 1fb7bcae4f31..b31207bbe10d 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -453,6 +453,7 @@ enum mmap_action_type { MMAP_NOTHING, /* Mapping is complete, no further action. */ MMAP_REMAP_PFN, /* Remap PFN range. */ MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ + MMAP_SIMPLE_IO_REMAP, /* I/O remap with guardrails. */ }; /* @@ -461,13 +462,16 @@ enum mmap_action_type { */ struct mmap_action { union { - /* Remap range. 
*/ struct { unsigned long start; unsigned long start_pfn; unsigned long size; pgprot_t pgprot; } remap; + struct { + phys_addr_t start_phys_addr; + unsigned long size; + } simple_ioremap; }; enum mmap_action_type type; From d8bc7934db0cb96452899ec93887e3ca6f541899 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:38 +0000 Subject: [PATCH 345/369] misc: open-dice: replace deprecated mmap hook with mmap_prepare The f_op->mmap interface is deprecated, so update driver to use its successor, mmap_prepare. The driver previously used vm_iomap_memory(), so this change replaces it with its mmap_prepare equivalent, mmap_action_simple_ioremap(). Link: https://lkml.kernel.org/r/5a83ab00195dc8d0609fa6cc525493010ac4ead1.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- drivers/misc/open-dice.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/misc/open-dice.c b/drivers/misc/open-dice.c index 24c29e0f00ef..45060fb4ea27 100644 --- a/drivers/misc/open-dice.c +++ b/drivers/misc/open-dice.c @@ -86,29 +86,32 @@ static ssize_t open_dice_write(struct file *filp, const char __user *ptr, /* * Creates a mapping of the reserved memory region in user address space. 
*/ -static int open_dice_mmap(struct file *filp, struct vm_area_struct *vma) +static int open_dice_mmap_prepare(struct vm_area_desc *desc) { + struct file *filp = desc->file; struct open_dice_drvdata *drvdata = to_open_dice_drvdata(filp); - if (vma->vm_flags & VM_MAYSHARE) { + if (vma_desc_test(desc, VMA_MAYSHARE_BIT)) { /* Do not allow userspace to modify the underlying data. */ - if (vma->vm_flags & VM_WRITE) + if (vma_desc_test(desc, VMA_WRITE_BIT)) return -EPERM; /* Ensure userspace cannot acquire VM_WRITE later. */ - vm_flags_clear(vma, VM_MAYWRITE); + vma_desc_clear_flags(desc, VMA_MAYWRITE_BIT); } /* Create write-combine mapping so all clients observe a wipe. */ - vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); - vm_flags_set(vma, VM_DONTCOPY | VM_DONTDUMP); - return vm_iomap_memory(vma, drvdata->rmem->base, drvdata->rmem->size); + desc->page_prot = pgprot_writecombine(desc->page_prot); + vma_desc_set_flags(desc, VMA_DONTCOPY_BIT, VMA_DONTDUMP_BIT); + mmap_action_simple_ioremap(desc, drvdata->rmem->base, + drvdata->rmem->size); + return 0; } static const struct file_operations open_dice_fops = { .owner = THIS_MODULE, .read = open_dice_read, .write = open_dice_write, - .mmap = open_dice_mmap, + .mmap_prepare = open_dice_mmap_prepare, }; static int __init open_dice_probe(struct platform_device *pdev) From 10de8b811eb4a770403014a4d7b7e9ff94ddc6a6 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:39 +0000 Subject: [PATCH 346/369] hpet: replace deprecated mmap hook with mmap_prepare The f_op->mmap interface is deprecated, so update driver to use its successor, mmap_prepare. The driver previously used vm_iomap_memory(), so this change replaces it with its mmap_prepare equivalent, mmap_action_simple_ioremap(). 
Link: https://lkml.kernel.org/r/094c5fcfb2459a4f6d791b1fb852b01e252a44d4.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- drivers/char/hpet.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 60dd09a56f50..8f128cc40147 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -354,8 +354,9 @@ static __init int hpet_mmap_enable(char *str) } __setup("hpet_mmap=", hpet_mmap_enable); -static int hpet_mmap(struct file *file, struct vm_area_struct *vma) +static int hpet_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; struct hpet_dev *devp; unsigned long addr; @@ -368,11 +369,12 @@ static int hpet_mmap(struct file *file, struct vm_area_struct *vma) if (addr & (PAGE_SIZE - 1)) return -ENOSYS; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - return vm_iomap_memory(vma, addr, PAGE_SIZE); + desc->page_prot = pgprot_noncached(desc->page_prot); + mmap_action_simple_ioremap(desc, addr, PAGE_SIZE); + return 0; } #else -static int hpet_mmap(struct file *file, struct vm_area_struct *vma) +static int hpet_mmap_prepare(struct vm_area_desc *desc) { return -ENOSYS; } @@ -710,7 +712,7 @@ static const struct file_operations hpet_fops = { .open = hpet_open, .release = hpet_release, .fasync = hpet_fasync, - .mmap = hpet_mmap, + 
.mmap_prepare = hpet_mmap_prepare, }; static int hpet_is_known(struct hpet_data *hdp) From 0858653748eec59c1133fafc37b2bb7f6b20b6b4 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:40 +0000 Subject: [PATCH 347/369] mtdchar: replace deprecated mmap hook with mmap_prepare, clean up Replace the deprecated mmap callback with mmap_prepare. Commit f5cf8f07423b ("mtd: Disable mtdchar mmap on MMU systems") commented out the CONFIG_MMU part of this function back in 2012, so after ~14 years it's probably reasonable to remove this altogether rather than updating dead code. Link: https://lkml.kernel.org/r/d036855c21962c58ace0eb24ecd6d973d77424fe.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Richard Weinberger Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. 
Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- drivers/mtd/mtdchar.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 55a43682c567..bf01e6ac7293 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -1376,27 +1376,12 @@ static unsigned mtdchar_mmap_capabilities(struct file *file) /* * set up a mapping for shared memory segments */ -static int mtdchar_mmap(struct file *file, struct vm_area_struct *vma) +static int mtdchar_mmap_prepare(struct vm_area_desc *desc) { #ifdef CONFIG_MMU - struct mtd_file_info *mfi = file->private_data; - struct mtd_info *mtd = mfi->mtd; - struct map_info *map = mtd->priv; - - /* This is broken because it assumes the MTD device is map-based - and that mtd->priv is a valid struct map_info. It should be - replaced with something that uses the mtd_get_unmapped_area() - operation properly. */ - if (0 /*mtd->type == MTD_RAM || mtd->type == MTD_ROM*/) { -#ifdef pgprot_noncached - if (file->f_flags & O_DSYNC || map->phys >= __pa(high_memory)) - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); -#endif - return vm_iomap_memory(vma, map->phys, map->size); - } return -ENODEV; #else - return vma->vm_flags & VM_SHARED ? 0 : -EACCES; + return vma_desc_test(desc, VMA_SHARED_BIT) ? 
0 : -EACCES; #endif } @@ -1411,7 +1396,7 @@ static const struct file_operations mtd_fops = { #endif .open = mtdchar_open, .release = mtdchar_close, - .mmap = mtdchar_mmap, + .mmap_prepare = mtdchar_mmap_prepare, #ifndef CONFIG_MMU .get_unmapped_area = mtdchar_get_unmapped_area, .mmap_capabilities = mtdchar_mmap_capabilities, From b0085cb94d2477787f4197a3000e33aabf3d48fc Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:41 +0000 Subject: [PATCH 348/369] stm: replace deprecated mmap hook with mmap_prepare The f_op->mmap interface is deprecated, so update driver to use its successor, mmap_prepare. The driver previously used vm_iomap_memory(), so this change replaces it with its mmap_prepare equivalent, mmap_action_simple_ioremap(). Also, in order to correctly maintain reference counting, add a vm_ops->mapped callback to increment the reference count when successfully mapped. Link: https://lkml.kernel.org/r/9f3d559a264a83cf45518fcf35cc7ef1d7dfd500.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. 
Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- drivers/hwtracing/stm/core.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/hwtracing/stm/core.c b/drivers/hwtracing/stm/core.c index 37584e786bb5..f48c6a8a0654 100644 --- a/drivers/hwtracing/stm/core.c +++ b/drivers/hwtracing/stm/core.c @@ -666,6 +666,16 @@ static ssize_t stm_char_write(struct file *file, const char __user *buf, return count; } +static int stm_mmap_mapped(unsigned long start, unsigned long end, pgoff_t pgoff, + const struct file *file, void **vm_private_data) +{ + struct stm_file *stmf = file->private_data; + struct stm_device *stm = stmf->stm; + + pm_runtime_get_sync(&stm->dev); + return 0; +} + static void stm_mmap_open(struct vm_area_struct *vma) { struct stm_file *stmf = vma->vm_file->private_data; @@ -684,12 +694,14 @@ static void stm_mmap_close(struct vm_area_struct *vma) } static const struct vm_operations_struct stm_mmap_vmops = { + .mapped = stm_mmap_mapped, .open = stm_mmap_open, .close = stm_mmap_close, }; -static int stm_char_mmap(struct file *file, struct vm_area_struct *vma) +static int stm_char_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; struct stm_file *stmf = file->private_data; struct stm_device *stm = stmf->stm; unsigned long size, phys; @@ -697,10 +709,10 @@ static int stm_char_mmap(struct file *file, struct vm_area_struct *vma) if (!stm->data->mmio_addr) return -EOPNOTSUPP; - if (vma->vm_pgoff) + if (desc->pgoff) return -EINVAL; - size = vma->vm_end - vma->vm_start; + size = vma_desc_size(desc); if (stmf->output.nr_chans * stm->data->sw_mmiosz != size) return -EINVAL; @@ -712,13 +724,12 @@ static int stm_char_mmap(struct file *file, struct vm_area_struct *vma) if (!phys) return -EINVAL; - pm_runtime_get_sync(&stm->dev); - - 
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); - vma->vm_ops = &stm_mmap_vmops; - vm_iomap_memory(vma, phys, size); + desc->page_prot = pgprot_noncached(desc->page_prot); + vma_desc_set_flags(desc, VMA_IO_BIT, VMA_DONTEXPAND_BIT, + VMA_DONTDUMP_BIT); + desc->vm_ops = &stm_mmap_vmops; + mmap_action_simple_ioremap(desc, phys, size); return 0; } @@ -836,7 +847,7 @@ static const struct file_operations stm_fops = { .open = stm_char_open, .release = stm_char_release, .write = stm_char_write, - .mmap = stm_char_mmap, + .mmap_prepare = stm_char_mmap_prepare, .unlocked_ioctl = stm_char_ioctl, .compat_ioctl = compat_ptr_ioctl, }; From 14beec0344954b8a84cb1575ae18667fe9205080 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:42 +0000 Subject: [PATCH 349/369] staging: vme_user: replace deprecated mmap hook with mmap_prepare The f_op->mmap interface is deprecated, so update driver to use its successor, mmap_prepare. The driver previously used vm_iomap_memory(), so this change replaces it with its mmap_prepare equivalent, mmap_action_simple_ioremap(). Functions that wrap mmap() are also converted to wrap mmap_prepare() instead. Also update the documentation accordingly. Link: https://lkml.kernel.org/r/08ecc1e1d319564fd49b9e9012f994edaff921db.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. 
Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- Documentation/driver-api/vme.rst | 2 +- drivers/staging/vme_user/vme.c | 20 +++++------ drivers/staging/vme_user/vme.h | 2 +- drivers/staging/vme_user/vme_user.c | 51 +++++++++++++++++------------ 4 files changed, 42 insertions(+), 33 deletions(-) diff --git a/Documentation/driver-api/vme.rst b/Documentation/driver-api/vme.rst index c0b475369de0..7111999abc14 100644 --- a/Documentation/driver-api/vme.rst +++ b/Documentation/driver-api/vme.rst @@ -107,7 +107,7 @@ The function :c:func:`vme_master_read` can be used to read from and In addition to simple reads and writes, :c:func:`vme_master_rmw` is provided to do a read-modify-write transaction. Parts of a VME window can also be mapped -into user space memory using :c:func:`vme_master_mmap`. +into user space memory using :c:func:`vme_master_mmap_prepare`. Slave windows diff --git a/drivers/staging/vme_user/vme.c b/drivers/staging/vme_user/vme.c index f10a00c05f12..7220aba7b919 100644 --- a/drivers/staging/vme_user/vme.c +++ b/drivers/staging/vme_user/vme.c @@ -735,9 +735,9 @@ unsigned int vme_master_rmw(struct vme_resource *resource, unsigned int mask, EXPORT_SYMBOL(vme_master_rmw); /** - * vme_master_mmap - Mmap region of VME master window. + * vme_master_mmap_prepare - Mmap region of VME master window. * @resource: Pointer to VME master resource. - * @vma: Pointer to definition of user mapping. + * @desc: Pointer to descriptor of user mapping. * * Memory map a region of the VME master window into user space. * @@ -745,12 +745,13 @@ EXPORT_SYMBOL(vme_master_rmw); * resource or -EFAULT if map exceeds window size. Other generic mmap * errors may also be returned. 
*/ -int vme_master_mmap(struct vme_resource *resource, struct vm_area_struct *vma) +int vme_master_mmap_prepare(struct vme_resource *resource, + struct vm_area_desc *desc) { + const unsigned long vma_size = vma_desc_size(desc); struct vme_bridge *bridge = find_bridge(resource); struct vme_master_resource *image; phys_addr_t phys_addr; - unsigned long vma_size; if (resource->type != VME_MASTER) { dev_err(bridge->parent, "Not a master resource\n"); @@ -758,19 +759,18 @@ int vme_master_mmap(struct vme_resource *resource, struct vm_area_struct *vma) } image = list_entry(resource->entry, struct vme_master_resource, list); - phys_addr = image->bus_resource.start + (vma->vm_pgoff << PAGE_SHIFT); - vma_size = vma->vm_end - vma->vm_start; + phys_addr = image->bus_resource.start + (desc->pgoff << PAGE_SHIFT); if (phys_addr + vma_size > image->bus_resource.end + 1) { dev_err(bridge->parent, "Map size cannot exceed the window size\n"); return -EFAULT; } - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - return vm_iomap_memory(vma, phys_addr, vma->vm_end - vma->vm_start); + desc->page_prot = pgprot_noncached(desc->page_prot); + mmap_action_simple_ioremap(desc, phys_addr, vma_size); + return 0; } -EXPORT_SYMBOL(vme_master_mmap); +EXPORT_SYMBOL(vme_master_mmap_prepare); /** * vme_master_free - Free VME master window diff --git a/drivers/staging/vme_user/vme.h b/drivers/staging/vme_user/vme.h index 797e9940fdd1..b6413605ea49 100644 --- a/drivers/staging/vme_user/vme.h +++ b/drivers/staging/vme_user/vme.h @@ -151,7 +151,7 @@ ssize_t vme_master_read(struct vme_resource *resource, void *buf, size_t count, ssize_t vme_master_write(struct vme_resource *resource, void *buf, size_t count, loff_t offset); unsigned int vme_master_rmw(struct vme_resource *resource, unsigned int mask, unsigned int compare, unsigned int swap, loff_t offset); -int vme_master_mmap(struct vme_resource *resource, struct vm_area_struct *vma); +int vme_master_mmap_prepare(struct vme_resource *resource, 
struct vm_area_desc *desc); void vme_master_free(struct vme_resource *resource); struct vme_resource *vme_dma_request(struct vme_dev *vdev, u32 route); diff --git a/drivers/staging/vme_user/vme_user.c b/drivers/staging/vme_user/vme_user.c index d95dd7d9190a..11e25c2f6b0a 100644 --- a/drivers/staging/vme_user/vme_user.c +++ b/drivers/staging/vme_user/vme_user.c @@ -446,24 +446,14 @@ static void vme_user_vm_close(struct vm_area_struct *vma) kfree(vma_priv); } -static const struct vm_operations_struct vme_user_vm_ops = { - .open = vme_user_vm_open, - .close = vme_user_vm_close, -}; - -static int vme_user_master_mmap(unsigned int minor, struct vm_area_struct *vma) +static int vme_user_vm_mapped(unsigned long start, unsigned long end, pgoff_t pgoff, + const struct file *file, void **vm_private_data) { - int err; + const unsigned int minor = iminor(file_inode(file)); struct vme_user_vma_priv *vma_priv; mutex_lock(&image[minor].mutex); - err = vme_master_mmap(image[minor].resource, vma); - if (err) { - mutex_unlock(&image[minor].mutex); - return err; - } - vma_priv = kmalloc_obj(*vma_priv); if (!vma_priv) { mutex_unlock(&image[minor].mutex); @@ -472,22 +462,41 @@ static int vme_user_master_mmap(unsigned int minor, struct vm_area_struct *vma) vma_priv->minor = minor; refcount_set(&vma_priv->refcnt, 1); - vma->vm_ops = &vme_user_vm_ops; - vma->vm_private_data = vma_priv; - + *vm_private_data = vma_priv; image[minor].mmap_count++; mutex_unlock(&image[minor].mutex); - return 0; } -static int vme_user_mmap(struct file *file, struct vm_area_struct *vma) +static const struct vm_operations_struct vme_user_vm_ops = { + .mapped = vme_user_vm_mapped, + .open = vme_user_vm_open, + .close = vme_user_vm_close, +}; + +static int vme_user_master_mmap_prepare(unsigned int minor, + struct vm_area_desc *desc) { - unsigned int minor = iminor(file_inode(file)); + int err; + + mutex_lock(&image[minor].mutex); + + err = vme_master_mmap_prepare(image[minor].resource, desc); + if (!err) + 
desc->vm_ops = &vme_user_vm_ops; + + mutex_unlock(&image[minor].mutex); + return err; +} + +static int vme_user_mmap_prepare(struct vm_area_desc *desc) +{ + const struct file *file = desc->file; + const unsigned int minor = iminor(file_inode(file)); if (type[minor] == MASTER_MINOR) - return vme_user_master_mmap(minor, vma); + return vme_user_master_mmap_prepare(minor, desc); return -ENODEV; } @@ -498,7 +507,7 @@ static const struct file_operations vme_user_fops = { .llseek = vme_user_llseek, .unlocked_ioctl = vme_user_unlocked_ioctl, .compat_ioctl = compat_ptr_ioctl, - .mmap = vme_user_mmap, + .mmap_prepare = vme_user_mmap_prepare, }; static int vme_user_match(struct vme_dev *vdev) From 668937b7b2256f4b2a982e8f69b07d9ee8f81d36 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:43 +0000 Subject: [PATCH 350/369] mm: allow handling of stacked mmap_prepare hooks in more drivers While the conversion of mmap hooks to mmap_prepare is underway, we will encounter situations where mmap hooks need to invoke nested mmap_prepare hooks. The nesting of mmap hooks is termed 'stacking'. In order to flexibly facilitate the conversion of custom mmap hooks in drivers which stack, we must split up the existing __compat_vma_mmap() function into two separate functions: * compat_set_desc_from_vma() - This allows the setting of a vm_area_desc object's fields to the relevant fields of a VMA. * __compat_vma_mmap() - Once an mmap_prepare hook has been executed upon a vm_area_desc object, this function performs any mmap actions specified by the mmap_prepare hook and then invokes its vm_ops->mapped() hook if any were specified. In ordinary cases, where a file's f_op->mmap_prepare() hook simply needs to be invoked in a stacked mmap() hook, compat_vma_mmap() can be used. However some drivers define their own nested hooks, which are invoked in turn by another hook. 
A concrete example is vmbus_channel->mmap_ring_buffer(), which is invoked in turn by bin_attribute->mmap(): vmbus_channel->mmap_ring_buffer() has a signature of: int (*mmap_ring_buffer)(struct vmbus_channel *channel, struct vm_area_struct *vma); And bin_attribute->mmap() has a signature of: int (*mmap)(struct file *, struct kobject *, const struct bin_attribute *attr, struct vm_area_struct *vma); And so compat_vma_mmap() cannot be used here for incremental conversion of hooks from mmap() to mmap_prepare(). There are many such instances like this, where conversion to mmap_prepare would otherwise cascade to a huge change set due to nesting of this kind. The changes in this patch mean we could now instead convert vmbus_channel->mmap_ring_buffer() to vmbus_channel->mmap_prepare_ring_buffer(), and implement something like: struct vm_area_desc desc; int err; compat_set_desc_from_vma(&desc, file, vma); err = channel->mmap_prepare_ring_buffer(channel, &desc); if (err) return err; return __compat_vma_mmap(&desc, vma); Allowing us to incrementally update this logic, and other logic like it. Unfortunately, as part of this change, we need to be able to flexibly assign to the VMA descriptor, so have to remove some of the const declarations within the structure. Also update the VMA tests to reflect the changes. Link: https://lkml.kernel.org/r/24aac3019dd34740e788d169fccbe3c62781e648.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. 
Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- include/linux/fs.h | 3 + include/linux/mm_types.h | 4 +- mm/util.c | 113 +++++++++++++++++++++++--------- mm/vma.h | 2 +- tools/testing/vma/include/dup.h | 70 +++++++++++++------- 5 files changed, 134 insertions(+), 58 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index c390f5c667e3..0bdccfa70b44 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2058,6 +2058,9 @@ static inline bool can_mmap_file(struct file *file) return true; } +void compat_set_desc_from_vma(struct vm_area_desc *desc, const struct file *file, + const struct vm_area_struct *vma); +int __compat_vma_mmap(struct vm_area_desc *desc, struct vm_area_struct *vma); int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); int __vma_check_mmap_hook(struct vm_area_struct *vma); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 91a3db174d78..b702c63bf0e0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -891,8 +891,8 @@ static __always_inline bool vma_flags_empty(const vma_flags_t *flags) */ struct vm_area_desc { /* Immutable state. */ - const struct mm_struct *const mm; - struct file *const file; /* May vary from vm_file in stacked callers. */ + struct mm_struct *mm; + struct file *file; /* May vary from vm_file in stacked callers. */ unsigned long start; unsigned long end; diff --git a/mm/util.c b/mm/util.c index 9a27d33273fd..5ae20876ef2c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1163,38 +1163,78 @@ void flush_dcache_folio(struct folio *folio) EXPORT_SYMBOL(flush_dcache_folio); #endif -static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma) +/** + * compat_set_desc_from_vma() - assigns VMA descriptor @desc fields from a VMA. 
+ * @desc: A VMA descriptor whose fields need to be set. + * @file: The file object describing the file being mmap()'d. + * @vma: The VMA whose fields we wish to assign to @desc. + * + * This is a compatibility function to allow an mmap() hook to call + * mmap_prepare() hooks when drivers nest these. This function specifically + * allows the construction of a vm_area_desc value, @desc, from a VMA @vma for + * the purposes of doing this. + * + * Once the conversion of drivers is complete this function will no longer be + * required and will be removed. + */ +void compat_set_desc_from_vma(struct vm_area_desc *desc, + const struct file *file, + const struct vm_area_struct *vma) { - struct vm_area_desc desc = { - .mm = vma->vm_mm, - .file = file, - .start = vma->vm_start, - .end = vma->vm_end, + memset(desc, 0, sizeof(*desc)); - .pgoff = vma->vm_pgoff, - .vm_file = vma->vm_file, - .vma_flags = vma->flags, - .page_prot = vma->vm_page_prot, + desc->mm = vma->vm_mm; + desc->file = (struct file *)file; + desc->start = vma->vm_start; + desc->end = vma->vm_end; - .action.type = MMAP_NOTHING, /* Default */ - }; - struct mmap_action *action = &desc.action; + desc->pgoff = vma->vm_pgoff; + desc->vm_file = vma->vm_file; + desc->vma_flags = vma->flags; + desc->page_prot = vma->vm_page_prot; + + /* Default. */ + desc->action.type = MMAP_NOTHING; +} +EXPORT_SYMBOL(compat_set_desc_from_vma); + +/** + * __compat_vma_mmap() - Similar to compat_vma_mmap(), only it allows + * flexibility as to how the mmap_prepare callback is invoked, which is useful + * for drivers which invoke nested mmap_prepare callbacks in an mmap() hook. + * @desc: A VMA descriptor upon which an mmap_prepare() hook has already been + * executed. + * @vma: The VMA to which @desc should be applied. + * + * The function assumes that you have obtained a VMA descriptor @desc from + * compat_set_desc_from_vma(), and already executed the mmap_prepare() hook upon + * it. 
+ * + * It then performs any specified mmap actions, and invokes the vm_ops->mapped() + * hook if one is present. + * + * See the description of compat_vma_mmap() for more details. + * + * Once the conversion of drivers is complete this function will no longer be + * required and will be removed. + * + * Returns: 0 on success or error. + */ +int __compat_vma_mmap(struct vm_area_desc *desc, + struct vm_area_struct *vma) +{ int err; - err = vfs_mmap_prepare(file, &desc); + /* Perform any preparatory tasks for mmap action. */ + err = mmap_action_prepare(desc); if (err) return err; - - err = mmap_action_prepare(&desc); - if (err) - return err; - - /* being invoked from .mmap means we don't have to enforce this. */ - action->hide_from_rmap_until_complete = false; - - set_vma_from_desc(vma, &desc); - return mmap_action_complete(vma, action); + /* Update the VMA from the descriptor. */ + compat_set_vma_from_desc(vma, desc); + /* Complete any specified mmap actions. */ + return mmap_action_complete(vma, &desc->action); } +EXPORT_SYMBOL(__compat_vma_mmap); /** * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an @@ -1203,10 +1243,10 @@ static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma) * @vma: The VMA to apply the .mmap_prepare() hook to. * * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain - * stacked filesystems invoke a nested mmap hook of an underlying file. + * stacked drivers invoke a nested mmap hook of an underlying file. * - * Until all filesystems are converted to use .mmap_prepare(), we must be - * conservative and continue to invoke these stacked filesystems using the + * Until all drivers are converted to use .mmap_prepare(), we must be + * conservative and continue to invoke these stacked drivers using the * deprecated .mmap() hook. 
* * However we have a problem if the underlying file system possesses an @@ -1217,14 +1257,27 @@ static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma) * establishes a struct vm_area_desc descriptor, passes to the underlying * .mmap_prepare() hook and applies any changes performed by it. * - * Once the conversion of filesystems is complete this function will no longer - * be required and will be removed. + * Once the conversion of drivers is complete this function will no longer be + * required and will be removed. * * Returns: 0 on success or error. */ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { - return __compat_vma_mmap(file, vma); + struct vm_area_desc desc; + struct mmap_action *action; + int err; + + compat_set_desc_from_vma(&desc, file, vma); + err = vfs_mmap_prepare(file, &desc); + if (err) + return err; + action = &desc.action; + + /* being invoked from .mmap means we don't have to enforce this. */ + action->hide_from_rmap_until_complete = false; + + return __compat_vma_mmap(&desc, vma); } EXPORT_SYMBOL(compat_vma_mmap); diff --git a/mm/vma.h b/mm/vma.h index 1bfe7e47f6be..8e4b61a7304c 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -300,7 +300,7 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, * f_op->mmap() but which might have an underlying file system which implements * f_op->mmap_prepare(). */ -static inline void set_vma_from_desc(struct vm_area_struct *vma, +static inline void compat_set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc) { /* diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index b31207bbe10d..ecd47d0f7d17 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -519,8 +519,8 @@ enum vma_operation { */ struct vm_area_desc { /* Immutable state. */ - const struct mm_struct *const mm; - struct file *const file; /* May vary from vm_file in stacked callers. 
*/ + struct mm_struct *mm; + struct file *file; /* May vary from vm_file in stacked callers. */ unsigned long start; unsigned long end; @@ -1278,50 +1278,70 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma) } /* Declared in vma.h. */ -static inline void set_vma_from_desc(struct vm_area_struct *vma, +static inline void compat_set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc); +static inline void compat_set_desc_from_vma(struct vm_area_desc *desc, + const struct file *file, + const struct vm_area_struct *vma) +{ + memset(desc, 0, sizeof(*desc)); + + desc->mm = vma->vm_mm; + desc->file = (struct file *)file; + desc->start = vma->vm_start; + desc->end = vma->vm_end; + + desc->pgoff = vma->vm_pgoff; + desc->vm_file = vma->vm_file; + desc->vma_flags = vma->flags; + desc->page_prot = vma->vm_page_prot; + + /* Default. */ + desc->action.type = MMAP_NOTHING; +} + +static inline unsigned long vma_pages(const struct vm_area_struct *vma) +{ + return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; +} + static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc) { return file->f_op->mmap_prepare(desc); } -static inline unsigned long vma_pages(struct vm_area_struct *vma) +static inline int __compat_vma_mmap(struct vm_area_desc *desc, + struct vm_area_struct *vma) { - return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + int err; + + /* Perform any preparatory tasks for mmap action. */ + err = mmap_action_prepare(desc); + if (err) + return err; + /* Update the VMA from the descriptor. */ + compat_set_vma_from_desc(vma, desc); + /* Complete any specified mmap actions. 
*/ + return mmap_action_complete(vma, &desc->action); } static inline int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { - struct vm_area_desc desc = { - .mm = vma->vm_mm, - .file = file, - .start = vma->vm_start, - .end = vma->vm_end, - - .pgoff = vma->vm_pgoff, - .vm_file = vma->vm_file, - .vma_flags = vma->flags, - .page_prot = vma->vm_page_prot, - - .action.type = MMAP_NOTHING, /* Default */ - }; - struct mmap_action *action = &desc.action; + struct vm_area_desc desc; + struct mmap_action *action; int err; + compat_set_desc_from_vma(&desc, file, vma); err = vfs_mmap_prepare(file, &desc); if (err) return err; - - err = mmap_action_prepare(&desc); - if (err) - return err; + action = &desc.action; /* being invoked from .mmmap means we don't have to enforce this. */ action->hide_from_rmap_until_complete = false; - set_vma_from_desc(vma, &desc); - return mmap_action_complete(vma, action); + return __compat_vma_mmap(&desc, vma); } static inline void vma_iter_init(struct vma_iterator *vmi, From f98cb7ca4aa44645347771c2c2a9724bc210c49e Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:44 +0000 Subject: [PATCH 351/369] drivers: hv: vmbus: replace deprecated mmap hook with mmap_prepare The f_op->mmap interface is deprecated, so update the vmbus driver to use its successor, mmap_prepare. This updates all callbacks which referenced the function pointer hv_mmap_ring_buffer to instead reference hv_mmap_prepare_ring_buffer, utilising the newly introduced compat_set_desc_from_vma() and __compat_vma_mmap() to be able to implement this change. The UIO HV generic driver is the only user of hv_create_ring_sysfs(), which is the only function which references vmbus_channel->mmap_prepare_ring_buffer which, in turn, is the only external interface to hv_mmap_prepare_ring_buffer. 
This patch therefore updates this caller to use mmap_prepare instead, which also previously used vm_iomap_memory(), so this change replaces it with its mmap_prepare equivalent, mmap_action_simple_ioremap(). [akpm@linux-foundation.org: restore struct vmbus_channel comment, per Michael Kelley] Link: https://lkml.kernel.org/r/05467cb62267d750e5c770147517d4df0246cda6.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Michael Kelley Tested-by: Michael Kelley Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- drivers/hv/hyperv_vmbus.h | 4 ++-- drivers/hv/vmbus_drv.c | 31 +++++++++++++++++++------------ drivers/uio/uio_hv_generic.c | 11 ++++++----- include/linux/hyperv.h | 4 ++-- 4 files changed, 29 insertions(+), 21 deletions(-) diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 7bd8f8486e85..31f576464f18 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -545,8 +545,8 @@ static inline int hv_debug_add_dev_dir(struct hv_device *dev) /* Create and remove sysfs entry for memory mapped ring buffers for a channel */ int hv_create_ring_sysfs(struct vmbus_channel *channel, - int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel, - struct vm_area_struct *vma)); + int (*hv_mmap_prepare_ring_buffer)(struct vmbus_channel *channel, + struct vm_area_desc *desc)); int hv_remove_ring_sysfs(struct vmbus_channel *channel); #endif /* 
_HYPERV_VMBUS_H */ diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index bc4fc1951ae1..45625487ba36 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1951,12 +1951,19 @@ static int hv_mmap_ring_buffer_wrapper(struct file *filp, struct kobject *kobj, struct vm_area_struct *vma) { struct vmbus_channel *channel = container_of(kobj, struct vmbus_channel, kobj); + struct vm_area_desc desc; + int err; /* - * hv_(create|remove)_ring_sysfs implementation ensures that mmap_ring_buffer - * is not NULL. + * hv_(create|remove)_ring_sysfs implementation ensures that + * mmap_prepare_ring_buffer is not NULL. */ - return channel->mmap_ring_buffer(channel, vma); + compat_set_desc_from_vma(&desc, filp, vma); + err = channel->mmap_prepare_ring_buffer(channel, &desc); + if (err) + return err; + + return __compat_vma_mmap(&desc, vma); } static struct bin_attribute chan_attr_ring_buffer = { @@ -2048,13 +2055,13 @@ static const struct kobj_type vmbus_chan_ktype = { /** * hv_create_ring_sysfs() - create "ring" sysfs entry corresponding to ring buffers for a channel. * @channel: Pointer to vmbus_channel structure - * @hv_mmap_ring_buffer: function pointer for initializing the function to be called on mmap of + * @hv_mmap_prepare_ring_buffer: function pointer for initializing the function to be called on mmap * channel's "ring" sysfs node, which is for the ring buffer of that channel. * Function pointer is of below type: - * int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel, - * struct vm_area_struct *vma)) - * This has a pointer to the channel and a pointer to vm_area_struct, - * used for mmap, as arguments. + * int (*hv_mmap_prepare_ring_buffer)(struct vmbus_channel *channel, + * struct vm_area_desc *desc)) + * This has a pointer to the channel and a pointer to vm_area_desc, + * used for mmap_prepare, as arguments. * * Sysfs node for ring buffer of a channel is created along with other fields, however its * visibility is disabled by default. 
Sysfs creation needs to be controlled when the use-case @@ -2071,12 +2078,12 @@ static const struct kobj_type vmbus_chan_ktype = { * Returns 0 on success or error code on failure. */ int hv_create_ring_sysfs(struct vmbus_channel *channel, - int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel, - struct vm_area_struct *vma)) + int (*hv_mmap_prepare_ring_buffer)(struct vmbus_channel *channel, + struct vm_area_desc *desc)) { struct kobject *kobj = &channel->kobj; - channel->mmap_ring_buffer = hv_mmap_ring_buffer; + channel->mmap_prepare_ring_buffer = hv_mmap_prepare_ring_buffer; channel->ring_sysfs_visible = true; return sysfs_update_group(kobj, &vmbus_chan_group); @@ -2098,7 +2105,7 @@ int hv_remove_ring_sysfs(struct vmbus_channel *channel) channel->ring_sysfs_visible = false; ret = sysfs_update_group(kobj, &vmbus_chan_group); - channel->mmap_ring_buffer = NULL; + channel->mmap_prepare_ring_buffer = NULL; return ret; } EXPORT_SYMBOL_GPL(hv_remove_ring_sysfs); diff --git a/drivers/uio/uio_hv_generic.c b/drivers/uio/uio_hv_generic.c index 3f8e2e27697f..29ec2d15ada8 100644 --- a/drivers/uio/uio_hv_generic.c +++ b/drivers/uio/uio_hv_generic.c @@ -154,15 +154,16 @@ static void hv_uio_rescind(struct vmbus_channel *channel) * The ring buffer is allocated as contiguous memory by vmbus_open */ static int -hv_uio_ring_mmap(struct vmbus_channel *channel, struct vm_area_struct *vma) +hv_uio_ring_mmap_prepare(struct vmbus_channel *channel, struct vm_area_desc *desc) { void *ring_buffer = page_address(channel->ringbuffer_page); if (channel->state != CHANNEL_OPENED_STATE) return -ENODEV; - return vm_iomap_memory(vma, virt_to_phys(ring_buffer), - channel->ringbuffer_pagecount << PAGE_SHIFT); + mmap_action_simple_ioremap(desc, virt_to_phys(ring_buffer), + channel->ringbuffer_pagecount << PAGE_SHIFT); + return 0; } /* Callback from VMBUS subsystem when new channel created. 
*/ @@ -183,7 +184,7 @@ hv_uio_new_channel(struct vmbus_channel *new_sc) } set_channel_read_mode(new_sc, HV_CALL_ISR); - ret = hv_create_ring_sysfs(new_sc, hv_uio_ring_mmap); + ret = hv_create_ring_sysfs(new_sc, hv_uio_ring_mmap_prepare); if (ret) { dev_err(device, "sysfs create ring bin file failed; %d\n", ret); vmbus_close(new_sc); @@ -366,7 +367,7 @@ hv_uio_probe(struct hv_device *dev, * or decoupled from uio_hv_generic probe. Userspace programs can make use of inotify * APIs to make sure that ring is created. */ - hv_create_ring_sysfs(channel, hv_uio_ring_mmap); + hv_create_ring_sysfs(channel, hv_uio_ring_mmap_prepare); hv_set_drvdata(dev, pdata); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index dfc516c1c719..a26fb8e7cedf 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1015,8 +1015,8 @@ struct vmbus_channel { /* The max size of a packet on this channel */ u32 max_pkt_size; - /* function to mmap ring buffer memory to the channel's sysfs ring attribute */ - int (*mmap_ring_buffer)(struct vmbus_channel *channel, struct vm_area_struct *vma); + /* function to mmap ring buffer memory to the channel's sysfs ring attribute */ + int (*mmap_prepare_ring_buffer)(struct vmbus_channel *channel, struct vm_area_desc *desc); /* boolean to control visibility of sysfs for ring buffer */ bool ring_sysfs_visible; From 933f05f58ac6014eaac387d22a76ace8606891d1 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:45 +0000 Subject: [PATCH 352/369] uio: replace deprecated mmap hook with mmap_prepare in uio_info The f_op->mmap interface is deprecated, so update uio_info to use its successor, mmap_prepare. Therefore, replace the uio_info->mmap hook with a new uio_info->mmap_prepare hook, and update its one user, target_core_user, to both specify this new mmap_prepare hook and also to use the new vm_ops->mapped() hook to continue to maintain a correct udev->kref refcount. 
Then update uio_mmap() to utilise the mmap_prepare compatibility layer to invoke this callback from the uio mmap invocation. Link: https://lkml.kernel.org/r/157583e4477705b496896c7acd4ac88a937b8fa6.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- drivers/target/target_core_user.c | 26 ++++++++++++++++++-------- drivers/uio/uio.c | 10 ++++++++-- include/linux/uio_driver.h | 4 ++-- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index af95531ddd35..edc2afd5f4ee 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -1860,6 +1860,17 @@ static struct page *tcmu_try_get_data_page(struct tcmu_dev *udev, uint32_t dpi) return NULL; } +static int tcmu_vma_mapped(unsigned long start, unsigned long end, pgoff_t pgoff, + const struct file *file, void **vm_private_data) +{ + struct tcmu_dev *udev = *vm_private_data; + + pr_debug("vma_mapped\n"); + + kref_get(&udev->kref); + return 0; +} + static void tcmu_vma_open(struct vm_area_struct *vma) { struct tcmu_dev *udev = vma->vm_private_data; @@ -1919,26 +1930,25 @@ static vm_fault_t tcmu_vma_fault(struct vm_fault *vmf) } static const struct vm_operations_struct tcmu_vm_ops = { + .mapped = tcmu_vma_mapped, .open = tcmu_vma_open, .close = tcmu_vma_close, .fault = tcmu_vma_fault, 
}; -static int tcmu_mmap(struct uio_info *info, struct vm_area_struct *vma) +static int tcmu_mmap_prepare(struct uio_info *info, struct vm_area_desc *desc) { struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); - vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); - vma->vm_ops = &tcmu_vm_ops; + vma_desc_set_flags(desc, VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT); + desc->vm_ops = &tcmu_vm_ops; - vma->vm_private_data = udev; + desc->private_data = udev; /* Ensure the mmap is exactly the right size */ - if (vma_pages(vma) != udev->mmap_pages) + if (vma_desc_pages(desc) != udev->mmap_pages) return -EINVAL; - tcmu_vma_open(vma); - return 0; } @@ -2253,7 +2263,7 @@ static int tcmu_configure_device(struct se_device *dev) info->irqcontrol = tcmu_irqcontrol; info->irq = UIO_IRQ_CUSTOM; - info->mmap = tcmu_mmap; + info->mmap_prepare = tcmu_mmap_prepare; info->open = tcmu_open; info->release = tcmu_release; diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 5a4998e2caf8..1e4ade78ed84 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -850,8 +850,14 @@ static int uio_mmap(struct file *filep, struct vm_area_struct *vma) goto out; } - if (idev->info->mmap) { - ret = idev->info->mmap(idev->info, vma); + if (idev->info->mmap_prepare) { + struct vm_area_desc desc; + + compat_set_desc_from_vma(&desc, filep, vma); + ret = idev->info->mmap_prepare(idev->info, &desc); + if (ret) + goto out; + ret = __compat_vma_mmap(&desc, vma); goto out; } diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 334641e20fb1..02eaac47ac44 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -97,7 +97,7 @@ struct uio_device { * @irq_flags: flags for request_irq() * @priv: optional private data * @handler: the device's irq handler - * @mmap: mmap operation for this uio device + * @mmap_prepare: mmap_prepare operation for this uio device * @open: open operation for this uio device * @release: release operation for this uio device * 
@irqcontrol: disable/enable irqs when 0/1 is written to /dev/uioX @@ -112,7 +112,7 @@ struct uio_info { unsigned long irq_flags; void *priv; irqreturn_t (*handler)(int irq, struct uio_info *dev_info); - int (*mmap)(struct uio_info *info, struct vm_area_struct *vma); + int (*mmap_prepare)(struct uio_info *info, struct vm_area_desc *desc); int (*open)(struct uio_info *info, struct inode *inode); int (*release)(struct uio_info *info, struct inode *inode); int (*irqcontrol)(struct uio_info *info, s32 irq_on); From 62c65fd740e979a3967db08971b93aefcec510d4 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:46 +0000 Subject: [PATCH 353/369] mm: add mmap_action_map_kernel_pages[_full]() A user can invoke mmap_action_map_kernel_pages() to specify that the mapping should map kernel pages starting from desc->start of a specified number of pages specified in an array. In order to implement this, adjust mmap_action_prepare() to be able to return an error code, as it makes sense to assert that the specified parameters are valid as quickly as possible as well as updating the VMA flags to include VMA_MIXEDMAP_BIT as necessary. This provides an mmap_prepare equivalent of vm_insert_pages(). We additionally update the existing vm_insert_pages() code to use range_in_vma() and add a new range_in_vma_desc() helper function for the mmap_prepare case, sharing the code between the two in range_is_subset(). We add both mmap_action_map_kernel_pages() and mmap_action_map_kernel_pages_full() to allow for both partial and full VMA mappings. We update the documentation to reflect the new features. Finally, we update the VMA tests accordingly to reflect the changes. 
Link: https://lkml.kernel.org/r/926ac961690d856e67ec847bee2370ab3c6b9046.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- Documentation/filesystems/mmap_prepare.rst | 8 ++ include/linux/mm.h | 95 +++++++++++++++++++++- include/linux/mm_types.h | 7 ++ mm/memory.c | 42 +++++++++- mm/util.c | 7 ++ tools/testing/vma/include/dup.h | 7 ++ 6 files changed, 160 insertions(+), 6 deletions(-) diff --git a/Documentation/filesystems/mmap_prepare.rst b/Documentation/filesystems/mmap_prepare.rst index 14bb057be564..82c99c95ad85 100644 --- a/Documentation/filesystems/mmap_prepare.rst +++ b/Documentation/filesystems/mmap_prepare.rst @@ -156,5 +156,13 @@ pointer. These are: * mmap_action_simple_ioremap() - Sets up an I/O remap from a specified physical address and over a specified length. +* mmap_action_map_kernel_pages() - Maps a specified array of `struct page` + pointers in the VMA from a specific offset. + +* mmap_action_map_kernel_pages_full() - Maps a specified array of `struct + page` pointers over the entire VMA. The caller must ensure there are + sufficient entries in the page array to cover the entire range of the + described VMA. + **NOTE:** The ``action`` field should never normally be manipulated directly, rather you ought to use one of these helpers. 
diff --git a/include/linux/mm.h b/include/linux/mm.h index 53b21de40f87..61dff7f03554 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2905,7 +2905,7 @@ static inline bool folio_maybe_mapped_shared(struct folio *folio) * The caller must add any reference (e.g., from folio_try_get()) it might be * holding itself to the result. * - * Returns the expected folio refcount. + * Returns: the expected folio refcount. */ static inline int folio_expected_ref_count(const struct folio *folio) { @@ -4348,6 +4348,45 @@ static inline void mmap_action_simple_ioremap(struct vm_area_desc *desc, action->type = MMAP_SIMPLE_IO_REMAP; } +/** + * mmap_action_map_kernel_pages - helper for mmap_prepare hook to specify that + * @nr_pages kernel pages contained in the @pages array should be mapped to userland + * starting at virtual address @start. + * @desc: The VMA descriptor for the VMA requiring kernel pages to be mapped. + * @start: The virtual address from which to map them. + * @pages: An array of struct page pointers describing the memory to map. + * @nr_pages: The number of entries in the @pages array. + */ +static inline void mmap_action_map_kernel_pages(struct vm_area_desc *desc, + unsigned long start, struct page **pages, + unsigned long nr_pages) +{ + struct mmap_action *action = &desc->action; + + action->type = MMAP_MAP_KERNEL_PAGES; + action->map_kernel.start = start; + action->map_kernel.pages = pages; + action->map_kernel.nr_pages = nr_pages; + action->map_kernel.pgoff = desc->pgoff; +} + +/** + * mmap_action_map_kernel_pages_full - helper for mmap_prepare hook to specify that + * kernel pages contained in the @pages array should be mapped to userland + * from @desc->start to @desc->end. + * @desc: The VMA descriptor for the VMA requiring kernel pages to be mapped. + * @pages: An array of struct page pointers describing the memory to map. + * + * The caller must ensure that @pages contains sufficient entries to cover the + * entire range described by @desc.
+ */ +static inline void mmap_action_map_kernel_pages_full(struct vm_area_desc *desc, + struct page **pages) +{ + mmap_action_map_kernel_pages(desc, desc->start, pages, + vma_desc_pages(desc)); +} + int mmap_action_prepare(struct vm_area_desc *desc); int mmap_action_complete(struct vm_area_struct *vma, struct mmap_action *action); @@ -4364,10 +4403,59 @@ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, return vma; } +/** + * range_is_subset - Is the specified inner range a subset of the outer range? + * @outer_start: The start of the outer range. + * @outer_end: The exclusive end of the outer range. + * @inner_start: The start of the inner range. + * @inner_end: The exclusive end of the inner range. + * + * Returns: %true if [inner_start, inner_end) is a subset of [outer_start, + * outer_end), otherwise %false. + */ +static inline bool range_is_subset(unsigned long outer_start, + unsigned long outer_end, + unsigned long inner_start, + unsigned long inner_end) +{ + return outer_start <= inner_start && inner_end <= outer_end; +} + +/** + * range_in_vma - is the specified [@start, @end) range a subset of the VMA? + * @vma: The VMA against which we want to check [@start, @end). + * @start: The start of the range we wish to check. + * @end: The exclusive end of the range we wish to check. + * + * Returns: %true if [@start, @end) is a subset of [@vma->vm_start, + * @vma->vm_end), %false otherwise. + */ static inline bool range_in_vma(const struct vm_area_struct *vma, unsigned long start, unsigned long end) { - return (vma && vma->vm_start <= start && end <= vma->vm_end); + if (!vma) + return false; + + return range_is_subset(vma->vm_start, vma->vm_end, start, end); +} + +/** + * range_in_vma_desc - is the specified [@start, @end) range a subset of the VMA + * described by @desc, a VMA descriptor? + * @desc: The VMA descriptor against which we want to check [@start, @end). + * @start: The start of the range we wish to check. 
+ * @end: The exclusive end of the range we wish to check. + * + * Returns: %true if [@start, @end) is a subset of [@desc->start, @desc->end), + * %false otherwise. + */ +static inline bool range_in_vma_desc(const struct vm_area_desc *desc, + unsigned long start, unsigned long end) +{ + if (!desc) + return false; + + return range_is_subset(desc->start, desc->end, start, end); } #ifdef CONFIG_MMU @@ -4411,6 +4499,9 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); +int map_kernel_pages_prepare(struct vm_area_desc *desc); +int map_kernel_pages_complete(struct vm_area_struct *vma, + struct mmap_action *action); int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num); int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b702c63bf0e0..a308e2c23b82 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -815,6 +815,7 @@ enum mmap_action_type { MMAP_REMAP_PFN, /* Remap PFN range. */ MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ MMAP_SIMPLE_IO_REMAP, /* I/O remap with guardrails. */ + MMAP_MAP_KERNEL_PAGES, /* Map kernel page range from array. 
*/ }; /* @@ -833,6 +834,12 @@ struct mmap_action { phys_addr_t start_phys_addr; unsigned long size; } simple_ioremap; + struct { + unsigned long start; + struct page **pages; + unsigned long nr_pages; + pgoff_t pgoff; + } map_kernel; }; enum mmap_action_type type; diff --git a/mm/memory.c b/mm/memory.c index c1c323512939..5d032b5293c6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2484,13 +2484,14 @@ out: int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num) { - const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1; + const unsigned long nr_pages = *num; + const unsigned long end = addr + PAGE_SIZE * nr_pages; - if (addr < vma->vm_start || end_addr >= vma->vm_end) + if (!range_in_vma(vma, addr, end)) return -EFAULT; if (!(vma->vm_flags & VM_MIXEDMAP)) { - BUG_ON(mmap_read_trylock(vma->vm_mm)); - BUG_ON(vma->vm_flags & VM_PFNMAP); + VM_WARN_ON_ONCE(mmap_read_trylock(vma->vm_mm)); + VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP); vm_flags_set(vma, VM_MIXEDMAP); } /* Defer page refcount checking till we're about to map that page. 
*/ @@ -2498,6 +2499,39 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_pages); +int map_kernel_pages_prepare(struct vm_area_desc *desc) +{ + const struct mmap_action *action = &desc->action; + const unsigned long addr = action->map_kernel.start; + unsigned long nr_pages, end; + + if (!vma_desc_test(desc, VMA_MIXEDMAP_BIT)) { + VM_WARN_ON_ONCE(mmap_read_trylock(desc->mm)); + VM_WARN_ON_ONCE(vma_desc_test(desc, VMA_PFNMAP_BIT)); + vma_desc_set_flags(desc, VMA_MIXEDMAP_BIT); + } + + nr_pages = action->map_kernel.nr_pages; + end = addr + PAGE_SIZE * nr_pages; + if (!range_in_vma_desc(desc, addr, end)) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL(map_kernel_pages_prepare); + +int map_kernel_pages_complete(struct vm_area_struct *vma, + struct mmap_action *action) +{ + unsigned long nr_pages; + + nr_pages = action->map_kernel.nr_pages; + return insert_pages(vma, action->map_kernel.start, + action->map_kernel.pages, + &nr_pages, vma->vm_page_prot); +} +EXPORT_SYMBOL(map_kernel_pages_complete); + /** * vm_insert_page - insert single page into user vma * @vma: user vma to map to diff --git a/mm/util.c b/mm/util.c index 5ae20876ef2c..f063fd4de1e8 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1448,6 +1448,8 @@ int mmap_action_prepare(struct vm_area_desc *desc) return io_remap_pfn_range_prepare(desc); case MMAP_SIMPLE_IO_REMAP: return simple_ioremap_prepare(desc); + case MMAP_MAP_KERNEL_PAGES: + return map_kernel_pages_prepare(desc); } WARN_ON_ONCE(1); @@ -1475,6 +1477,9 @@ int mmap_action_complete(struct vm_area_struct *vma, case MMAP_REMAP_PFN: err = remap_pfn_range_complete(vma, action); break; + case MMAP_MAP_KERNEL_PAGES: + err = map_kernel_pages_complete(vma, action); + break; case MMAP_IO_REMAP_PFN: case MMAP_SIMPLE_IO_REMAP: /* Should have been delegated. 
*/ @@ -1495,6 +1500,7 @@ int mmap_action_prepare(struct vm_area_desc *desc) case MMAP_REMAP_PFN: case MMAP_IO_REMAP_PFN: case MMAP_SIMPLE_IO_REMAP: + case MMAP_MAP_KERNEL_PAGES: WARN_ON_ONCE(1); /* nommu cannot handle these. */ break; } @@ -1514,6 +1520,7 @@ int mmap_action_complete(struct vm_area_struct *vma, case MMAP_REMAP_PFN: case MMAP_IO_REMAP_PFN: case MMAP_SIMPLE_IO_REMAP: + case MMAP_MAP_KERNEL_PAGES: WARN_ON_ONCE(1); /* nommu cannot handle this. */ err = -EINVAL; diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index ecd47d0f7d17..b4864aad2db0 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -454,6 +454,7 @@ enum mmap_action_type { MMAP_REMAP_PFN, /* Remap PFN range. */ MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ MMAP_SIMPLE_IO_REMAP, /* I/O remap with guardrails. */ + MMAP_MAP_KERNEL_PAGES, /* Map kernel page range from an array. */ }; /* @@ -472,6 +473,12 @@ struct mmap_action { phys_addr_t start_phys_addr; unsigned long size; } simple_ioremap; + struct { + unsigned long start; + struct page **pages; + unsigned long nr_pages; + pgoff_t pgoff; + } map_kernel; }; enum mmap_action_type type; From 1a0fe419f6af85b3ff311be46bfbff1b615b083d Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 22:39:47 +0000 Subject: [PATCH 354/369] mm: on remap assert that input range within the proposed VMA Now we have range_in_vma_desc(), update remap_pfn_range_prepare() to check whether the input range is contained within the specified VMA, so we can fail at prepare time if an invalid range is specified. This covers the I/O remap mmap actions also which ultimately call into this function, and other mmap action types either already span the full VMA or check this already.
Link: https://lkml.kernel.org/r/0fc1092f4b74f3f673a58e4e3942dc83f336dd85.1774045440.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Acked-by: Vlastimil Babka (SUSE) Cc: Alexander Shishkin Cc: Alexandre Torgue Cc: Al Viro Cc: Arnd Bergmann Cc: Bodo Stroesser Cc: Christian Brauner Cc: Clemens Ladisch Cc: David Hildenbrand Cc: David Howells Cc: Dexuan Cui Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: K. Y. Srinivasan Cc: Liam Howlett Cc: Long Li Cc: Marc Dionne Cc: "Martin K. Petersen" Cc: Maxime Coquelin Cc: Michal Hocko Cc: Mike Rapoport Cc: Miquel Raynal Cc: Pedro Falcato Cc: Richard Weinberger Cc: Ryan Roberts Cc: Vignesh Raghavendra Cc: Wei Liu Signed-off-by: Andrew Morton --- mm/memory.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/memory.c b/mm/memory.c index 5d032b5293c6..ea6568571131 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3142,6 +3142,9 @@ int remap_pfn_range_prepare(struct vm_area_desc *desc) const bool is_cow = vma_desc_is_cow_mapping(desc); int err; + if (!range_in_vma_desc(desc, start, end)) + return -EFAULT; + err = get_remap_pgoff(is_cow, start, end, desc->start, desc->end, pfn, &desc->pgoff); if (err) From c0ea52c18c78c33c68c350eb9d3dcdf8c513254d Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 18:07:18 +0000 Subject: [PATCH 355/369] mm/huge_memory: simplify vma_is_special_huge() Patch series "mm/huge_memory: refactor zap_huge_pmd()", v3. zap_huge_pmd() is overly complicated, clean it up and also add an assert in the case that we encounter a buggy PMD entry that doesn't match expectations. This is motivated by a bug discovered [0] where the PMD entry was none of: * A non-DAX, PFN or mixed map. * The huge zero folio * A present PMD entry * A softleaf entry In zap_huge_pmd(), but due to the bug we managed to reach this code.
It is useful to explicitly call this out rather than have an arbitrary NULL pointer dereference happen, which also improves understanding of what's going on. The series goes further to make use of vm_normal_folio_pmd() rather than implementing custom logic for retrieving the folio, and extends softleaf functionality to provide and use an equivalent softleaf function. This patch (of 13): This function is confused - it overloads the term 'special' yet again, checks for DAX but in many cases the code explicitly excludes DAX before invoking the predicate. It also unnecessarily checks for vma->vm_file - this has to be present for a driver to have set VMA_MIXEDMAP_BIT or VMA_PFNMAP_BIT. In fact, a far simpler form of this is to reverse the DAX predicate and return false if DAX is set. This makes sense from the point of view of 'special' as in vm_normal_page(), as DAX actually does potentially have retrievable folios. Also there's no need to have this in mm.h so move it to huge_memory.c. No functional change intended. 
Link: https://lkml.kernel.org/r/cover.1774029655.git.ljs@kernel.org Link: https://lkml.kernel.org/r/d2b65883dc4895f197c4b4a69fbf27a063463412.1774029655.git.ljs@kernel.org Link: https://lore.kernel.org/all/6b3d7ad7-49e1-407a-903d-3103704160d8@lucifer.local/ [0] Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Zi Yan Cc: Qi Zheng Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- include/linux/mm.h | 16 ---------------- mm/huge_memory.c | 30 +++++++++++++++++++++++------- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index bd7f0e1d8094..61fda1672b29 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -83,7 +83,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; * file is never split and the MAX_PAGECACHE_ORDER limit does not apply to * it. Same to PFNMAPs where there's neither page* nor pagecache. */ -#define THP_ORDERS_ALL_SPECIAL \ +#define THP_ORDERS_ALL_SPECIAL_DAX \ (BIT(PMD_ORDER) | BIT(PUD_ORDER)) #define THP_ORDERS_ALL_FILE_DEFAULT \ ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0)) @@ -92,7 +92,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; * Mask of all large folio orders supported for THP. */ #define THP_ORDERS_ALL \ - (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL | THP_ORDERS_ALL_FILE_DEFAULT) + (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL_DAX | THP_ORDERS_ALL_FILE_DEFAULT) enum tva_type { TVA_SMAPS, /* Exposing "THPeligible:" in smaps. 
*/ diff --git a/include/linux/mm.h b/include/linux/mm.h index 61dff7f03554..8260e28205e9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -5068,22 +5068,6 @@ long copy_folio_from_user(struct folio *dst_folio, const void __user *usr_src, bool allow_pagefault); -/** - * vma_is_special_huge - Are transhuge page-table entries considered special? - * @vma: Pointer to the struct vm_area_struct to consider - * - * Whether transhuge page-table entries are considered "special" following - * the definition in vm_normal_page(). - * - * Return: true if transhuge page-table entries should be considered special, - * false otherwise. - */ -static inline bool vma_is_special_huge(const struct vm_area_struct *vma) -{ - return vma_is_dax(vma) || (vma->vm_file && - (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); -} - #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if MAX_NUMNODES > 1 diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1c1a7cf7b209..db390b0098d9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -100,6 +100,14 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } +/* If returns true, we are unable to access the VMA's folios. */ +static bool vma_is_special_huge(const struct vm_area_struct *vma) +{ + if (vma_is_dax(vma)) + return false; + return vma_test_any(vma, VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT); +} + unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, enum tva_type type, @@ -113,8 +121,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, /* Check the intersection of requested and supported orders. 
*/ if (vma_is_anonymous(vma)) supported_orders = THP_ORDERS_ALL_ANON; - else if (vma_is_special_huge(vma)) - supported_orders = THP_ORDERS_ALL_SPECIAL; + else if (vma_is_dax(vma) || vma_is_special_huge(vma)) + supported_orders = THP_ORDERS_ALL_SPECIAL_DAX; else supported_orders = THP_ORDERS_ALL_FILE_DEFAULT; @@ -2415,7 +2423,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb->fullmm); arch_check_zapped_pmd(vma, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); - if (!vma_is_dax(vma) && vma_is_special_huge(vma)) { + if (vma_is_special_huge(vma)) { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); @@ -2917,7 +2925,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); arch_check_zapped_pud(vma, orig_pud); tlb_remove_pud_tlb_entry(tlb, pud, addr); - if (!vma_is_dax(vma) && vma_is_special_huge(vma)) { + if (vma_is_special_huge(vma)) { spin_unlock(ptl); /* No zero page support yet */ } else { @@ -3068,7 +3076,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, */ if (arch_needs_pgtable_deposit()) zap_deposited_table(mm, pmd); - if (!vma_is_dax(vma) && vma_is_special_huge(vma)) + if (vma_is_special_huge(vma)) return; if (unlikely(pmd_is_migration_entry(old_pmd))) { const softleaf_t old_entry = softleaf_from_pmd(old_pmd); @@ -4629,8 +4637,16 @@ next: static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) { - return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) || - is_vm_hugetlb_page(vma); + if (vma_is_dax(vma)) + return true; + if (vma_is_special_huge(vma)) + return true; + if (vma_test(vma, VMA_IO_BIT)) + return true; + if (is_vm_hugetlb_page(vma)) + return true; + + return false; } static int split_huge_pages_pid(int pid, unsigned long vaddr_start, From 6886f93790b3c1935bfb9e668a7c3f68d7eff510 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: 
Fri, 20 Mar 2026 18:07:19 +0000 Subject: [PATCH 356/369] mm/huge: avoid big else branch in zap_huge_pmd() We don't need to have an extra level of indentation, we can simply exit early in the first two branches. No functional change intended. Link: https://lkml.kernel.org/r/6b4d5efdbf5554b8fe788f677d0b50f355eec999.1774029655.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Baolin Wang Acked-by: Qi Zheng Reviewed-by: Suren Baghdasaryan Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 103 ++++++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 50 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index db390b0098d9..4dfffd6a1bbe 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2405,8 +2405,10 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { - pmd_t orig_pmd; + struct folio *folio = NULL; + int flush_needed = 1; spinlock_t *ptl; + pmd_t orig_pmd; tlb_change_page_size(tlb, HPAGE_PMD_SIZE); @@ -2427,59 +2429,60 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); - } else if (is_huge_zero_pmd(orig_pmd)) { + return 1; + } + if (is_huge_zero_pmd(orig_pmd)) { if (!vma_is_dax(vma) || arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); - } else { - struct folio *folio = NULL; - int flush_needed = 1; - - if (pmd_present(orig_pmd)) { - struct page *page = pmd_page(orig_pmd); - - folio = page_folio(page); - folio_remove_rmap_pmd(folio, page, vma); - WARN_ON_ONCE(folio_mapcount(folio) < 0); - VM_BUG_ON_PAGE(!PageHead(page), page); - } else if (pmd_is_valid_softleaf(orig_pmd)) { - const 
softleaf_t entry = softleaf_from_pmd(orig_pmd); - - folio = softleaf_to_folio(entry); - flush_needed = 0; - - if (!thp_migration_supported()) - WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); - } - - if (folio_test_anon(folio)) { - zap_deposited_table(tlb->mm, pmd); - add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); - } else { - if (arch_needs_pgtable_deposit()) - zap_deposited_table(tlb->mm, pmd); - add_mm_counter(tlb->mm, mm_counter_file(folio), - -HPAGE_PMD_NR); - - /* - * Use flush_needed to indicate whether the PMD entry - * is present, instead of checking pmd_present() again. - */ - if (flush_needed && pmd_young(orig_pmd) && - likely(vma_has_recency(vma))) - folio_mark_accessed(folio); - } - - if (folio_is_device_private(folio)) { - folio_remove_rmap_pmd(folio, &folio->page, vma); - WARN_ON_ONCE(folio_mapcount(folio) < 0); - folio_put(folio); - } - - spin_unlock(ptl); - if (flush_needed) - tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); + return 1; } + + if (pmd_present(orig_pmd)) { + struct page *page = pmd_page(orig_pmd); + + folio = page_folio(page); + folio_remove_rmap_pmd(folio, page, vma); + WARN_ON_ONCE(folio_mapcount(folio) < 0); + VM_BUG_ON_PAGE(!PageHead(page), page); + } else if (pmd_is_valid_softleaf(orig_pmd)) { + const softleaf_t entry = softleaf_from_pmd(orig_pmd); + + folio = softleaf_to_folio(entry); + flush_needed = 0; + + if (!thp_migration_supported()) + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + } + + if (folio_test_anon(folio)) { + zap_deposited_table(tlb->mm, pmd); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + } else { + if (arch_needs_pgtable_deposit()) + zap_deposited_table(tlb->mm, pmd); + add_mm_counter(tlb->mm, mm_counter_file(folio), + -HPAGE_PMD_NR); + + /* + * Use flush_needed to indicate whether the PMD entry + * is present, instead of checking pmd_present() again. 
+ */ + if (flush_needed && pmd_young(orig_pmd) && + likely(vma_has_recency(vma))) + folio_mark_accessed(folio); + } + + if (folio_is_device_private(folio)) { + folio_remove_rmap_pmd(folio, &folio->page, vma); + WARN_ON_ONCE(folio_mapcount(folio) < 0); + folio_put(folio); + } + + spin_unlock(ptl); + if (flush_needed) + tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); + return 1; } From b92b9d4f699ce1f0ae746ebc69bca329adc07293 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 18:07:20 +0000 Subject: [PATCH 357/369] mm/huge_memory: have zap_huge_pmd return a boolean, add kdoc There's no need to use the ancient approach of returning an integer here, just return a boolean. Also update flush_needed to be a boolean, similarly. Also add a kdoc comment describing the function. No functional change intended. Link: https://lkml.kernel.org/r/132274566cd49d2960a2294c36dd2450593dfc55.1774029655.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Baolin Wang Acked-by: Qi Zheng Reviewed-by: Suren Baghdasaryan Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- mm/huge_memory.c | 23 ++++++++++++++++------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 61fda1672b29..2949e5acff35 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -27,8 +27,8 @@ static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf); bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long next); -int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr); +bool zap_huge_pmd(struct mmu_gather *tlb, struct 
vm_area_struct *vma, pmd_t *pmd, + unsigned long addr); int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr); bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4dfffd6a1bbe..65e554afdf16 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2402,11 +2402,20 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) mm_dec_nr_ptes(mm); } -int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, +/** + * zap_huge_pmd - Zap a huge THP which is of PMD size. + * @tlb: The MMU gather TLB state associated with the operation. + * @vma: The VMA containing the range to zap. + * @pmd: A pointer to the leaf PMD entry. + * @addr: The virtual address for the range to zap. + * + * Returns: %true on success, %false otherwise. + */ +bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { struct folio *folio = NULL; - int flush_needed = 1; + bool flush_needed = true; spinlock_t *ptl; pmd_t orig_pmd; @@ -2414,7 +2423,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) - return 0; + return false; /* * For architectures like ppc64 we look at deposited pgtable * when calling pmdp_huge_get_and_clear. 
So do the @@ -2429,13 +2438,13 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); - return 1; + return true; } if (is_huge_zero_pmd(orig_pmd)) { if (!vma_is_dax(vma) || arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); - return 1; + return true; } if (pmd_present(orig_pmd)) { @@ -2449,7 +2458,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, const softleaf_t entry = softleaf_from_pmd(orig_pmd); folio = softleaf_to_folio(entry); - flush_needed = 0; + flush_needed = false; if (!thp_migration_supported()) WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); @@ -2483,7 +2492,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (flush_needed) tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); - return 1; + return true; } #ifndef pmd_move_must_withdraw From 7011140612fd13000b2ebed43e1bfb542f90b959 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 18:07:21 +0000 Subject: [PATCH 358/369] mm/huge_memory: handle buggy PMD entry in zap_huge_pmd() A recent bug I analysed managed to, through a bug in the userfaultfd implementation, reach an invalid point in the zap_huge_pmd() code where the PMD was none of: - A non-DAX, PFN or mixed map. - The huge zero folio - A present PMD entry - A softleaf entry The code at this point calls folio_test_anon() on a known-NULL folio. Having logic like this explicitly NULL dereference in the code is hard to understand, and makes debugging potentially more difficult. Add an else branch to handle this case and WARN(). No functional change intended. 
Link: https://lore.kernel.org/all/6b3d7ad7-49e1-407a-903d-3103704160d8@lucifer.local/ Link: https://lkml.kernel.org/r/fcf1f6de84a2ace188b6bf103fa15dde695f1ed8.1774029655.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Baolin Wang Reviewed-by: Suren Baghdasaryan Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Qi Zheng Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 65e554afdf16..2f9aec7d4952 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2462,6 +2462,10 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (!thp_migration_supported()) WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + } else { + WARN_ON_ONCE(true); + spin_unlock(ptl); + return true; } if (folio_test_anon(folio)) { From 7217744e0aa373dd6f0a62b0db610ff085e50153 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 18:07:22 +0000 Subject: [PATCH 359/369] mm/huge_memory: add a common exit path to zap_huge_pmd() Other than when we acquire the PTL, we always need to unlock the PTL, and optionally need to flush on exit. The code is currently very duplicated in this respect, so default flush_needed to false, set it true in the case in which it's required, then share the same logic for all exit paths. This also makes flush_needed make more sense as a function-scope value (we don't need to flush for the PFN map/mixed map, zero huge, error cases for instance). 
Link: https://lkml.kernel.org/r/6b281d8ed972dff0e89bdcbdd810c96c7ae8c9dc.1774029655.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Baolin Wang Reviewed-by: Suren Baghdasaryan Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Qi Zheng Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2f9aec7d4952..283685dd6d9f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2415,7 +2415,7 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { struct folio *folio = NULL; - bool flush_needed = true; + bool flush_needed = false; spinlock_t *ptl; pmd_t orig_pmd; @@ -2437,19 +2437,18 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (vma_is_special_huge(vma)) { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); - spin_unlock(ptl); - return true; + goto out; } if (is_huge_zero_pmd(orig_pmd)) { if (!vma_is_dax(vma) || arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); - spin_unlock(ptl); - return true; + goto out; } if (pmd_present(orig_pmd)) { struct page *page = pmd_page(orig_pmd); + flush_needed = true; folio = page_folio(page); folio_remove_rmap_pmd(folio, page, vma); WARN_ON_ONCE(folio_mapcount(folio) < 0); @@ -2458,14 +2457,12 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, const softleaf_t entry = softleaf_from_pmd(orig_pmd); folio = softleaf_to_folio(entry); - flush_needed = false; if (!thp_migration_supported()) WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); } else { WARN_ON_ONCE(true); - spin_unlock(ptl); - return true; + goto out; } if (folio_test_anon(folio)) { @@ -2492,10 +2489,10 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 
folio_put(folio); } +out: spin_unlock(ptl); if (flush_needed) tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); - return true; } From 07f264680ac875725ca12e6adadd9e3def2e30f2 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 18:07:23 +0000 Subject: [PATCH 360/369] mm/huge_memory: remove unnecessary VM_BUG_ON_PAGE() This has been around since the beginnings of the THP implementation. I think we can safely assume that, if we have a THP folio, it will have a head page. Link: https://lkml.kernel.org/r/f3fa8eb4634ccb2e78209f570cc1a769a02ce93e.1774029655.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Baolin Wang Reviewed-by: Suren Baghdasaryan Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Qi Zheng Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 283685dd6d9f..f072acd5b279 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2452,7 +2452,6 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, folio = page_folio(page); folio_remove_rmap_pmd(folio, page, vma); WARN_ON_ONCE(folio_mapcount(folio) < 0); - VM_BUG_ON_PAGE(!PageHead(page), page); } else if (pmd_is_valid_softleaf(orig_pmd)) { const softleaf_t entry = softleaf_from_pmd(orig_pmd); From d490940f3a409f640bf98f8966c8e863b2452692 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 18:07:24 +0000 Subject: [PATCH 361/369] mm/huge_memory: deduplicate zap deposited table call Rather than having separate logic for each case determining whether to zap the deposited table, simply track this via a boolean. We default this to whether the architecture requires it, and update it as required elsewhere. 
Link: https://lkml.kernel.org/r/71f576a1fbcd27a86322d12caa937bcdacf75407.1774029655.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Baolin Wang Reviewed-by: Suren Baghdasaryan Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Qi Zheng Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f072acd5b279..41506f376f4b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2414,6 +2414,7 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { + bool has_deposit = arch_needs_pgtable_deposit(); struct folio *folio = NULL; bool flush_needed = false; spinlock_t *ptl; @@ -2434,23 +2435,19 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb->fullmm); arch_check_zapped_pmd(vma, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); - if (vma_is_special_huge(vma)) { - if (arch_needs_pgtable_deposit()) - zap_deposited_table(tlb->mm, pmd); + if (vma_is_special_huge(vma)) goto out; - } if (is_huge_zero_pmd(orig_pmd)) { - if (!vma_is_dax(vma) || arch_needs_pgtable_deposit()) - zap_deposited_table(tlb->mm, pmd); + if (!vma_is_dax(vma)) + has_deposit = true; goto out; } if (pmd_present(orig_pmd)) { - struct page *page = pmd_page(orig_pmd); + folio = pmd_folio(orig_pmd); flush_needed = true; - folio = page_folio(page); - folio_remove_rmap_pmd(folio, page, vma); + folio_remove_rmap_pmd(folio, &folio->page, vma); WARN_ON_ONCE(folio_mapcount(folio) < 0); } else if (pmd_is_valid_softleaf(orig_pmd)) { const softleaf_t entry = softleaf_from_pmd(orig_pmd); @@ -2465,11 +2462,9 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, } if (folio_test_anon(folio)) { - 
zap_deposited_table(tlb->mm, pmd); + has_deposit = true; add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { - if (arch_needs_pgtable_deposit()) - zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PMD_NR); @@ -2489,6 +2484,9 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, } out: + if (has_deposit) + zap_deposited_table(tlb->mm, pmd); + spin_unlock(ptl); if (flush_needed) tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); From 1fc034c1c9dd387f6f82be93326b0add6ffd49e7 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 18:07:25 +0000 Subject: [PATCH 362/369] mm/huge_memory: remove unnecessary sanity checks These checks have been in place since 2014, I think we can safely assume that we are in a place where we don't need these as runtime checks. In addition there are 4 other invocations of folio_remove_rmap_pmd(), none of which make this assertion. If we need to add this assertion, it should be in folio_remove_rmap_pmd(), and as a VM_WARN_ON_ONCE(), however these seem superfluous so just remove them. 
Link: https://lkml.kernel.org/r/0c4c5ab247c90f80cf44718e8124b217d6a22544.1774029655.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Qi Zheng Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 41506f376f4b..d7c1c8f55c13 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2448,7 +2448,6 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, flush_needed = true; folio_remove_rmap_pmd(folio, &folio->page, vma); - WARN_ON_ONCE(folio_mapcount(folio) < 0); } else if (pmd_is_valid_softleaf(orig_pmd)) { const softleaf_t entry = softleaf_from_pmd(orig_pmd); @@ -2479,7 +2478,6 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (folio_is_device_private(folio)) { folio_remove_rmap_pmd(folio, &folio->page, vma); - WARN_ON_ONCE(folio_mapcount(folio) < 0); folio_put(folio); } From 1c6b7ff60bd477bb73b737e2955c0ad49cffd7ca Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 18:07:26 +0000 Subject: [PATCH 363/369] mm/huge_memory: use mm instead of tlb->mm Reduce the repetition, and lay the ground for further refactorings by keeping this variable separate. 
Link: https://lkml.kernel.org/r/98104cde87e4b2aabeb16f236b8731591594457f.1774029655.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Baolin Wang Reviewed-by: Suren Baghdasaryan Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Qi Zheng Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d7c1c8f55c13..c515e293ed48 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2415,6 +2415,7 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { bool has_deposit = arch_needs_pgtable_deposit(); + struct mm_struct *mm = tlb->mm; struct folio *folio = NULL; bool flush_needed = false; spinlock_t *ptl; @@ -2462,9 +2463,9 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (folio_test_anon(folio)) { has_deposit = true; - add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { - add_mm_counter(tlb->mm, mm_counter_file(folio), + add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR); /* @@ -2483,7 +2484,7 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, out: if (has_deposit) - zap_deposited_table(tlb->mm, pmd); + zap_deposited_table(mm, pmd); spin_unlock(ptl); if (flush_needed) From f87854c9091014207ecb5bc108810ff3e4dbb08f Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 18:07:27 +0000 Subject: [PATCH 364/369] mm/huge_memory: separate out the folio part of zap_huge_pmd() Place the part of the logic that manipulates counters and possibly updates the accessed bit of the folio into its own function to make zap_huge_pmd() more readable. Also rename flush_needed to is_present as we only require a flush for present entries. 
Additionally add comments as to why we're doing what we're doing with respect to softleaf entries. This also lays the ground for further refactoring. Link: https://lkml.kernel.org/r/6c4db67952f5529da4db102a6149b9050b5dda4e.1774029655.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Baolin Wang Reviewed-by: Suren Baghdasaryan Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Qi Zheng Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 61 +++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c515e293ed48..d1e66df05c86 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2402,6 +2402,37 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) mm_dec_nr_ptes(mm); } +static void zap_huge_pmd_folio(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t pmdval, struct folio *folio, bool is_present, + bool *has_deposit) +{ + const bool is_device_private = folio_is_device_private(folio); + + /* Present and device private folios are rmappable. */ + if (is_present || is_device_private) + folio_remove_rmap_pmd(folio, &folio->page, vma); + + if (folio_test_anon(folio)) { + *has_deposit = true; + add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR); + } else { + add_mm_counter(mm, mm_counter_file(folio), + -HPAGE_PMD_NR); + + /* + * Use flush_needed to indicate whether the PMD entry + * is present, instead of checking pmd_present() again. + */ + if (is_present && pmd_young(pmdval) && + likely(vma_has_recency(vma))) + folio_mark_accessed(folio); + } + + /* Device private folios are pinned. */ + if (is_device_private) + folio_put(folio); +} + /** * zap_huge_pmd - Zap a huge THP which is of PMD size. * @tlb: The MMU gather TLB state associated with the operation. 
@@ -2417,7 +2448,7 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, bool has_deposit = arch_needs_pgtable_deposit(); struct mm_struct *mm = tlb->mm; struct folio *folio = NULL; - bool flush_needed = false; + bool is_present = false; spinlock_t *ptl; pmd_t orig_pmd; @@ -2446,14 +2477,11 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (pmd_present(orig_pmd)) { folio = pmd_folio(orig_pmd); - - flush_needed = true; - folio_remove_rmap_pmd(folio, &folio->page, vma); + is_present = true; } else if (pmd_is_valid_softleaf(orig_pmd)) { const softleaf_t entry = softleaf_from_pmd(orig_pmd); folio = softleaf_to_folio(entry); - if (!thp_migration_supported()) WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); } else { @@ -2461,33 +2489,14 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, goto out; } - if (folio_test_anon(folio)) { - has_deposit = true; - add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR); - } else { - add_mm_counter(mm, mm_counter_file(folio), - -HPAGE_PMD_NR); - - /* - * Use flush_needed to indicate whether the PMD entry - * is present, instead of checking pmd_present() again. 
- */ - if (flush_needed && pmd_young(orig_pmd) && - likely(vma_has_recency(vma))) - folio_mark_accessed(folio); - } - - if (folio_is_device_private(folio)) { - folio_remove_rmap_pmd(folio, &folio->page, vma); - folio_put(folio); - } + zap_huge_pmd_folio(mm, vma, orig_pmd, folio, is_present, &has_deposit); out: if (has_deposit) zap_deposited_table(mm, pmd); spin_unlock(ptl); - if (flush_needed) + if (is_present) tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); return true; } From 64b7d889d03ce94940d6dd9440c4e74c1108ac78 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 18:07:28 +0000 Subject: [PATCH 365/369] mm: add softleaf_is_valid_pmd_entry(), pmd_to_softleaf_folio() Separate pmd_is_valid_softleaf() into separate components, then use the pmd_is_valid_softleaf() predicate to implement pmd_to_softleaf_folio(). This returns the folio associated with a softleaf entry at PMD level. It expects this to be valid for a PMD entry. If CONFIG_DEBUG_VM is set, then assert on this being an invalid entry, and either way return NULL in this case. This lays the ground for further refactorings. Link: https://lkml.kernel.org/r/b677592596274fa3fd701890497948e4b0e07cec.1774029655.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Qi Zheng Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/leafops.h | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/include/linux/leafops.h b/include/linux/leafops.h index 05673d3529e7..992cd8bd8ed0 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -607,7 +607,20 @@ static inline bool pmd_is_migration_entry(pmd_t pmd) } /** - * pmd_is_valid_softleaf() - Is this PMD entry a valid leaf entry? 
+ * softleaf_is_valid_pmd_entry() - Is the specified softleaf entry obtained from + * a PMD one that we support at PMD level? + * @entry: Entry to check. + * Returns: true if the softleaf entry is valid at PMD, otherwise false. + */ +static inline bool softleaf_is_valid_pmd_entry(softleaf_t entry) +{ + /* Only device private, migration entries valid for PMD. */ + return softleaf_is_device_private(entry) || + softleaf_is_migration(entry); +} + +/** + * pmd_is_valid_softleaf() - Is this PMD entry a valid softleaf entry? * @pmd: PMD entry. * * PMD leaf entries are valid only if they are device private or migration @@ -620,9 +633,27 @@ static inline bool pmd_is_valid_softleaf(pmd_t pmd) { const softleaf_t entry = softleaf_from_pmd(pmd); - /* Only device private, migration entries valid for PMD. */ - return softleaf_is_device_private(entry) || - softleaf_is_migration(entry); + return softleaf_is_valid_pmd_entry(entry); +} + +/** + * pmd_to_softleaf_folio() - Convert the PMD entry to a folio. + * @pmd: PMD entry. + * + * The PMD entry is expected to be a valid PMD softleaf entry. + * + * Returns: the folio the softleaf entry references if this is a valid softleaf + * entry, otherwise NULL. + */ +static inline struct folio *pmd_to_softleaf_folio(pmd_t pmd) +{ + const softleaf_t entry = softleaf_from_pmd(pmd); + + if (!softleaf_is_valid_pmd_entry(entry)) { + VM_WARN_ON_ONCE(true); + return NULL; + } + return softleaf_to_folio(entry); } #endif /* CONFIG_MMU */ From d80a9cb1a64ab9c817b6262c7e4e433b6a3581a0 Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 18:07:29 +0000 Subject: [PATCH 366/369] mm/huge_memory: add and use normal_or_softleaf_folio_pmd() Now we have pmd_to_softleaf_folio() available to us which also raises a CONFIG_DEBUG_VM warning if unexpectedly an invalid softleaf entry, we can now abstract folio handling altogether. 
vm_normal_folio() deals with the huge zero page (which is present), as well as PFN map/mixed map mappings in both cases returning NULL. Otherwise, we try to obtain the softleaf folio. This makes the logic far easier to comprehend and has it use the standard vm_normal_folio_pmd() path for decoding of present entries. Finally, we have to update the flushing logic to only do so if a folio is established. This patch also makes the 'is_present' value more accurate - because PFN map, mixed map and zero huge pages are present, just not present and 'normal'. [ljs@kernel.org: avoid bisection hazard] Link: https://lkml.kernel.org/r/d0cc6161-77a4-42ba-a411-96c23c78df1b@lucifer.local Link: https://lkml.kernel.org/r/c2be872d64ef9573b80727d9ab5446cf002f17b5.1774029655.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Qi Zheng Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 47 +++++++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 28 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d1e66df05c86..ade49830c699 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2419,10 +2419,6 @@ static void zap_huge_pmd_folio(struct mm_struct *mm, struct vm_area_struct *vma, add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR); - /* - * Use flush_needed to indicate whether the PMD entry - * is present, instead of checking pmd_present() again. 
- */ if (is_present && pmd_young(pmdval) && likely(vma_has_recency(vma))) folio_mark_accessed(folio); @@ -2433,6 +2429,17 @@ static void zap_huge_pmd_folio(struct mm_struct *mm, struct vm_area_struct *vma, folio_put(folio); } +static struct folio *normal_or_softleaf_folio_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmdval, bool is_present) +{ + if (is_present) + return vm_normal_folio_pmd(vma, addr, pmdval); + + if (!thp_migration_supported()) + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + return pmd_to_softleaf_folio(pmdval); +} + /** * zap_huge_pmd - Zap a huge THP which is of PMD size. * @tlb: The MMU gather TLB state associated with the operation. @@ -2467,36 +2474,20 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb->fullmm); arch_check_zapped_pmd(vma, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); - if (vma_is_special_huge(vma)) - goto out; - if (is_huge_zero_pmd(orig_pmd)) { - if (!vma_is_dax(vma)) - has_deposit = true; - goto out; - } - if (pmd_present(orig_pmd)) { - folio = pmd_folio(orig_pmd); - is_present = true; - } else if (pmd_is_valid_softleaf(orig_pmd)) { - const softleaf_t entry = softleaf_from_pmd(orig_pmd); + is_present = pmd_present(orig_pmd); + folio = normal_or_softleaf_folio_pmd(vma, addr, orig_pmd, is_present); + if (folio) + zap_huge_pmd_folio(mm, vma, orig_pmd, folio, is_present, + &has_deposit); + else if (is_huge_zero_pmd(orig_pmd)) + has_deposit = has_deposit || !vma_is_dax(vma); - folio = softleaf_to_folio(entry); - if (!thp_migration_supported()) - WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); - } else { - WARN_ON_ONCE(true); - goto out; - } - - zap_huge_pmd_folio(mm, vma, orig_pmd, folio, is_present, &has_deposit); - -out: if (has_deposit) zap_deposited_table(mm, pmd); spin_unlock(ptl); - if (is_present) + if (is_present && folio) tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); return true; } From 
bf263bcaf61f6e921c458ce4c5a9100192447c8c Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 20 Mar 2026 18:07:30 +0000 Subject: [PATCH 367/369] mm/huge_memory: add and use has_deposited_pgtable() Rather than thread has_deposited through zap_huge_pmd(), make things clearer by adding has_deposited_pgtable() with comments describing why in each case. [ljs@kernel.org: fix folio_put()-before-recheck issue, per Sashiko] Link: https://lkml.kernel.org/r/0a917f80-902f-49b0-a75f-1bbaf23d7f94@lucifer.local Link: https://lkml.kernel.org/r/f9db59ca90937e39913d50ecb4f662e2bad17bbb.1774029655.git.ljs@kernel.org Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Qi Zheng Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ade49830c699..745eb3d0d4a7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2403,8 +2403,7 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) } static void zap_huge_pmd_folio(struct mm_struct *mm, struct vm_area_struct *vma, - pmd_t pmdval, struct folio *folio, bool is_present, - bool *has_deposit) + pmd_t pmdval, struct folio *folio, bool is_present) { const bool is_device_private = folio_is_device_private(folio); @@ -2413,7 +2412,6 @@ static void zap_huge_pmd_folio(struct mm_struct *mm, struct vm_area_struct *vma, folio_remove_rmap_pmd(folio, &folio->page, vma); if (folio_test_anon(folio)) { - *has_deposit = true; add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { add_mm_counter(mm, mm_counter_file(folio), @@ -2440,6 +2438,27 @@ static struct folio *normal_or_softleaf_folio_pmd(struct vm_area_struct *vma, return pmd_to_softleaf_folio(pmdval); } +static bool 
has_deposited_pgtable(struct vm_area_struct *vma, pmd_t pmdval, + struct folio *folio) +{ + /* Some architectures require unconditional depositing. */ + if (arch_needs_pgtable_deposit()) + return true; + + /* + * Huge zero always deposited except for DAX which handles itself, see + * set_huge_zero_folio(). + */ + if (is_huge_zero_pmd(pmdval)) + return !vma_is_dax(vma); + + /* + * Otherwise, only anonymous folios are deposited, see + * __do_huge_pmd_anonymous_page(). + */ + return folio && folio_test_anon(folio); +} + /** * zap_huge_pmd - Zap a huge THP which is of PMD size. * @tlb: The MMU gather TLB state associated with the operation. @@ -2452,10 +2471,10 @@ static struct folio *normal_or_softleaf_folio_pmd(struct vm_area_struct *vma, bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { - bool has_deposit = arch_needs_pgtable_deposit(); struct mm_struct *mm = tlb->mm; struct folio *folio = NULL; bool is_present = false; + bool has_deposit; spinlock_t *ptl; pmd_t orig_pmd; @@ -2477,12 +2496,9 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, is_present = pmd_present(orig_pmd); folio = normal_or_softleaf_folio_pmd(vma, addr, orig_pmd, is_present); + has_deposit = has_deposited_pgtable(vma, orig_pmd, folio); if (folio) - zap_huge_pmd_folio(mm, vma, orig_pmd, folio, is_present, - &has_deposit); - else if (is_huge_zero_pmd(orig_pmd)) - has_deposit = has_deposit || !vma_is_dax(vma); - + zap_huge_pmd_folio(mm, vma, orig_pmd, folio, is_present); if (has_deposit) zap_deposited_table(mm, pmd); From 5a62019807da4a7add0717c962ac83b23dd12b2c Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Tue, 31 Mar 2026 13:11:18 +0100 Subject: [PATCH 368/369] mm/khugepaged: fix issue with tracking lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We are incorrectly treating lock_dropped to track both whether the lock is currently held and whether or not the 
lock was ever dropped. Update this change to account for this. Link: https://lkml.kernel.org/r/7760c811-e100-4d40-9217-0813c28314be@lucifer.local Fixes: 330f3758a3bc ("mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()") Signed-off-by: Lorenzo Stoakes (Oracle) Reviewed-by: Lance Yang Reviewed-by: Nico Pache Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Baolin Wang Cc: Barry Song Cc: Brendan Jackman Cc: Byungchul Park Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Rientjes Cc: Dev Jain Cc: Gregory Price Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Joshua Hahn Cc: Kefeng Wang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Matthew Brost Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Nanyong Sun Cc: Pedro Falcato Cc: Peter Xu Cc: Rafael Aquini Cc: Rakie Kim Cc: Randy Dunlap Cc: Ryan Roberts Cc: Shivank Garg Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Takashi Iwai (SUSE) Cc: Thomas Hellström Cc: Usama Arif Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Wei Yang Cc: Will Deacon Cc: Yang Shi Cc: Zach O'Keefe Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d21348b85a59..b8452dbdb043 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2828,6 +2828,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, unsigned long hstart, hend, addr; enum scan_result last_fail = SCAN_FAIL; int thps = 0; + bool mmap_unlocked = false; BUG_ON(vma->vm_start > start); BUG_ON(vma->vm_end < end); @@ -2850,10 +2851,11 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { enum scan_result result = SCAN_FAIL; - if (*lock_dropped) { + if (mmap_unlocked) { cond_resched(); mmap_read_lock(mm); - *lock_dropped = 
false; + mmap_unlocked = false; + *lock_dropped = true; result = hugepage_vma_revalidate(mm, addr, false, &vma, cc); if (result != SCAN_SUCCEED) { @@ -2864,7 +2866,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); } - result = collapse_single_pmd(addr, vma, lock_dropped, cc); + result = collapse_single_pmd(addr, vma, &mmap_unlocked, cc); switch (result) { case SCAN_SUCCEED: @@ -2893,8 +2895,10 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, out_maybelock: /* Caller expects us to hold mmap_lock on return */ - if (*lock_dropped) + if (mmap_unlocked) { + *lock_dropped = true; mmap_read_lock(mm); + } out_nolock: mmap_assert_locked(mm); mmdrop(mm); From 3bac01168982ec3e3bf87efdc1807c7933590a85 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Wed, 1 Apr 2026 21:10:32 +0800 Subject: [PATCH 369/369] mm: fix deferred split queue races during migration migrate_folio_move() records the deferred split queue state from src and replays it on dst. Replaying it after remove_migration_ptes(src, dst, 0) makes dst visible before it is requeued, so a concurrent rmap-removal path can mark dst partially mapped and trip the WARN in deferred_split_folio(). Move the requeue before remove_migration_ptes() so dst is back on the deferred split queue before it becomes visible again. Because migration still holds dst locked at that point, teach deferred_split_scan() to requeue a folio when folio_trylock() fails. Otherwise a fully mapped underused folio can be dequeued by the shrinker and silently lost from split_queue. 
[ziy@nvidia.com: move the comment] Link: https://lkml.kernel.org/r/FB71A764-0F10-4E5A-B4A0-BA4C7F138408@nvidia.com Link: https://syzkaller.appspot.com/bug?extid=a7067a757858ac8eb085 Link: https://lkml.kernel.org/r/20260401131032.13011-1-lance.yang@linux.dev Fixes: 8a8ca142a488 ("mm: migrate: requeue destination folio on deferred split queue") Signed-off-by: Lance Yang Signed-off-by: Zi Yan Reported-by: syzbot+a7067a757858ac8eb085@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-mm/69ccb65b.050a0220.183828.003a.GAE@google.com/ Suggested-by: David Hildenbrand (Arm) Acked-by: David Hildenbrand (Arm) Acked-by: Zi Yan Cc: Alistair Popple Cc: Baolin Wang Cc: Barry Song Cc: Byungchul Park Cc: David Hildenbrand Cc: Deepanshu Kartikey Cc: Dev Jain Cc: Gregory Price Cc: "Huang, Ying" Cc: Joshua Hahn Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Matthew Brost Cc: Nico Pache Cc: Rakie Kim Cc: Ryan Roberts Cc: Wei Yang Cc: Ying Huang Cc: Usama Arif Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 15 ++++++++++----- mm/migrate.c | 18 +++++++++--------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 745eb3d0d4a7..42c983821c03 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -4542,7 +4542,7 @@ retry: goto next; } if (!folio_trylock(folio)) - goto next; + goto requeue; if (!split_folio(folio)) { did_split = true; if (underused) @@ -4551,13 +4551,18 @@ retry: } folio_unlock(folio); next: + /* + * If thp_underused() returns false, or if split_folio() + * succeeds, or if split_folio() fails in the case it was + * underused, then consider it used and don't add it back to + * split_queue. + */ if (did_split || !folio_test_partially_mapped(folio)) continue; +requeue: /* - * Only add back to the queue if folio is partially mapped. 
- * If thp_underused returns false, or if split_folio fails - * in the case it was underused, then consider it used and - * don't add it back to split_queue. + * Add back partially mapped folios, or underused folios that + * we could not lock this round. */ fqueue = folio_split_queue_lock_irqsave(folio, &flags); if (list_empty(&folio->_deferred_list)) { diff --git a/mm/migrate.c b/mm/migrate.c index 4241eb6eca00..76142a02192b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1383,6 +1383,15 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, if (rc) goto out; + /* + * Requeue the destination folio on the deferred split queue if + * the source was on the queue. The source is unqueued in + * __folio_migrate_mapping(), so we recorded the state from + * before move_to_new_folio(). + */ + if (src_deferred_split) + deferred_split_folio(dst, src_partially_mapped); + /* * When successful, push dst to LRU immediately: so that if it * turns out to be an mlocked page, remove_migration_ptes() will @@ -1399,15 +1408,6 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, if (old_page_state & PAGE_WAS_MAPPED) remove_migration_ptes(src, dst, 0); - /* - * Requeue the destination folio on the deferred split queue if - * the source was on the queue. The source is unqueued in - * __folio_migrate_mapping(), so we recorded the state from - * before move_to_new_folio(). - */ - if (src_deferred_split) - deferred_split_folio(dst, src_partially_mapped); - out_unlock_both: folio_unlock(dst); folio_set_owner_migrate_reason(dst, reason);