mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 06:44:00 -04:00
powerpc uses pt_frag_refcount as a reference counter for tracking its
pte and pmd page table fragments. For PTE table, in case of Hash with
64K pagesize, we have 16 fragments of 4K size in one 64K page.
Patch series [1] "mm: free retracted page table by RCU"
added pte_free_defer() to defer the freeing of PTE tables when
retract_page_tables() is called for madvise MADV_COLLAPSE on shmem
range.
[1]: https://lore.kernel.org/all/7cd843a9-aa80-14f-5eb2-33427363c20@google.com/
pte_free_defer() sets the active flag on the corresponding fragment's
folio & calls pte_fragment_free(), which reduces the pt_frag_refcount.
When pt_frag_refcount reaches 0 (no active fragment using the folio), it
checks whether the folio active flag is set: if it is set, it calls call_rcu()
to free the folio; if the active flag is unset, it calls pte_free_now().
Now, this can lead to following problem in a corner case...
[ 265.351553][ T183] BUG: Bad page state in process a.out pfn:20d62
[ 265.353555][ T183] page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x20d62
[ 265.355457][ T183] flags: 0x3ffff800000100(active|node=0|zone=0|lastcpupid=0x7ffff)
[ 265.358719][ T183] raw: 003ffff800000100 0000000000000000 5deadbeef0000122 0000000000000000
[ 265.360177][ T183] raw: 0000000000000000 c0000000119caf58 00000000ffffffff 0000000000000000
[ 265.361438][ T183] page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set
[ 265.362572][ T183] Modules linked in:
[ 265.364622][ T183] CPU: 0 UID: 0 PID: 183 Comm: a.out Not tainted 6.18.0-rc3-00141-g1ddeaaace7ff-dirty #53 VOLUNTARY
[ 265.364785][ T183] Hardware name: IBM pSeries (emulated by qemu) POWER10 (architected) 0x801200 0xf000006 of:SLOF,git-ee03ae pSeries
[ 265.364908][ T183] Call Trace:
[ 265.364955][ T183] [c000000011e6f7c0] [c000000001cfaa18] dump_stack_lvl+0x130/0x148 (unreliable)
[ 265.365202][ T183] [c000000011e6f7f0] [c000000000794758] bad_page+0xb4/0x1c8
[ 265.365384][ T183] [c000000011e6f890] [c00000000079c020] __free_frozen_pages+0x838/0xd08
[ 265.365554][ T183] [c000000011e6f980] [c0000000000a70ac] pte_frag_destroy+0x298/0x310
[ 265.365729][ T183] [c000000011e6fa30] [c0000000000aa764] arch_exit_mmap+0x34/0x218
[ 265.365912][ T183] [c000000011e6fa80] [c000000000751698] exit_mmap+0xb8/0x820
[ 265.366080][ T183] [c000000011e6fc30] [c0000000001b1258] __mmput+0x98/0x300
[ 265.366244][ T183] [c000000011e6fc80] [c0000000001c81f8] do_exit+0x470/0x1508
[ 265.366421][ T183] [c000000011e6fd70] [c0000000001c95e4] do_group_exit+0x88/0x148
[ 265.366602][ T183] [c000000011e6fdc0] [c0000000001c96ec] pid_child_should_wake+0x0/0x178
[ 265.366780][ T183] [c000000011e6fdf0] [c00000000003a270] system_call_exception+0x1b0/0x4e0
[ 265.366958][ T183] [c000000011e6fe50] [c00000000000d05c] system_call_vectored_common+0x15c/0x2ec
The bad page state error occurs when such a folio gets freed (with
active flag set), from do_exit() path in parallel.
... this can happen when the pte fragment was allocated from this folio,
but when all the fragments get freed, the pte_frag_refcount still had some
unused fragments. Now, if this process exits with such a folio as its cached
pte_frag in mm->context, then during pte_frag_destroy() we simply call
pagetable_dtor() and pagetable_free(), meaning it doesn't clear the
active flag. This can lead to the above bug. Since we are in the
do_exit() path anyway, if the refcount is 0, I guess it should be
ok to simply clear the folio active flag before calling pagetable_dtor()
& pagetable_free().
Fixes: 32cc0b7c9d ("powerpc: add pte_free_defer() for pgtables sharing page")
Reviewed-by: Christophe Leroy (CS GROUP) <chleroy@kernel.org>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com>
Link: https://patch.msgid.link/ee13e7f99b8f258019da2b37655b998e73e5ef8b.1773078178.git.ritesh.list@gmail.com
143 lines
3.2 KiB
C
143 lines
3.2 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
/*
|
|
* Handling Page Tables through page fragments
|
|
*
|
|
*/
|
|
|
|
#include <linux/kernel.h>
|
|
#include <linux/gfp.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/percpu.h>
|
|
#include <linux/hardirq.h>
|
|
#include <linux/hugetlb.h>
|
|
#include <asm/pgalloc.h>
|
|
#include <asm/tlbflush.h>
|
|
#include <asm/tlb.h>
|
|
|
|
void pte_frag_destroy(void *pte_frag)
|
|
{
|
|
int count;
|
|
struct ptdesc *ptdesc;
|
|
|
|
ptdesc = virt_to_ptdesc(pte_frag);
|
|
/* drop all the pending references */
|
|
count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
|
|
/* We allow PTE_FRAG_NR fragments from a PTE page */
|
|
if (atomic_sub_and_test(PTE_FRAG_NR - count, &ptdesc->pt_frag_refcount)) {
|
|
folio_clear_active(ptdesc_folio(ptdesc));
|
|
pagetable_dtor(ptdesc);
|
|
pagetable_free(ptdesc);
|
|
}
|
|
}
|
|
|
|
static pte_t *get_pte_from_cache(struct mm_struct *mm)
|
|
{
|
|
void *pte_frag, *ret;
|
|
|
|
if (PTE_FRAG_NR == 1)
|
|
return NULL;
|
|
|
|
spin_lock(&mm->page_table_lock);
|
|
ret = pte_frag_get(&mm->context);
|
|
if (ret) {
|
|
pte_frag = ret + PTE_FRAG_SIZE;
|
|
/*
|
|
* If we have taken up all the fragments mark PTE page NULL
|
|
*/
|
|
if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
|
|
pte_frag = NULL;
|
|
pte_frag_set(&mm->context, pte_frag);
|
|
}
|
|
spin_unlock(&mm->page_table_lock);
|
|
return (pte_t *)ret;
|
|
}
|
|
|
|
static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel)
|
|
{
|
|
void *ret = NULL;
|
|
struct ptdesc *ptdesc;
|
|
gfp_t gfp = PGALLOC_GFP;
|
|
|
|
if (!kernel)
|
|
gfp |= __GFP_ACCOUNT;
|
|
|
|
ptdesc = pagetable_alloc(gfp, 0);
|
|
if (!ptdesc)
|
|
return NULL;
|
|
if (!pagetable_pte_ctor(mm, ptdesc)) {
|
|
pagetable_free(ptdesc);
|
|
return NULL;
|
|
}
|
|
|
|
atomic_set(&ptdesc->pt_frag_refcount, 1);
|
|
|
|
ret = ptdesc_address(ptdesc);
|
|
/*
|
|
* if we support only one fragment just return the
|
|
* allocated page.
|
|
*/
|
|
if (PTE_FRAG_NR == 1)
|
|
return ret;
|
|
spin_lock(&mm->page_table_lock);
|
|
/*
|
|
* If we find ptdesc_page set, we return
|
|
* the allocated page with single fragment
|
|
* count.
|
|
*/
|
|
if (likely(!pte_frag_get(&mm->context))) {
|
|
atomic_set(&ptdesc->pt_frag_refcount, PTE_FRAG_NR);
|
|
pte_frag_set(&mm->context, ret + PTE_FRAG_SIZE);
|
|
}
|
|
spin_unlock(&mm->page_table_lock);
|
|
|
|
return (pte_t *)ret;
|
|
}
|
|
|
|
/*
 * Return a PTE fragment for @mm: the cached one when available, otherwise
 * the first fragment of a newly allocated PTE page (NULL on failure).
 */
pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel)
{
	pte_t *cached = get_pte_from_cache(mm);

	return cached ? cached : __alloc_for_ptecache(mm, kernel);
}
|
|
|
|
static void pte_free_now(struct rcu_head *head)
|
|
{
|
|
struct ptdesc *ptdesc;
|
|
|
|
ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
|
|
pagetable_dtor(ptdesc);
|
|
pagetable_free(ptdesc);
|
|
}
|
|
|
|
void pte_fragment_free(unsigned long *table, int kernel)
|
|
{
|
|
struct ptdesc *ptdesc = virt_to_ptdesc(table);
|
|
|
|
if (pagetable_is_reserved(ptdesc))
|
|
return free_reserved_ptdesc(ptdesc);
|
|
|
|
BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
|
|
if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
|
|
if (kernel || !folio_test_clear_active(ptdesc_folio(ptdesc)))
|
|
pte_free_now(&ptdesc->pt_rcu_head);
|
|
else
|
|
call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
|
|
}
|
|
}
|
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
/*
 * Release the fragment @pgtable with the folio's active flag set, so that
 * pte_fragment_free() frees the page through call_rcu() once its last
 * fragment reference is dropped.
 */
void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
{
	folio_set_active(virt_to_folio(pgtable));
	pte_fragment_free((unsigned long *)pgtable, 0);
}
|
|
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|