mm, swap: use the swap table to track the swap count

Now that all the infrastructure is ready, switch to using the swap table
only.  This is unfortunately a large patch because the whole old counting
mechanism, especially SWP_CONTINUED, has to be removed and replaced by the
new mechanism in a single step, with no intermediate stages available.

The swap table is capable of holding up to SWP_TB_COUNT_MAX - 1 counts in
the higher bits of each table entry, so using that, the swap_map can be
completely dropped.

swap_map also had a limit of SWAP_CONT_MAX.  Any value beyond that limit
required a COUNT_CONTINUED page.  COUNT_CONTINUED is complex to
maintain, so the swap table uses a simpler approach: when the
count goes beyond SWP_TB_COUNT_MAX - 1, the cluster gets an
extend_table allocated, which is a swap cluster-sized array of unsigned
int.  The counting is offloaded there until the count drops
below SWP_TB_COUNT_MAX again.

Both the swap table and the extend table are cluster-based, so they
exhibit good performance and sparsity.

To make the switch from swap_map to swap table clean, this commit cleans
up and introduces a new set of functions based on the swap table design,
for manipulating swap counts:

- __swap_cluster_dup_entry, __swap_cluster_put_entry,
  __swap_cluster_alloc_entry, __swap_cluster_free_entry:

  Increase/decrease the count of a swap slot, or allocate / free a swap
  slot.  These are the internal routines that do the counting work based
  on the swap table and handle all the complexities.  The caller must
  lock the cluster before calling them.

  All swap count-related update operations are wrapped by these four
  helpers.

- swap_dup_entries_cluster, swap_put_entries_cluster:

  Increase/decrease the swap count of one or a set of swap slots in the
  same cluster range. These two helpers serve as the common routines for
  folio_dup_swap & swap_dup_entry_direct, or
  folio_put_swap & swap_put_entries_direct.

All existing callers are converted to use these helpers.  This greatly
simplifies the count tracking, and the swap_map is gone.

[ryncsn@gmail.com: fix build]
  Link: https://lkml.kernel.org/r/aZWuLZi-vYi3vAWe@KASONG-MC4
Link: https://lkml.kernel.org/r/20260218-swap-table-p3-v3-9-f4e34be021a7@tencent.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Suggested-by: Chris Li <chrisl@kernel.org>
Acked-by: Chris Li <chrisl@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
Kairui Song
2026-02-18 04:06:34 +08:00
committed by Andrew Morton
parent 5dc533f7aa
commit 0d6af9bcf3
6 changed files with 336 additions and 560 deletions

View File

@@ -208,7 +208,6 @@ enum {
SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */
SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */
SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */
SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */
SWP_BLKDEV = (1 << 6), /* its a block device */
SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */
SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */
@@ -223,16 +222,6 @@ enum {
#define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10)
#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
/* Bit flag in swap_map */
#define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */
/* Special value in first swap_map */
#define SWAP_MAP_MAX 0x3e /* Max count */
#define SWAP_MAP_BAD 0x3f /* Note page is bad */
/* Special value in each swap_map continuation */
#define SWAP_CONT_MAX 0x7f /* Max count */
/*
* The first page in the swap file is the swap header, which is always marked
* bad to prevent it from being allocated as an entry. This also prevents the
@@ -264,8 +253,7 @@ struct swap_info_struct {
signed short prio; /* swap priority of this type */
struct plist_node list; /* entry in swap_active_head */
signed char type; /* strange name for an index */
unsigned int max; /* extent of the swap_map */
unsigned char *swap_map; /* vmalloc'ed array of usage counts */
unsigned int max; /* size of this swap device */
unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */
struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
struct list_head free_clusters; /* free clusters list */
@@ -284,18 +272,14 @@ struct swap_info_struct {
struct completion comp; /* seldom referenced */
spinlock_t lock; /*
* protect map scan related fields like
* swap_map, inuse_pages and all cluster
* lists. other fields are only changed
* inuse_pages and all cluster lists.
* Other fields are only changed
* at swapon/swapoff, so are protected
* by swap_lock. changing flags need
* hold this lock and swap_lock. If
* both locks need hold, hold swap_lock
* first.
*/
spinlock_t cont_lock; /*
* protect swap count continuation page
* list.
*/
struct work_struct discard_work; /* discard worker */
struct work_struct reclaim_work; /* reclaim worker */
struct list_head discard_clusters; /* discard clusters list */
@@ -451,7 +435,6 @@ static inline long get_nr_swap_pages(void)
}
extern void si_swapinfo(struct sysinfo *);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
int swap_type_of(dev_t device, sector_t offset);
int find_first_swap(dev_t *device);
extern unsigned int count_swap_pages(int, int);
@@ -517,11 +500,6 @@ static inline void free_swap_cache(struct folio *folio)
{
}
static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
{
return 0;
}
static inline int swap_dup_entry_direct(swp_entry_t ent)
{
return 0;

View File

@@ -1346,7 +1346,7 @@ again:
if (ret == -EIO) {
VM_WARN_ON_ONCE(!entry.val);
if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
if (swap_retry_table_alloc(entry, GFP_KERNEL) < 0) {
ret = -ENOMEM;
goto out;
}

View File

@@ -37,6 +37,7 @@ struct swap_cluster_info {
u8 flags;
u8 order;
atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */
unsigned int *extend_table; /* For large swap count, protected by ci->lock */
struct list_head list;
};
@@ -183,6 +184,8 @@ static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci)
spin_unlock_irq(&ci->lock);
}
extern int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp);
/*
* Below are the core routines for doing swap for a folio.
* All helpers requires the folio to be locked, and a locked folio
@@ -206,9 +209,9 @@ int folio_dup_swap(struct folio *folio, struct page *subpage);
void folio_put_swap(struct folio *folio, struct page *subpage);
/* For internal use */
extern void swap_entries_free(struct swap_info_struct *si,
struct swap_cluster_info *ci,
unsigned long offset, unsigned int nr_pages);
extern void __swap_cluster_free_entries(struct swap_info_struct *si,
struct swap_cluster_info *ci,
unsigned int ci_off, unsigned int nr_pages);
/* linux/mm/page_io.c */
int sio_pool_init(void);
@@ -446,6 +449,11 @@ static inline int swap_writeout(struct folio *folio,
return 0;
}
static inline int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp)
{
return -EINVAL;
}
static inline bool swap_cache_has_folio(swp_entry_t entry)
{
return false;

View File

@@ -140,21 +140,20 @@ void *swap_cache_get_shadow(swp_entry_t entry)
void __swap_cache_add_folio(struct swap_cluster_info *ci,
struct folio *folio, swp_entry_t entry)
{
unsigned long new_tb;
unsigned int ci_start, ci_off, ci_end;
unsigned int ci_off = swp_cluster_offset(entry), ci_end;
unsigned long nr_pages = folio_nr_pages(folio);
unsigned long pfn = folio_pfn(folio);
unsigned long old_tb;
VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);
VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio);
new_tb = folio_to_swp_tb(folio, 0);
ci_start = swp_cluster_offset(entry);
ci_off = ci_start;
ci_end = ci_start + nr_pages;
ci_end = ci_off + nr_pages;
do {
VM_WARN_ON_ONCE(swp_tb_is_folio(__swap_table_get(ci, ci_off)));
__swap_table_set(ci, ci_off, new_tb);
old_tb = __swap_table_get(ci, ci_off);
VM_WARN_ON_ONCE(swp_tb_is_folio(old_tb));
__swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb)));
} while (++ci_off < ci_end);
folio_ref_add(folio, nr_pages);
@@ -183,14 +182,13 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
unsigned long old_tb;
struct swap_info_struct *si;
struct swap_cluster_info *ci;
unsigned int ci_start, ci_off, ci_end, offset;
unsigned int ci_start, ci_off, ci_end;
unsigned long nr_pages = folio_nr_pages(folio);
si = __swap_entry_to_info(entry);
ci_start = swp_cluster_offset(entry);
ci_end = ci_start + nr_pages;
ci_off = ci_start;
offset = swp_offset(entry);
ci = swap_cluster_lock(si, swp_offset(entry));
if (unlikely(!ci->table)) {
err = -ENOENT;
@@ -202,13 +200,12 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
err = -EEXIST;
goto failed;
}
if (unlikely(!__swap_count(swp_entry(swp_type(entry), offset)))) {
if (unlikely(!__swp_tb_get_count(old_tb))) {
err = -ENOENT;
goto failed;
}
if (swp_tb_is_shadow(old_tb))
shadow = swp_tb_to_shadow(old_tb);
offset++;
} while (++ci_off < ci_end);
__swap_cache_add_folio(ci, folio, entry);
swap_cluster_unlock(ci);
@@ -237,8 +234,9 @@ failed:
void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
swp_entry_t entry, void *shadow)
{
int count;
unsigned long old_tb;
struct swap_info_struct *si;
unsigned long old_tb, new_tb;
unsigned int ci_start, ci_off, ci_end;
bool folio_swapped = false, need_free = false;
unsigned long nr_pages = folio_nr_pages(folio);
@@ -249,20 +247,20 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio);
si = __swap_entry_to_info(entry);
new_tb = shadow_to_swp_tb(shadow, 0);
ci_start = swp_cluster_offset(entry);
ci_end = ci_start + nr_pages;
ci_off = ci_start;
do {
/* If shadow is NULL, we sets an empty shadow */
old_tb = __swap_table_xchg(ci, ci_off, new_tb);
old_tb = __swap_table_get(ci, ci_off);
WARN_ON_ONCE(!swp_tb_is_folio(old_tb) ||
swp_tb_to_folio(old_tb) != folio);
if (__swap_count(swp_entry(si->type,
swp_offset(entry) + ci_off - ci_start)))
count = __swp_tb_get_count(old_tb);
if (count)
folio_swapped = true;
else
need_free = true;
/* If shadow is NULL, we sets an empty shadow. */
__swap_table_set(ci, ci_off, shadow_to_swp_tb(shadow, count));
} while (++ci_off < ci_end);
folio->swap.val = 0;
@@ -271,13 +269,13 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages);
if (!folio_swapped) {
swap_entries_free(si, ci, swp_offset(entry), nr_pages);
__swap_cluster_free_entries(si, ci, ci_start, nr_pages);
} else if (need_free) {
ci_off = ci_start;
do {
if (!__swap_count(entry))
swap_entries_free(si, ci, swp_offset(entry), 1);
entry.val++;
} while (--nr_pages);
if (!__swp_tb_get_count(__swap_table_get(ci, ci_off)))
__swap_cluster_free_entries(si, ci, ci_off, 1);
} while (++ci_off < ci_end);
}
}
@@ -324,17 +322,18 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci,
unsigned long nr_pages = folio_nr_pages(new);
unsigned int ci_off = swp_cluster_offset(entry);
unsigned int ci_end = ci_off + nr_pages;
unsigned long old_tb, new_tb;
unsigned long pfn = folio_pfn(new);
unsigned long old_tb;
VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new));
VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new));
VM_WARN_ON_ONCE(!entry.val);
/* Swap cache still stores N entries instead of a high-order entry */
new_tb = folio_to_swp_tb(new, 0);
do {
old_tb = __swap_table_xchg(ci, ci_off, new_tb);
old_tb = __swap_table_get(ci, ci_off);
WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old);
__swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb)));
} while (++ci_off < ci_end);
/*
@@ -368,7 +367,7 @@ void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents)
ci_end = ci_off + nr_ents;
do {
old = __swap_table_xchg(ci, ci_off, null_to_swp_tb());
WARN_ON_ONCE(swp_tb_is_folio(old));
WARN_ON_ONCE(swp_tb_is_folio(old) || swp_tb_get_count(old));
} while (++ci_off < ci_end);
}

View File

@@ -191,6 +191,11 @@ static inline int swp_tb_get_count(unsigned long swp_tb)
return -EINVAL;
}
static inline unsigned long __swp_tb_mk_count(unsigned long swp_tb, int count)
{
return ((swp_tb & ~SWP_TB_COUNT_MASK) | __count_to_swp_tb(count));
}
/*
* Helpers for accessing or modifying the swap table of a cluster,
* the swap cluster must be locked.

File diff suppressed because it is too large Load Diff