mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 14:53:58 -04:00
mm/mglru: rework workingset protection
With the aging feedback no longer considering the distribution of folios
in each generation, rework workingset protection to better distribute
folios across MAX_NR_GENS. This is achieved by reusing PG_workingset and
PG_referenced/LRU_REFS_FLAGS in a slightly different way.
For folios accessed multiple times through file descriptors, make
lru_gen_inc_refs() set additional bits of LRU_REFS_WIDTH in folio->flags
after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all
its bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is
lazily promoted into the second oldest generation in the eviction path.
And when folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that
lru_gen_inc_refs() can start over. For this case, LRU_REFS_MASK is only
valid when PG_referenced is set.
For folios accessed multiple times through page tables, folio_update_gen()
from a page table walk or lru_gen_set_refs() from a rmap walk sets
PG_referenced after the accessed bit is cleared for the first time.
Thereafter, those two paths set PG_workingset and promote folios to the
youngest generation. Like folio_inc_gen(), when folio_update_gen() does
that, it also clears PG_referenced. For this case, LRU_REFS_MASK is not
used.
In both cases, after PG_workingset is set on a folio, it remains set
until this folio is either reclaimed, or "deactivated" by
lru_gen_clear_refs(). It can be set again if lru_gen_test_recent()
returns true upon a refault.
When adding folios to the LRU lists, lru_gen_folio_seq() distributes
them as follows:
+---------------------------------+---------------------------------+
|    Accessed thru page tables    | Accessed thru file descriptors  |
+---------------------------------+---------------------------------+
| PG_active (set while isolated)  |                                 |
+----------------+----------------+----------------+----------------+
|  PG_workingset |  PG_referenced |  PG_workingset | LRU_REFS_FLAGS |
+---------------------------------+---------------------------------+
|<--------- MIN_NR_GENS --------->|                                 |
|<-------------------------- MAX_NR_GENS -------------------------->|
After this patch, some typical client and server workloads showed
improvements under heavy memory pressure. For example, Python TPC-C,
which was used to benchmark a different approach [1] to better detect
refault distances, showed a significant decrease in total refaults:
                            Before       After        Change
Time (seconds)              10801        10801        0%
Executed (transactions)     41472        43663        +5%
workingset_nodes            109070       120244       +10%
workingset_refault_anon     5019627      7281831      +45%
workingset_refault_file     1294678786   554855564    -57%
workingset_refault_total    1299698413   562137395    -57%
[1] https://lore.kernel.org/20230920190244.16839-1-ryncsn@gmail.com/
Link: https://lkml.kernel.org/r/20241231043538.4075764-7-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: Kairui Song <kasong@tencent.com>
Closes: https://lore.kernel.org/CAOUHufahuWcKf5f1Sg3emnqX+cODuR=2TQo7T4Gr-QYLujn4RA@mail.gmail.com/
Tested-by: Kalesh Singh <kaleshsingh@google.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Bharata B Rao <bharata@amd.com>
Cc: David Stevens <stevensd@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
@@ -133,31 +133,25 @@ static inline int lru_hist_from_seq(unsigned long seq)
|
||||
return seq % NR_HIST_GENS;
|
||||
}
|
||||
|
||||
static inline int lru_tier_from_refs(int refs)
|
||||
static inline int lru_tier_from_refs(int refs, bool workingset)
|
||||
{
|
||||
VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));
|
||||
|
||||
/* see the comment in folio_lru_refs() */
|
||||
return order_base_2(refs + 1);
|
||||
/* see the comment on MAX_NR_TIERS */
|
||||
return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs);
|
||||
}
|
||||
|
||||
static inline int folio_lru_refs(struct folio *folio)
|
||||
{
|
||||
unsigned long flags = READ_ONCE(folio->flags);
|
||||
bool workingset = flags & BIT(PG_workingset);
|
||||
|
||||
if (!(flags & BIT(PG_referenced)))
|
||||
return 0;
|
||||
/*
|
||||
* Return the number of accesses beyond PG_referenced, i.e., N-1 if the
|
||||
* total number of accesses is N>1, since N=0,1 both map to the first
|
||||
* tier. lru_tier_from_refs() will account for this off-by-one. Also see
|
||||
* the comment on MAX_NR_TIERS.
|
||||
* Return the total number of accesses including PG_referenced. Also see
|
||||
* the comment on LRU_REFS_FLAGS.
|
||||
*/
|
||||
return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
|
||||
}
|
||||
|
||||
static inline void folio_clear_lru_refs(struct folio *folio)
|
||||
{
|
||||
set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
|
||||
return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1;
|
||||
}
|
||||
|
||||
static inline int folio_lru_gen(struct folio *folio)
|
||||
@@ -223,11 +217,43 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli
|
||||
VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
|
||||
}
|
||||
|
||||
static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio,
|
||||
bool reclaiming)
|
||||
{
|
||||
int gen;
|
||||
int type = folio_is_file_lru(folio);
|
||||
struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
||||
|
||||
/*
|
||||
* +-----------------------------------+-----------------------------------+
|
||||
* | Accessed through page tables and | Accessed through file descriptors |
|
||||
* | promoted by folio_update_gen() | and protected by folio_inc_gen() |
|
||||
* +-----------------------------------+-----------------------------------+
|
||||
* | PG_active (set while isolated) | |
|
||||
* +-----------------+-----------------+-----------------+-----------------+
|
||||
* | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS |
|
||||
* +-----------------------------------+-----------------------------------+
|
||||
* |<---------- MIN_NR_GENS ---------->| |
|
||||
* |<---------------------------- MAX_NR_GENS ---------------------------->|
|
||||
*/
|
||||
if (folio_test_active(folio))
|
||||
gen = MIN_NR_GENS - folio_test_workingset(folio);
|
||||
else if (reclaiming)
|
||||
gen = MAX_NR_GENS;
|
||||
else if ((!folio_is_file_lru(folio) && !folio_test_swapcache(folio)) ||
|
||||
(folio_test_reclaim(folio) &&
|
||||
(folio_test_dirty(folio) || folio_test_writeback(folio))))
|
||||
gen = MIN_NR_GENS;
|
||||
else
|
||||
gen = MAX_NR_GENS - folio_test_workingset(folio);
|
||||
|
||||
return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type]));
|
||||
}
|
||||
|
||||
static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
|
||||
{
|
||||
unsigned long seq;
|
||||
unsigned long flags;
|
||||
unsigned long mask;
|
||||
int gen = folio_lru_gen(folio);
|
||||
int type = folio_is_file_lru(folio);
|
||||
int zone = folio_zonenum(folio);
|
||||
@@ -237,40 +263,12 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
|
||||
|
||||
if (folio_test_unevictable(folio) || !lrugen->enabled)
|
||||
return false;
|
||||
/*
|
||||
* There are four common cases for this page:
|
||||
* 1. If it's hot, i.e., freshly faulted in, add it to the youngest
|
||||
* generation, and it's protected over the rest below.
|
||||
* 2. If it can't be evicted immediately, i.e., a dirty page pending
|
||||
* writeback, add it to the second youngest generation.
|
||||
* 3. If it should be evicted first, e.g., cold and clean from
|
||||
* folio_rotate_reclaimable(), add it to the oldest generation.
|
||||
* 4. Everything else falls between 2 & 3 above and is added to the
|
||||
* second oldest generation if it's considered inactive, or the
|
||||
* oldest generation otherwise. See lru_gen_is_active().
|
||||
*/
|
||||
if (folio_test_active(folio))
|
||||
seq = lrugen->max_seq;
|
||||
else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
|
||||
(folio_test_reclaim(folio) &&
|
||||
(folio_test_dirty(folio) || folio_test_writeback(folio))))
|
||||
seq = lrugen->max_seq - 1;
|
||||
else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq)
|
||||
seq = lrugen->min_seq[type];
|
||||
else
|
||||
seq = lrugen->min_seq[type] + 1;
|
||||
|
||||
seq = lru_gen_folio_seq(lruvec, folio, reclaiming);
|
||||
gen = lru_gen_from_seq(seq);
|
||||
flags = (gen + 1UL) << LRU_GEN_PGOFF;
|
||||
/* see the comment on MIN_NR_GENS about PG_active */
|
||||
mask = LRU_GEN_MASK;
|
||||
/*
|
||||
* Don't clear PG_workingset here because it can affect PSI accounting
|
||||
* if the activation is due to workingset refault.
|
||||
*/
|
||||
if (folio_test_active(folio))
|
||||
mask |= LRU_REFS_MASK | BIT(PG_referenced) | BIT(PG_active);
|
||||
set_mask_bits(&folio->flags, mask, flags);
|
||||
set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);
|
||||
|
||||
lru_gen_update_size(lruvec, folio, -1, gen);
|
||||
/* for folio_rotate_reclaimable() */
|
||||
|
||||
Reference in New Issue
Block a user