Merge tag 'wq-for-6.20' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq

Pull workqueue updates from Tejun Heo:

 - Rework the rescuer to process work items one-by-one instead of
   slurping all pending work items in a single pass.

   As there is only one rescuer per workqueue, a single long-blocking
   work item could cause high latency for all tasks queued behind it,
   even after memory pressure is relieved and regular kworkers become
   available to service them.

 - Add CONFIG_BOOTPARAM_WQ_STALL_PANIC build-time option and
   workqueue.panic_on_stall_time parameter for time-based stall panic,
   giving systems more control over workqueue stall handling.

 - Replace BUG_ON() with panic() in the stall panic path for clearer
   intent and more informative output.

* tag 'wq-for-6.20' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq:
  workqueue: replace BUG_ON with panic in panic_on_wq_watchdog
  workqueue: add time-based panic for stalls
  workqueue: add CONFIG_BOOTPARAM_WQ_STALL_PANIC option
  workqueue: Process extra works in rescuer on memory pressure
  workqueue: Process rescuer work items one-by-one using a cursor
  workqueue: Make send_mayday() take a PWQ argument directly
This commit is contained in:
Linus Torvalds
2026-02-11 13:13:32 -08:00
3 changed files with 149 additions and 36 deletions

View File

@@ -117,6 +117,8 @@ enum wq_internal_consts {
MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
CREATE_COOLDOWN = HZ, /* time to breath after fail */
RESCUER_BATCH = 16, /* process items per turn */
/*
* Rescue workers are used only on emergencies and shared by
* all cpus. Give MIN_NICE.
@@ -286,6 +288,7 @@ struct pool_workqueue {
struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */
struct list_head pwqs_node; /* WR: node on wq->pwqs */
struct list_head mayday_node; /* MD: node on wq->maydays */
struct work_struct mayday_cursor; /* L: cursor on pool->worklist */
u64 stats[PWQ_NR_STATS];
@@ -1120,6 +1123,12 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool,
return NULL;
}
/*
 * Work function installed on the per-PWQ mayday cursor work item.
 *
 * The cursor is a dummy work item that only marks a position in a worklist;
 * it is never meant to be executed. Reaching this function therefore means
 * the cursor was dispatched like a regular work item, which is a bug.
 */
static void mayday_cursor_func(struct work_struct *work)
{
/* should not be processed, only for marking position */
BUG();
}
/**
* move_linked_works - move linked works to a list
* @work: start of series of works to be scheduled
@@ -1182,6 +1191,16 @@ static bool assign_work(struct work_struct *work, struct worker *worker,
lockdep_assert_held(&pool->lock);
/* The cursor work should not be processed */
if (unlikely(work->func == mayday_cursor_func)) {
/* only worker_thread() can possibly take this branch */
WARN_ON_ONCE(worker->rescue_wq);
if (nextp)
*nextp = list_next_entry(work, entry);
list_del_init(&work->entry);
return false;
}
/*
* A single work shouldn't be executed concurrently by multiple workers.
* __queue_work() ensures that @work doesn't jump to a different pool
@@ -2976,9 +2995,8 @@ static void idle_cull_fn(struct work_struct *work)
reap_dying_workers(&cull_list);
}
static void send_mayday(struct work_struct *work)
static void send_mayday(struct pool_workqueue *pwq)
{
struct pool_workqueue *pwq = get_work_pwq(work);
struct workqueue_struct *wq = pwq->wq;
lockdep_assert_held(&wq_mayday_lock);
@@ -3016,7 +3034,7 @@ static void pool_mayday_timeout(struct timer_list *t)
* rescuers.
*/
list_for_each_entry(work, &pool->worklist, entry)
send_mayday(work);
send_mayday(get_work_pwq(work));
}
raw_spin_unlock(&wq_mayday_lock);
@@ -3440,22 +3458,57 @@ sleep:
static bool assign_rescuer_work(struct pool_workqueue *pwq, struct worker *rescuer)
{
struct worker_pool *pool = pwq->pool;
struct work_struct *cursor = &pwq->mayday_cursor;
struct work_struct *work, *n;
/* need rescue? */
if (!pwq->nr_active || !need_to_create_worker(pool))
/* have work items to rescue? */
if (!pwq->nr_active)
return false;
/*
* Slurp in all works issued via this workqueue and
* process'em.
*/
list_for_each_entry_safe(work, n, &pool->worklist, entry) {
if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n))
pwq->stats[PWQ_STAT_RESCUED]++;
/* need rescue? */
if (!need_to_create_worker(pool)) {
/*
* The pool has idle workers and doesn't need the rescuer, so it
* could simply return false here.
*
* However, the memory pressure might not be fully relieved.
* In PERCPU pool with concurrency enabled, having idle workers
* does not necessarily mean memory pressure is gone; it may
* simply mean regular workers have woken up, completed their
* work, and gone idle again due to concurrency limits.
*
* In this case, those working workers may later sleep again,
* the pool may run out of idle workers, and it will have to
* allocate new ones and wait for the timer to send mayday,
* causing unnecessary delay - especially if memory pressure
* was never resolved throughout.
*
* Keep rescuing if memory pressure still appears to be on, to reduce
* the chance of relapse. (pool->flags & POOL_MANAGER_ACTIVE) is used
* as an imprecise indicator of ongoing pressure. Do not keep going,
* however, if there are other PWQs needing help.
*/
if (!(pool->flags & POOL_MANAGER_ACTIVE) ||
!list_empty(&pwq->wq->maydays))
return false;
}
return !list_empty(&rescuer->scheduled);
/* search from the start or cursor if available */
if (list_empty(&cursor->entry))
work = list_first_entry(&pool->worklist, struct work_struct, entry);
else
work = list_next_entry(cursor, entry);
/* find the next work item to rescue */
list_for_each_entry_safe_from(work, n, &pool->worklist, entry) {
if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) {
pwq->stats[PWQ_STAT_RESCUED]++;
/* put the cursor for next search */
list_move_tail(&cursor->entry, &n->entry);
return true;
}
}
return false;
}
/**
@@ -3512,6 +3565,7 @@ repeat:
struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
struct pool_workqueue, mayday_node);
struct worker_pool *pool = pwq->pool;
unsigned int count = 0;
__set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node);
@@ -3524,31 +3578,27 @@ repeat:
WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
if (assign_rescuer_work(pwq, rescuer)) {
while (assign_rescuer_work(pwq, rescuer)) {
process_scheduled_works(rescuer);
/*
* The above execution of rescued work items could
* have created more to rescue through
* pwq_activate_first_inactive() or chained
* queueing. Let's put @pwq back on mayday list so
* that such back-to-back work items, which may be
* being used to relieve memory pressure, don't
* incur MAYDAY_INTERVAL delay inbetween.
* If the per-turn work item limit is reached and other
* PWQs are in mayday, requeue mayday for this PWQ and
* let the rescuer handle the other PWQs first.
*/
if (pwq->nr_active && need_to_create_worker(pool)) {
if (++count > RESCUER_BATCH && !list_empty(&pwq->wq->maydays) &&
pwq->nr_active && need_to_create_worker(pool)) {
raw_spin_lock(&wq_mayday_lock);
/*
* Queue iff somebody else hasn't queued it already.
*/
if (list_empty(&pwq->mayday_node)) {
get_pwq(pwq);
list_add_tail(&pwq->mayday_node, &wq->maydays);
}
send_mayday(pwq);
raw_spin_unlock(&wq_mayday_lock);
break;
}
}
/* The cursor cannot be left behind without the rescuer watching it. */
if (!list_empty(&pwq->mayday_cursor.entry) && list_empty(&pwq->mayday_node))
list_del_init(&pwq->mayday_cursor.entry);
/*
* Leave this pool. Notify regular workers; otherwise, we end up
* with 0 concurrency and stalling the execution.
@@ -5167,6 +5217,19 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
INIT_LIST_HEAD(&pwq->pwqs_node);
INIT_LIST_HEAD(&pwq->mayday_node);
kthread_init_work(&pwq->release_work, pwq_release_workfn);
/*
* Set the dummy cursor work with valid function and get_work_pwq().
*
* The cursor work should only be in the pwq->pool->worklist, and
* should not be treated as a processable work item.
*
* WORK_STRUCT_PENDING and WORK_STRUCT_INACTIVE are set only to make the
* cursor less surprising to kernel debugging tools and reviewers.
*/
INIT_WORK(&pwq->mayday_cursor, mayday_cursor_func);
atomic_long_set(&pwq->mayday_cursor.data, (unsigned long)pwq |
WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | WORK_STRUCT_INACTIVE);
}
/* sync @pwq with the current state of its associated wq and link it */
@@ -7508,9 +7571,13 @@ static struct timer_list wq_watchdog_timer;
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
static unsigned int wq_panic_on_stall;
static unsigned int wq_panic_on_stall = CONFIG_BOOTPARAM_WQ_STALL_PANIC;
module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644);
static unsigned int wq_panic_on_stall_time;
module_param_named(panic_on_stall_time, wq_panic_on_stall_time, uint, 0644);
MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds (0=disabled)");
/*
* Show workers that might prevent the processing of pending work items.
* The only candidates are CPU-bound workers in the running state.
@@ -7562,14 +7629,25 @@ static void show_cpu_pools_hogs(void)
rcu_read_unlock();
}
static void panic_on_wq_watchdog(void)
/*
* Trigger a panic in two scenarios: when the total number of stalls
* reaches wq_panic_on_stall, or when a stall has lasted at least
* wq_panic_on_stall_time seconds.
*/
static void panic_on_wq_watchdog(unsigned int stall_time_sec)
{
static unsigned int wq_stall;
if (wq_panic_on_stall) {
wq_stall++;
BUG_ON(wq_stall >= wq_panic_on_stall);
if (wq_stall >= wq_panic_on_stall)
panic("workqueue: %u stall(s) exceeded threshold %u\n",
wq_stall, wq_panic_on_stall);
}
if (wq_panic_on_stall_time && stall_time_sec >= wq_panic_on_stall_time)
panic("workqueue: stall lasted %us, exceeding threshold %us\n",
stall_time_sec, wq_panic_on_stall_time);
}
static void wq_watchdog_reset_touched(void)
@@ -7584,10 +7662,12 @@ static void wq_watchdog_reset_touched(void)
static void wq_watchdog_timer_fn(struct timer_list *unused)
{
unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
unsigned int max_stall_time = 0;
bool lockup_detected = false;
bool cpu_pool_stall = false;
unsigned long now = jiffies;
struct worker_pool *pool;
unsigned int stall_time;
int pi;
if (!thresh)
@@ -7621,14 +7701,15 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
/* did we stall? */
if (time_after(now, ts + thresh)) {
lockup_detected = true;
stall_time = jiffies_to_msecs(now - pool_ts) / 1000;
max_stall_time = max(max_stall_time, stall_time);
if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) {
pool->cpu_stall = true;
cpu_pool_stall = true;
}
pr_emerg("BUG: workqueue lockup - pool");
pr_cont_pool_info(pool);
pr_cont(" stuck for %us!\n",
jiffies_to_msecs(now - pool_ts) / 1000);
pr_cont(" stuck for %us!\n", stall_time);
}
@@ -7641,7 +7722,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
show_cpu_pools_hogs();
if (lockup_detected)
panic_on_wq_watchdog();
panic_on_wq_watchdog(max_stall_time);
wq_watchdog_reset_touched();
mod_timer(&wq_watchdog_timer, jiffies + thresh);