mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 06:44:00 -04:00
Currently, the buddy system only performs checks every 3rd sample. With a 4-second interval, if a check window is missed, the next check occurs 12 seconds later, potentially delaying hard lockup detection for up to 24 seconds. Modify the buddy system to perform checks at every interval (4s). Introduce a missed-interrupt threshold to maintain the existing grace period while reducing the detection window to 8-12 seconds. Best and worst case detection scenarios: Before (12s check window): - Best case: Lockup occurs after first check but just before heartbeat interval. Detected in ~8s (8s till next check). - Worst case: Lockup occurs just after a check. Detected in ~24s (missed check + 12s till next check + 12s logic). After (4s check window with threshold of 3): - Best case: Lockup occurs just before a check. Detected in ~8s (0s till 1st check + 4s till 2nd + 4s till 3rd). - Worst case: Lockup occurs just after a check. Detected in ~12s (4s till 1st check + 4s till 2nd + 4s till 3rd). Link: https://lkml.kernel.org/r/20260312-hardlockup-watchdog-fixes-v2-4-45bd8a0cc7ed@google.com Signed-off-by: Mayank Rungta <mrungta@google.com> Reviewed-by: Douglas Anderson <dianders@chromium.org> Reviewed-by: Petr Mladek <pmladek@suse.com> Cc: Ian Rogers <irogers@google.com> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Li Huafei <lihuafei1@huawei.com> Cc: Max Kellermann <max.kellermann@ionos.com> Cc: Shuah Khan <skhan@linuxfoundation.org> Cc: Stephane Eranian <eranian@google.com> Cc: Wang Jinchao <wangjinchao600@gmail.com> Cc: Yunhui Cui <cuiyunhui@bytedance.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
104 lines
2.8 KiB
C
104 lines
2.8 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
#include <linux/cpu.h>
|
|
#include <linux/cpumask.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/nmi.h>
|
|
#include <linux/percpu-defs.h>
|
|
|
|
/*
 * CPUs currently taking part in buddy checking. A CPU is added in
 * watchdog_hardlockup_enable(), removed in watchdog_hardlockup_disable(),
 * and watchdog_next_cpu() walks this mask to find a CPU's buddy.
 */
static cpumask_t __read_mostly watchdog_cpus;
|
|
|
|
static unsigned int watchdog_next_cpu(unsigned int cpu)
|
|
{
|
|
unsigned int next_cpu;
|
|
|
|
next_cpu = cpumask_next_wrap(cpu, &watchdog_cpus);
|
|
if (next_cpu == cpu)
|
|
return nr_cpu_ids;
|
|
|
|
return next_cpu;
|
|
}
|
|
|
|
/*
 * Probe hook for the buddy hardlockup detector.
 *
 * Sets watchdog_hardlockup_miss_thresh (defined outside this file) to 3
 * and always returns 0.
 */
int __init watchdog_hardlockup_probe(void)
{
	/*
	 * NOTE(review): presumably the number of consecutive buddy checks
	 * without progress that must be seen before a lockup is reported;
	 * it lines up with the "3 sampling periods" grace period mentioned
	 * in watchdog_hardlockup_enable(). Confirm against the definition.
	 */
	watchdog_hardlockup_miss_thresh = 3;

	return 0;
}
|
|
|
|
void watchdog_hardlockup_enable(unsigned int cpu)
|
|
{
|
|
unsigned int next_cpu;
|
|
|
|
/*
|
|
* The new CPU will be marked online before the hrtimer interrupt
|
|
* gets a chance to run on it. If another CPU tests for a
|
|
* hardlockup on the new CPU before it has run its the hrtimer
|
|
* interrupt, it will get a false positive. Touch the watchdog on
|
|
* the new CPU to delay the check for at least 3 sampling periods
|
|
* to guarantee one hrtimer has run on the new CPU.
|
|
*/
|
|
watchdog_hardlockup_touch_cpu(cpu);
|
|
|
|
/*
|
|
* We are going to check the next CPU. Our watchdog_hrtimer
|
|
* need not be zero if the CPU has already been online earlier.
|
|
* Touch the watchdog on the next CPU to avoid false positive
|
|
* if we try to check it in less then 3 interrupts.
|
|
*/
|
|
next_cpu = watchdog_next_cpu(cpu);
|
|
if (next_cpu < nr_cpu_ids)
|
|
watchdog_hardlockup_touch_cpu(next_cpu);
|
|
|
|
/*
|
|
* Makes sure that watchdog is touched on this CPU before
|
|
* other CPUs could see it in watchdog_cpus. The counter
|
|
* part is in watchdog_buddy_check_hardlockup().
|
|
*/
|
|
smp_wmb();
|
|
|
|
cpumask_set_cpu(cpu, &watchdog_cpus);
|
|
}
|
|
|
|
void watchdog_hardlockup_disable(unsigned int cpu)
|
|
{
|
|
unsigned int next_cpu = watchdog_next_cpu(cpu);
|
|
|
|
/*
|
|
* Offlining this CPU will cause the CPU before this one to start
|
|
* checking the one after this one. If this CPU just finished checking
|
|
* the next CPU and updating hrtimer_interrupts_saved, and then the
|
|
* previous CPU checks it within one sample period, it will trigger a
|
|
* false positive. Touch the watchdog on the next CPU to prevent it.
|
|
*/
|
|
if (next_cpu < nr_cpu_ids)
|
|
watchdog_hardlockup_touch_cpu(next_cpu);
|
|
|
|
/*
|
|
* Makes sure that watchdog is touched on the next CPU before
|
|
* this CPU disappear in watchdog_cpus. The counter part is in
|
|
* watchdog_buddy_check_hardlockup().
|
|
*/
|
|
smp_wmb();
|
|
|
|
cpumask_clear_cpu(cpu, &watchdog_cpus);
|
|
}
|
|
|
|
void watchdog_buddy_check_hardlockup(int hrtimer_interrupts)
|
|
{
|
|
unsigned int next_cpu;
|
|
|
|
/* check for a hardlockup on the next CPU */
|
|
next_cpu = watchdog_next_cpu(smp_processor_id());
|
|
if (next_cpu >= nr_cpu_ids)
|
|
return;
|
|
|
|
/*
|
|
* Make sure that the watchdog was touched on next CPU when
|
|
* watchdog_next_cpu() returned another one because of
|
|
* a change in watchdog_hardlockup_enable()/disable().
|
|
*/
|
|
smp_rmb();
|
|
|
|
watchdog_hardlockup_check(next_cpu, NULL);
|
|
}
|