mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 14:53:58 -04:00
The clocksource watchdog code has over time reached the state of an
impenetrable maze of duct tape and staples. The original design, which was
made in the context of systems far smaller than today, is based on the
assumption that the to be monitored clocksource (TSC) can be trivially
compared against a known to be stable clocksource (HPET/ACPI-PM timer).
Over the years it turned out that this approach has major flaws:
- Long delays between watchdog invocations can result in wrap arounds
of the reference clocksource
- Scalability of the reference clocksource readout can degrade on large
multi-socket systems due to interconnect congestion
This was addressed with various heuristics which degraded the accuracy of
the watchdog to the point that it fails to detect actual TSC problems on
older hardware which exposes slow inter CPU drifts due to firmware
manipulating the TSC to hide SMI time.
To address this and bring back sanity to the watchdog, rewrite the code
completely with a different approach:
1) Restrict the validation against a reference clocksource to the boot
CPU, which is usually the CPU/Socket closest to the legacy block which
contains the reference source (HPET/ACPI-PM timer). Validate that the
reference readout is within a bound latency so that the actual
comparison against the TSC stays within 500ppm as long as the clocks
are stable.
2) Compare the TSCs of the other CPUs in a round robin fashion against
the boot CPU in the same way the TSC synchronization on CPU hotplug
works. This still can suffer from delayed reaction of the remote CPU
to the SMP function call and the latency of the control variable cache
line. But this latency is not affecting correctness. It only affects
the accuracy. With low contention the readout latency is in the low
nanoseconds range, which detects even slight skews between CPUs. Under
high contention this becomes obviously less accurate, but still
detects slow skews reliably as it solely relies on subsequent readouts
being monotonically increasing. It just can take slightly longer to
detect the issue.
3) Rewrite the watchdog test so it tests the various mechanisms one by
one and validates the result against the expectation.
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Tested-by: Borislav Petkov (AMD) <bp@alien8.de>
Tested-by: Daniel J Blueman <daniel@quora.org>
Reviewed-by: Jiri Wiesner <jwiesner@suse.de>
Reviewed-by: Daniel J Blueman <daniel@quora.org>
Link: https://patch.msgid.link/20260123231521.926490888@kernel.org
Link: https://patch.msgid.link/87h5qeomm5.ffs@tglx
229 lines
6.3 KiB
Plaintext
229 lines
6.3 KiB
Plaintext
# SPDX-License-Identifier: GPL-2.0-only
|
|
#
|
|
# Timer subsystem related configuration options
|
|
#
|
|
|
|
# Options selectable by arch Kconfig
|
|
|
|
# Watchdog function for clocksources to detect instabilities
|
|
config CLOCKSOURCE_WATCHDOG
|
|
bool
|
|
|
|
# Architecture has extra clocksource data
|
|
config ARCH_CLOCKSOURCE_DATA
|
|
bool
|
|
|
|
# Architecture has extra clocksource init called from registration
|
|
config ARCH_CLOCKSOURCE_INIT
|
|
bool
|
|
|
|
# Promptless switch: it has no prompt and no default, so it is only
# enabled when another Kconfig entry selects it.
# NOTE(review): presumably lets an architecture request an inlined
# clocksource read - confirm against the selecting arch Kconfig.
config ARCH_WANTS_CLOCKSOURCE_READ_INLINE
|
|
bool
|
|
|
|
# Timekeeping vsyscall support
|
|
config GENERIC_TIME_VSYSCALL
|
|
bool
|
|
|
|
# The generic clock events infrastructure
|
|
config GENERIC_CLOCKEVENTS
|
|
def_bool !LEGACY_TIMER_TICK
|
|
|
|
# Architecture can handle broadcast in a driver-agnostic way
|
|
config ARCH_HAS_TICK_BROADCAST
|
|
bool
|
|
|
|
# Clockevents broadcasting infrastructure
|
|
config GENERIC_CLOCKEVENTS_BROADCAST
|
|
bool
|
|
depends on GENERIC_CLOCKEVENTS
|
|
|
|
# Handle broadcast in default_idle_call()
|
|
config GENERIC_CLOCKEVENTS_BROADCAST_IDLE
|
|
bool
|
|
depends on GENERIC_CLOCKEVENTS_BROADCAST
|
|
|
|
# Automatically adjust the min. reprogramming time for
|
|
# clock event device
|
|
config GENERIC_CLOCKEVENTS_MIN_ADJUST
|
|
bool
|
|
|
|
# Promptless switch without a default; it is enabled via "select", e.g.
# by GENERIC_CLOCKEVENTS_COUPLED_INLINE.
config GENERIC_CLOCKEVENTS_COUPLED
|
|
bool
|
|
|
|
# Promptless switch; enabling it also forces GENERIC_CLOCKEVENTS_COUPLED on
# through the "select" below.
config GENERIC_CLOCKEVENTS_COUPLED_INLINE
|
|
select GENERIC_CLOCKEVENTS_COUPLED
|
|
bool
|
|
|
|
# Generic update of CMOS clock
|
|
config GENERIC_CMOS_UPDATE
|
|
bool
|
|
|
|
# Deferred rearming of the hrtimer interrupt
|
|
config HRTIMER_REARM_DEFERRED
|
|
def_bool y
|
|
depends on GENERIC_ENTRY && HAVE_GENERIC_TIF_BITS
|
|
depends on HIGH_RES_TIMERS && SCHED_HRTICK
|
|
|
|
# Select to handle posix CPU timers from task_work
|
|
# and not from the timer interrupt context
|
|
config HAVE_POSIX_CPU_TIMERS_TASK_WORK
|
|
bool
|
|
|
|
# Promptless switch that defaults to y only when both POSIX_TIMERS and
# HAVE_POSIX_CPU_TIMERS_TASK_WORK are enabled.
config POSIX_CPU_TIMERS_TASK_WORK
|
|
bool
|
|
default y if POSIX_TIMERS && HAVE_POSIX_CPU_TIMERS_TASK_WORK
|
|
|
|
config LEGACY_TIMER_TICK
|
|
bool
|
|
help
|
|
The legacy timer tick helper is used by platforms that
|
|
lack support for the generic clockevent framework.
|
|
New platforms should use generic clockevents instead.
|
|
|
|
config TIME_KUNIT_TEST
|
|
tristate "KUnit test for kernel/time functions" if !KUNIT_ALL_TESTS
|
|
depends on KUNIT
|
|
default KUNIT_ALL_TESTS
|
|
help
|
|
Enable this option to test kernel/time functions.
|
|
|
|
If unsure, say N.
|
|
|
|
# Promptless base switch for context tracking; selected by
# CONTEXT_TRACKING_IDLE and CONTEXT_TRACKING_USER.
config CONTEXT_TRACKING
|
|
bool
|
|
|
|
config CONTEXT_TRACKING_IDLE
|
|
bool
|
|
select CONTEXT_TRACKING
|
|
help
|
|
Tracks idle state on behalf of RCU.
|
|
|
|
menu "Timers subsystem"
|
|
|
|
if GENERIC_CLOCKEVENTS
|
|
# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is
|
|
# only related to the tick functionality. Oneshot clockevent devices
|
|
# are supported independent of this.
|
|
config TICK_ONESHOT
|
|
bool
|
|
|
|
# Promptless switch shared by the dynticks choices: both NO_HZ_IDLE and
# NO_HZ_FULL select it, and it in turn selects TICK_ONESHOT.
config NO_HZ_COMMON
|
|
bool
|
|
select TICK_ONESHOT
|
|
|
|
choice
|
|
prompt "Timer tick handling"
|
|
default NO_HZ_IDLE if NO_HZ
|
|
|
|
config HZ_PERIODIC
|
|
bool "Periodic timer ticks (constant rate, no dynticks)"
|
|
help
|
|
This option keeps the tick running periodically at a constant
|
|
rate, even when the CPU doesn't need it.
|
|
|
|
config NO_HZ_IDLE
|
|
bool "Idle dynticks system (tickless idle)"
|
|
select NO_HZ_COMMON
|
|
help
|
|
This option enables a tickless idle system: timer interrupts
|
|
will only trigger on an as-needed basis when the system is idle.
|
|
This is usually interesting for energy saving.
|
|
|
|
Most of the time you want to say Y here.
|
|
|
|
config NO_HZ_FULL
|
|
bool "Full dynticks system (tickless)"
|
|
# NO_HZ_COMMON dependency
|
|
# We need at least one periodic CPU for timekeeping
|
|
depends on SMP
|
|
depends on HAVE_CONTEXT_TRACKING_USER
|
|
# VIRT_CPU_ACCOUNTING_GEN dependency
|
|
depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
|
|
select NO_HZ_COMMON
|
|
select RCU_NOCB_CPU
|
|
select VIRT_CPU_ACCOUNTING_GEN
|
|
select IRQ_WORK
|
|
select CPU_ISOLATION
|
|
help
|
|
Adaptively try to shut down the tick whenever possible, even when
|
|
the CPU is running tasks. Typically this requires running a single
|
|
task on the CPU. Chances for running tickless are maximized when
|
|
the task mostly runs in userspace and has little kernel activity.
|
|
|
|
You need to fill up the nohz_full boot parameter with the
|
|
desired range of dynticks CPUs to use it. This is implemented at
|
|
the expense of some overhead in user <-> kernel transitions:
|
|
syscalls, exceptions and interrupts.
|
|
|
|
By default, without passing the nohz_full parameter, this behaves just
|
|
like NO_HZ_IDLE.
|
|
|
|
If you're a distro say Y.
|
|
|
|
endchoice
|
|
|
|
config CONTEXT_TRACKING_USER
|
|
bool
|
|
depends on HAVE_CONTEXT_TRACKING_USER
|
|
select CONTEXT_TRACKING
|
|
help
|
|
Track transitions between kernel and user on behalf of RCU and
|
|
tickless cputime accounting. The former case relies on context
|
|
tracking to enter/exit RCU extended quiescent states.
|
|
|
|
config CONTEXT_TRACKING_USER_FORCE
|
|
bool "Force user context tracking"
|
|
depends on CONTEXT_TRACKING_USER
|
|
default y if !NO_HZ_FULL
|
|
help
|
|
The major prerequisite for full dynticks to work is to
|
|
support the user context tracking subsystem. But there are also
|
|
other dependencies to provide in order to make the full
|
|
dynticks work.
|
|
|
|
This option stands for testing when an arch implements the
|
|
user context tracking backend but doesn't yet fulfill all the
|
|
requirements to make the full dynticks feature work.
|
|
Without the full dynticks, there is no way to test the support
|
|
for user context tracking and the subsystems that rely on it: RCU
|
|
userspace extended quiescent state and tickless cputime
|
|
accounting. This option copes with the absence of the full
|
|
dynticks subsystem by forcing the user context tracking on all
|
|
CPUs in the system.
|
|
|
|
Say Y only if you're working on the development of an
|
|
architecture backend for the user context tracking.
|
|
|
|
Say N otherwise, this option brings an overhead that you
|
|
don't want in production.
|
|
|
|
config NO_HZ
|
|
bool "Old Idle dynticks config"
|
|
help
|
|
This is the old config entry that enables dynticks idle.
|
|
We keep it around for a little while to enforce backward
|
|
compatibility with older config files.
|
|
|
|
config HIGH_RES_TIMERS
|
|
bool "High Resolution Timer Support"
|
|
select TICK_ONESHOT
|
|
help
|
|
This option enables high resolution timer support. If your
|
|
hardware is not capable then this option only increases
|
|
the size of the kernel image.
|
|
|
|
endif
|
|
|
|
config POSIX_AUX_CLOCKS
|
|
bool "Enable auxiliary POSIX clocks"
|
|
depends on POSIX_TIMERS
|
|
help
|
|
Auxiliary POSIX clocks are clocks which can be steered
|
|
independently of the core timekeeper, which controls the
|
|
MONOTONIC, REALTIME, BOOTTIME and TAI clocks. They are useful to
|
|
provide e.g. lockless time accessors to independent PTP clocks
|
|
and other clock domains, which are not correlated to the TAI/NTP
|
|
notion of time.
|
|
|
|
endmenu
|