Files
linux/drivers/clocksource/acpi_pm.c
Thomas Gleixner 763aacf86f clocksource: Rewrite watchdog code completely
The clocksource watchdog code has over time reached the state of an
impenetrable maze of duct tape and staples. The original design, which was
made in the context of systems far smaller than today, is based on the
assumption that the to be monitored clocksource (TSC) can be trivially
compared against a known to be stable clocksource (HPET/ACPI-PM timer).

Over the years it turned out that this approach has major flaws:

  - Long delays between watchdog invocations can result in wrap arounds
    of the reference clocksource

  - Scalability of the reference clocksource readout can degrade on large
    multi-socket systems due to interconnect congestion

This was addressed with various heuristics which degraded the accuracy of
the watchdog to the point that it fails to detect actual TSC problems on
older hardware which exposes slow inter CPU drifts due to firmware
manipulating the TSC to hide SMI time.

To address this and bring back sanity to the watchdog, rewrite the code
completely with a different approach:

  1) Restrict the validation against a reference clocksource to the boot
     CPU, which is usually the CPU/Socket closest to the legacy block which
     contains the reference source (HPET/ACPI-PM timer). Validate that the
     reference readout is within a bound latency so that the actual
     comparison against the TSC stays within 500ppm as long as the clocks
     are stable.

  2) Compare the TSCs of the other CPUs in a round robin fashion against
     the boot CPU in the same way the TSC synchronization on CPU hotplug
     works. This still can suffer from delayed reaction of the remote CPU
     to the SMP function call and the latency of the control variable cache
     line. But this latency is not affecting correctness. It only affects
     the accuracy. With low contention the readout latency is in the low
     nanoseconds range, which detects even slight skews between CPUs. Under
     high contention this becomes obviously less accurate, but still
     detects slow skews reliably as it solely relies on subsequent readouts
     being monotonically increasing. It just can take slightly longer to
     detect the issue.

  3) Rewrite the watchdog test so it tests the various mechanisms one by
     one and validating the result against the expectation.

Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Tested-by: Borislav Petkov (AMD) <bp@alien8.de>
Tested-by: Daniel J Blueman <daniel@quora.org>
Reviewed-by: Jiri Wiesner <jwiesner@suse.de>
Reviewed-by: Daniel J Blueman <daniel@quora.org>
Link: https://patch.msgid.link/20260123231521.926490888@kernel.org
Link: https://patch.msgid.link/87h5qeomm5.ffs@tglx
2026-03-20 13:36:32 +01:00

276 lines
7.2 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/drivers/clocksource/acpi_pm.c
*
* This file contains the ACPI PM based clocksource.
*
* This code was largely moved from the i386 timer_pm.c file
* which was (C) Dominik Brodowski <linux@brodo.de> 2003
* and contained the following comments:
*
* Driver to use the Power Management Timer (PMTMR) available in some
* southbridges as primary timing source for the Linux kernel.
*
* Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
* timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
*/
#include <linux/acpi_pmtmr.h>
#include <linux/clocksource.h>
#include <linux/timex.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/delay.h>
#include <asm/io.h>
#include <asm/time.h>
static void *suspend_resume_cb_data;
static void (*suspend_resume_callback)(void *data, bool suspend);
/*
* The I/O port the PMTMR resides at.
* The location is detected during setup_arch(),
* in arch/i386/kernel/acpi/boot.c
*/
u32 pmtmr_ioport __read_mostly;
static inline u32 read_pmtmr(void)
{
/* mask the output to 24 bits */
return inl(pmtmr_ioport) & ACPI_PM_MASK;
}
u32 acpi_pm_read_verified(void)
{
u32 v1 = 0, v2 = 0, v3 = 0;
/*
* It has been reported that because of various broken
* chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM clock
* source is not latched, you must read it multiple
* times to ensure a safe value is read:
*/
do {
v1 = read_pmtmr();
v2 = read_pmtmr();
v3 = read_pmtmr();
} while (unlikely((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1)
|| (v3 > v1 && v3 < v2)));
return v2;
}
void acpi_pmtmr_register_suspend_resume_callback(void (*cb)(void *data, bool suspend), void *data)
{
suspend_resume_callback = cb;
suspend_resume_cb_data = data;
}
EXPORT_SYMBOL_GPL(acpi_pmtmr_register_suspend_resume_callback);
void acpi_pmtmr_unregister_suspend_resume_callback(void)
{
suspend_resume_callback = NULL;
suspend_resume_cb_data = NULL;
}
EXPORT_SYMBOL_GPL(acpi_pmtmr_unregister_suspend_resume_callback);
static void acpi_pm_suspend(struct clocksource *cs)
{
if (suspend_resume_callback)
suspend_resume_callback(suspend_resume_cb_data, true);
}
static void acpi_pm_resume(struct clocksource *cs)
{
if (suspend_resume_callback)
suspend_resume_callback(suspend_resume_cb_data, false);
}
static u64 acpi_pm_read(struct clocksource *cs)
{
return (u64)read_pmtmr();
}
static struct clocksource clocksource_acpi_pm = {
.name = "acpi_pm",
.rating = 200,
.read = acpi_pm_read,
.mask = (u64)ACPI_PM_MASK,
.flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_CALIBRATED,
.suspend = acpi_pm_suspend,
.resume = acpi_pm_resume,
};
#ifdef CONFIG_PCI
static int acpi_pm_good;
static int __init acpi_pm_good_setup(char *__str)
{
acpi_pm_good = 1;
return 1;
}
__setup("acpi_pm_good", acpi_pm_good_setup);
static u64 acpi_pm_read_slow(struct clocksource *cs)
{
return (u64)acpi_pm_read_verified();
}
static inline void acpi_pm_need_workaround(void)
{
clocksource_acpi_pm.read = acpi_pm_read_slow;
clocksource_acpi_pm.rating = 120;
}
/*
* PIIX4 Errata:
*
* The power management timer may return improper results when read.
* Although the timer value settles properly after incrementing,
* while incrementing there is a 3 ns window every 69.8 ns where the
* timer value is indeterminate (a 4.2% chance that the data will be
* incorrect when read). As a result, the ACPI free running count up
* timer specification is violated due to erroneous reads.
*/
static void acpi_pm_check_blacklist(struct pci_dev *dev)
{
if (acpi_pm_good)
return;
/* the bug has been fixed in PIIX4M */
if (dev->revision < 3) {
pr_warn("* Found PM-Timer Bug on the chipset. Due to workarounds for a bug,\n"
"* this clock source is slow. Consider trying other clock sources\n");
acpi_pm_need_workaround();
}
}
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB_3,
acpi_pm_check_blacklist);
static void acpi_pm_check_graylist(struct pci_dev *dev)
{
if (acpi_pm_good)
return;
pr_warn("* The chipset may have PM-Timer Bug. Due to workarounds for a bug,\n"
"* this clock source is slow. If you are sure your timer does not have\n"
"* this bug, please use \"acpi_pm_good\" to disable the workaround\n");
acpi_pm_need_workaround();
}
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0,
acpi_pm_check_graylist);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE,
acpi_pm_check_graylist);
#endif
#ifndef CONFIG_X86_64
#include <asm/mach_timer.h>
#define PMTMR_EXPECTED_RATE \
((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (PIT_TICK_RATE>>10))
/*
* Some boards have the PMTMR running way too fast. We check
* the PMTMR rate against PIT channel 2 to catch these cases.
*/
static int verify_pmtmr_rate(void)
{
u64 value1, value2;
unsigned long count, delta;
mach_prepare_counter();
value1 = clocksource_acpi_pm.read(&clocksource_acpi_pm);
mach_countup(&count);
value2 = clocksource_acpi_pm.read(&clocksource_acpi_pm);
delta = (value2 - value1) & ACPI_PM_MASK;
/* Check that the PMTMR delta is within 5% of what we expect */
if (delta < (PMTMR_EXPECTED_RATE * 19) / 20 ||
delta > (PMTMR_EXPECTED_RATE * 21) / 20) {
pr_info("PM-Timer running at invalid rate: %lu%% of normal - aborting.\n",
100UL * delta / PMTMR_EXPECTED_RATE);
return -1;
}
return 0;
}
#else
#define verify_pmtmr_rate() (0)
#endif
/* Number of monotonicity checks to perform during initialization */
#define ACPI_PM_MONOTONICITY_CHECKS 10
/* Number of reads we try to get two different values */
#define ACPI_PM_READ_CHECKS 10000
static int __init init_acpi_pm_clocksource(void)
{
u64 value1, value2;
unsigned int i, j = 0;
if (!pmtmr_ioport)
return -ENODEV;
/* "verify" this timing source: */
for (j = 0; j < ACPI_PM_MONOTONICITY_CHECKS; j++) {
udelay(100 * j);
value1 = clocksource_acpi_pm.read(&clocksource_acpi_pm);
for (i = 0; i < ACPI_PM_READ_CHECKS; i++) {
value2 = clocksource_acpi_pm.read(&clocksource_acpi_pm);
if (value2 == value1)
continue;
if (value2 > value1)
break;
if ((value2 < value1) && ((value2) < 0xFFF))
break;
pr_info("PM-Timer had inconsistent results: %#llx, %#llx - aborting.\n",
value1, value2);
pmtmr_ioport = 0;
return -EINVAL;
}
if (i == ACPI_PM_READ_CHECKS) {
pr_info("PM-Timer failed consistency check (%#llx) - aborting.\n",
value1);
pmtmr_ioport = 0;
return -ENODEV;
}
}
if (verify_pmtmr_rate() != 0){
pmtmr_ioport = 0;
return -ENODEV;
}
return clocksource_register_hz(&clocksource_acpi_pm, PMTMR_TICKS_PER_SEC);
}
/* We use fs_initcall because we want the PCI fixups to have run
* but we still need to load before device_initcall
*/
fs_initcall(init_acpi_pm_clocksource);
/*
* Allow an override of the IOPort. Stupid BIOSes do not tell us about
* the PMTimer, but we might know where it is.
*/
static int __init parse_pmtmr(char *arg)
{
unsigned int base;
int ret;
ret = kstrtouint(arg, 16, &base);
if (ret) {
pr_warn("PMTMR: invalid 'pmtmr=' value: '%s'\n", arg);
return 1;
}
pr_info("PMTMR IOPort override: 0x%04x -> 0x%04x\n", pmtmr_ioport,
base);
pmtmr_ioport = base;
return 1;
}
__setup("pmtmr=", parse_pmtmr);