Merge branches 'pm-cpuidle', 'pm-opp' and 'pm-sleep'

Merge cpuidle updates, OPP (operating performance points) library
updates, and updates related to system suspend and hibernation for
7.1-rc1:

 - Refine stopped tick handling in the menu cpuidle governor and
   rearrange stopped tick handling in the teo cpuidle governor (Rafael
   Wysocki)

 - Add Panther Lake C-states table to the intel_idle driver (Artem
   Bityutskiy)

 - Clean up dead dependencies on CPU_IDLE in Kconfig (Julian Braha)

 - Simplify cpuidle_register_device() with guard() (Huisong Li)

 - Use performance level if available to distinguish between rates in
   OPP debugfs (Manivannan Sadhasivam)

 - Fix scoped_guard in dev_pm_opp_xlate_required_opp() (Viresh Kumar)

 - Return -ENODATA if the snapshot image is not loaded (Alberto Garcia)

 - Remove inclusion of crypto/hash.h from hibernate_64.c on x86 (Eric
   Biggers)

* pm-cpuidle:
  cpuidle: Simplify cpuidle_register_device() with guard()
  cpuidle: clean up dead dependencies on CPU_IDLE in Kconfig
  intel_idle: Add Panther Lake C-states table
  cpuidle: governors: teo: Rearrange stopped tick handling
  cpuidle: governors: menu: Refine stopped tick handling

* pm-opp:
  OPP: Move break out of scoped_guard in dev_pm_opp_xlate_required_opp()
  OPP: debugfs: Use performance level if available to distinguish between rates

* pm-sleep:
  PM: hibernate: return -ENODATA if the snapshot image is not loaded
  PM: hibernate: x86: Remove inclusion of crypto/hash.h
This commit is contained in:
Rafael J. Wysocki
2026-04-10 12:37:27 +02:00
12 changed files with 114 additions and 78 deletions

View File

@@ -14,8 +14,6 @@
#include <linux/kdebug.h>
#include <linux/pgtable.h>
#include <crypto/hash.h>
#include <asm/e820/api.h>
#include <asm/init.h>
#include <asm/proto.h>

View File

@@ -81,7 +81,7 @@ config HALTPOLL_CPUIDLE
before halting in the guest (more efficient than polling in the
host via halt_poll_ns for some scenarios).
endif
endif # CPU_IDLE
config ARCH_NEEDS_CPU_IDLE_COUPLED
def_bool n

View File

@@ -4,7 +4,7 @@
#
config MIPS_CPS_CPUIDLE
bool "CPU Idle driver for MIPS CPS platforms"
depends on CPU_IDLE && MIPS_CPS
depends on MIPS_CPS
depends on SYS_SUPPORTS_MIPS_CPS
select ARCH_NEEDS_CPU_IDLE_COUPLED if MIPS_MT || CPU_MIPSR6
select GENERIC_CLOCKEVENTS_BROADCAST if SMP

View File

@@ -4,7 +4,6 @@
#
config PSERIES_CPUIDLE
bool "Cpuidle driver for pSeries platforms"
depends on CPU_IDLE
depends on PPC_PSERIES
default y
help
@@ -13,7 +12,6 @@ config PSERIES_CPUIDLE
config POWERNV_CPUIDLE
bool "Cpuidle driver for powernv platforms"
depends on CPU_IDLE
depends on PPC_POWERNV
default y
help

View File

@@ -679,16 +679,16 @@ int cpuidle_register_device(struct cpuidle_device *dev)
if (!dev)
return -EINVAL;
mutex_lock(&cpuidle_lock);
guard(mutex)(&cpuidle_lock);
if (dev->registered)
goto out_unlock;
return ret;
__cpuidle_device_init(dev);
ret = __cpuidle_register_device(dev);
if (ret)
goto out_unlock;
return ret;
ret = cpuidle_add_sysfs(dev);
if (ret)
@@ -700,16 +700,14 @@ int cpuidle_register_device(struct cpuidle_device *dev)
cpuidle_install_idle_handler();
out_unlock:
mutex_unlock(&cpuidle_lock);
return ret;
out_sysfs:
cpuidle_remove_sysfs(dev);
out_unregister:
__cpuidle_unregister_device(dev);
goto out_unlock;
return ret;
}
EXPORT_SYMBOL_GPL(cpuidle_register_device);

View File

@@ -10,5 +10,10 @@
* check the time till the closest expected timer event.
*/
#define RESIDENCY_THRESHOLD_NS (15 * NSEC_PER_USEC)
/*
* If the closest timer is in this range, the governor idle state selection need
* not be adjusted after the scheduler tick has been stopped.
*/
#define SAFE_TIMER_RANGE_NS (2 * TICK_NSEC)
#endif /* __CPUIDLE_GOVERNOR_H */

View File

@@ -261,13 +261,16 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns);
/*
* If the tick is already stopped, the cost of possible short
* idle duration misprediction is much higher, because the CPU
* may be stuck in a shallow idle state for a long time as a
* result of it. In that case, say we might mispredict and use
* the known time till the closest timer event for the idle
* state selection.
* idle duration misprediction is higher because the CPU may get
* stuck in a shallow idle state then. To avoid that, if
* predicted_ns is small enough, say it might be mispredicted
* and use the known time till the closest timer for idle state
* selection unless that timer is going to trigger within
* SAFE_TIMER_RANGE_NS in which case it can be regarded as a
* sufficient safety net.
*/
if (tick_nohz_tick_stopped() && predicted_ns < TICK_NSEC)
if (tick_nohz_tick_stopped() && predicted_ns < TICK_NSEC &&
data->next_timer_ns > SAFE_TIMER_RANGE_NS)
predicted_ns = data->next_timer_ns;
} else {
/*

View File

@@ -407,50 +407,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
* better choice.
*/
if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) {
int min_idx = idx0;
if (tick_nohz_tick_stopped()) {
/*
* Look for the shallowest idle state below the current
* candidate one whose target residency is at least
* equal to the tick period length.
*/
while (min_idx < idx &&
drv->states[min_idx].target_residency_ns < TICK_NSEC)
min_idx++;
/*
* Avoid selecting a state with a lower index, but with
* the same target residency as the current candidate
* one.
*/
if (drv->states[min_idx].target_residency_ns ==
drv->states[idx].target_residency_ns)
goto constraint;
}
/*
* If the minimum state index is greater than or equal to the
* index of the state with the maximum intercepts metric and
* the corresponding state is enabled, there is no need to look
* at the deeper states.
*/
if (min_idx >= intercept_max_idx &&
!dev->states_usage[min_idx].disable) {
idx = min_idx;
goto constraint;
}
/*
* Look for the deepest enabled idle state, at most as deep as
* the one with the maximum intercepts metric, whose target
* residency had not been greater than the idle duration in over
* a half of the relevant cases in the past.
*
* Take the possible duration limitation present if the tick
* has been stopped already into account.
*/
for (i = idx - 1, intercept_sum = 0; i >= min_idx; i--) {
for (i = idx - 1, intercept_sum = 0; i >= idx0; i--) {
intercept_sum += cpu_data->state_bins[i].intercepts;
if (dev->states_usage[i].disable)
@@ -463,7 +426,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
}
}
constraint:
/*
* If there is a latency constraint, it may be necessary to select an
* idle state shallower than the current candidate one.
@@ -472,13 +434,13 @@ constraint:
idx = constraint_idx;
/*
* If either the candidate state is state 0 or its target residency is
* low enough, there is basically nothing more to do, but if the sleep
* length is not updated, the subsequent wakeup will be counted as an
* "intercept" which may be problematic in the cases when timer wakeups
* are dominant. Namely, it may effectively prevent deeper idle states
* from being selected at one point even if no imminent timers are
* scheduled.
* If the tick has not been stopped and either the candidate state is
* state 0 or its target residency is low enough, there is basically
* nothing more to do, but if the sleep length is not updated, the
* subsequent wakeup will be counted as an "intercept". That may be
* problematic in the cases when timer wakeups are dominant because it
* may effectively prevent deeper idle states from being selected at one
* point even if no imminent timers are scheduled.
*
* However, frequent timers in the RESIDENCY_THRESHOLD_NS range on one
* CPU are unlikely (user space has a default 50 us slack value for
@@ -494,7 +456,8 @@ constraint:
* shallow idle states regardless of the wakeup type, so the sleep
* length need not be known in that case.
*/
if ((!idx || drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) &&
if (!tick_nohz_tick_stopped() && (!idx ||
drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) &&
(2 * cpu_data->short_idles >= cpu_data->total ||
latency_req < LATENCY_THRESHOLD_NS))
goto out_tick;
@@ -502,6 +465,30 @@ constraint:
duration_ns = tick_nohz_get_sleep_length(&delta_tick);
cpu_data->sleep_length_ns = duration_ns;
/*
* If the tick has been stopped and the closest timer is too far away,
* update the selection to prevent the CPU from getting stuck in a
* shallow idle state for too long.
*/
if (tick_nohz_tick_stopped() && duration_ns > SAFE_TIMER_RANGE_NS &&
drv->states[idx].target_residency_ns < TICK_NSEC) {
/*
* Look for the deepest enabled idle state with exit latency
* within the PM QoS limit and with target residency within
* duration_ns.
*/
for (i = constraint_idx; i > idx; i--) {
if (dev->states_usage[i].disable)
continue;
if (drv->states[i].target_residency_ns <= duration_ns) {
idx = i;
break;
}
}
return idx;
}
if (!idx)
goto out_tick;

View File

@@ -983,6 +983,43 @@ static struct cpuidle_state mtl_l_cstates[] __initdata = {
.enter = NULL }
};
/*
 * C-state table for Intel Panther Lake, consumed via idle_cpu_ptl's
 * .state_table and matched by INTEL_PANTHERLAKE_L in intel_idle_ids.
 * NOTE(review): exit_latency/target_residency presumably in microseconds,
 * per intel_idle convention — confirm against struct cpuidle_state docs.
 * Table is terminated by the sentinel entry with .enter == NULL.
 */
static struct cpuidle_state ptl_cstates[] __initdata = {
{
.name = "C1",
.desc = "MWAIT 0x00",
.flags = MWAIT2flg(0x00),
.exit_latency = 1,
.target_residency = 1,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.name = "C1E",
.desc = "MWAIT 0x01",
/* ALWAYS_ENABLE: C1E stays available even when shallow states are cut */
.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 10,
.target_residency = 10,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.name = "C6S",
.desc = "MWAIT 0x21",
/* TLB_FLUSHED: TLB contents are lost across this state */
.flags = MWAIT2flg(0x21) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 300,
.target_residency = 300,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.name = "C10",
.desc = "MWAIT 0x60",
.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 370,
.target_residency = 2500,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
/* Sentinel: NULL .enter terminates the table */
.enter = NULL }
};
static struct cpuidle_state gmt_cstates[] __initdata = {
{
.name = "C1",
@@ -1561,6 +1598,10 @@ static const struct idle_cpu idle_cpu_mtl_l __initconst = {
.state_table = mtl_l_cstates,
};
/* Panther Lake idle_cpu descriptor: binds ptl_cstates to the CPU-ID
 * match entry (INTEL_PANTHERLAKE_L) in intel_idle_ids. */
static const struct idle_cpu idle_cpu_ptl __initconst = {
.state_table = ptl_cstates,
};
static const struct idle_cpu idle_cpu_gmt __initconst = {
.state_table = gmt_cstates,
};
@@ -1669,6 +1710,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
X86_MATCH_VFM(INTEL_ALDERLAKE, &idle_cpu_adl),
X86_MATCH_VFM(INTEL_ALDERLAKE_L, &idle_cpu_adl_l),
X86_MATCH_VFM(INTEL_METEORLAKE_L, &idle_cpu_mtl_l),
X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &idle_cpu_ptl),
X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &idle_cpu_gmt),
X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &idle_cpu_spr),
X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &idle_cpu_spr),

View File

@@ -2742,8 +2742,8 @@ struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table,
break;
}
}
break;
}
break;
}
if (IS_ERR(dest_opp)) {

View File

@@ -130,22 +130,24 @@ void opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
{
struct dentry *pdentry = opp_table->dentry;
struct dentry *d;
unsigned long id;
char name[25]; /* 20 chars for 64 bit value + 5 (opp:\0) */
char name[36]; /* "opp:"(4) + u64(20) + "-" (1) + u32(10) + NULL(1) */
/*
* Get directory name for OPP.
*
* - Normally rate is unique to each OPP, use it to get unique opp-name.
* - Normally rate is unique to each OPP, use it to get unique opp-name,
* together with performance level if available.
* - For some devices rate isn't available or there are multiple, use
* index instead for them.
*/
if (likely(opp_table->clk_count == 1 && opp->rates[0]))
id = opp->rates[0];
else
id = _get_opp_count(opp_table);
snprintf(name, sizeof(name), "opp:%lu", id);
if (likely(opp_table->clk_count == 1 && opp->rates[0])) {
if (opp->level == OPP_LEVEL_UNSET)
snprintf(name, sizeof(name), "opp:%lu", opp->rates[0]);
else
snprintf(name, sizeof(name), "opp:%lu-%u", opp->rates[0], opp->level);
} else {
snprintf(name, sizeof(name), "opp:%u", _get_opp_count(opp_table));
}
/* Create per-opp directory */
d = debugfs_create_dir(name, pdentry);

View File

@@ -322,11 +322,14 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
error = snapshot_write_finalize(&data->handle);
if (error)
break;
if (data->mode != O_WRONLY || !data->frozen ||
!snapshot_image_loaded(&data->handle)) {
if (data->mode != O_WRONLY || !data->frozen) {
error = -EPERM;
break;
}
if (!snapshot_image_loaded(&data->handle)) {
error = -ENODATA;
break;
}
error = hibernation_restore(data->platform_support);
break;