diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index a595953f1d6d..e72d26acae79 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -14,8 +14,6 @@ #include #include -#include - #include #include #include diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index cac5997dca50..d6d8386d3f02 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -81,7 +81,7 @@ config HALTPOLL_CPUIDLE before halting in the guest (more efficient than polling in the host via halt_poll_ns for some scenarios). -endif +endif # CPU_IDLE config ARCH_NEEDS_CPU_IDLE_COUPLED def_bool n diff --git a/drivers/cpuidle/Kconfig.mips b/drivers/cpuidle/Kconfig.mips index c3c011af4a35..88728b2b4ea0 100644 --- a/drivers/cpuidle/Kconfig.mips +++ b/drivers/cpuidle/Kconfig.mips @@ -4,7 +4,7 @@ # config MIPS_CPS_CPUIDLE bool "CPU Idle driver for MIPS CPS platforms" - depends on CPU_IDLE && MIPS_CPS + depends on MIPS_CPS depends on SYS_SUPPORTS_MIPS_CPS select ARCH_NEEDS_CPU_IDLE_COUPLED if MIPS_MT || CPU_MIPSR6 select GENERIC_CLOCKEVENTS_BROADCAST if SMP diff --git a/drivers/cpuidle/Kconfig.powerpc b/drivers/cpuidle/Kconfig.powerpc index a797a02b7b6f..1931ac8faffb 100644 --- a/drivers/cpuidle/Kconfig.powerpc +++ b/drivers/cpuidle/Kconfig.powerpc @@ -4,7 +4,6 @@ # config PSERIES_CPUIDLE bool "Cpuidle driver for pSeries platforms" - depends on CPU_IDLE depends on PPC_PSERIES default y help @@ -13,7 +12,6 @@ config PSERIES_CPUIDLE config POWERNV_CPUIDLE bool "Cpuidle driver for powernv platforms" - depends on CPU_IDLE depends on PPC_POWERNV default y help diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index c7876e9e024f..8c037db46792 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -679,16 +679,16 @@ int cpuidle_register_device(struct cpuidle_device *dev) if (!dev) return -EINVAL; - mutex_lock(&cpuidle_lock); + guard(mutex)(&cpuidle_lock); if (dev->registered) - goto out_unlock; + 
return ret; __cpuidle_device_init(dev); ret = __cpuidle_register_device(dev); if (ret) - goto out_unlock; + return ret; ret = cpuidle_add_sysfs(dev); if (ret) @@ -700,16 +700,14 @@ int cpuidle_register_device(struct cpuidle_device *dev) cpuidle_install_idle_handler(); -out_unlock: - mutex_unlock(&cpuidle_lock); - return ret; out_sysfs: cpuidle_remove_sysfs(dev); out_unregister: __cpuidle_unregister_device(dev); - goto out_unlock; + + return ret; } EXPORT_SYMBOL_GPL(cpuidle_register_device); diff --git a/drivers/cpuidle/governors/gov.h b/drivers/cpuidle/governors/gov.h index 99e067d9668c..cd06a2e7b506 100644 --- a/drivers/cpuidle/governors/gov.h +++ b/drivers/cpuidle/governors/gov.h @@ -10,5 +10,10 @@ * check the time till the closest expected timer event. */ #define RESIDENCY_THRESHOLD_NS (15 * NSEC_PER_USEC) +/* + * If the closest timer is in this range, the governor idle state selection need + * not be adjusted after the scheduler tick has been stopped. + */ +#define SAFE_TIMER_RANGE_NS (2 * TICK_NSEC) #endif /* __CPUIDLE_GOVERNOR_H */ diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 899ff16ff1fe..544a5d593007 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -261,13 +261,16 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns); /* * If the tick is already stopped, the cost of possible short - * idle duration misprediction is much higher, because the CPU - * may be stuck in a shallow idle state for a long time as a - * result of it. In that case, say we might mispredict and use - * the known time till the closest timer event for the idle - * state selection. + * idle duration misprediction is higher because the CPU may get + * stuck in a shallow idle state then. 
To avoid that, if + * predicted_ns is small enough, say it might be mispredicted + * and use the known time till the closest timer for idle state + * selection unless that timer is going to trigger within + * SAFE_TIMER_RANGE_NS in which case it can be regarded as a + * sufficient safety net. */ - if (tick_nohz_tick_stopped() && predicted_ns < TICK_NSEC) + if (tick_nohz_tick_stopped() && predicted_ns < TICK_NSEC && + data->next_timer_ns > SAFE_TIMER_RANGE_NS) predicted_ns = data->next_timer_ns; } else { /* diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index bec0142377b8..ac43b9b013b3 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -407,50 +407,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * better choice. */ if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) { - int min_idx = idx0; - - if (tick_nohz_tick_stopped()) { - /* - * Look for the shallowest idle state below the current - * candidate one whose target residency is at least - * equal to the tick period length. - */ - while (min_idx < idx && - drv->states[min_idx].target_residency_ns < TICK_NSEC) - min_idx++; - - /* - * Avoid selecting a state with a lower index, but with - * the same target residency as the current candidate - * one. - */ - if (drv->states[min_idx].target_residency_ns == - drv->states[idx].target_residency_ns) - goto constraint; - } - - /* - * If the minimum state index is greater than or equal to the - * index of the state with the maximum intercepts metric and - * the corresponding state is enabled, there is no need to look - * at the deeper states. 
- */ - if (min_idx >= intercept_max_idx && - !dev->states_usage[min_idx].disable) { - idx = min_idx; - goto constraint; - } - /* * Look for the deepest enabled idle state, at most as deep as * the one with the maximum intercepts metric, whose target * residency had not been greater than the idle duration in over * a half of the relevant cases in the past. - * - * Take the possible duration limitation present if the tick - * has been stopped already into account. */ - for (i = idx - 1, intercept_sum = 0; i >= min_idx; i--) { + for (i = idx - 1, intercept_sum = 0; i >= idx0; i--) { intercept_sum += cpu_data->state_bins[i].intercepts; if (dev->states_usage[i].disable) @@ -463,7 +426,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } } -constraint: /* * If there is a latency constraint, it may be necessary to select an * idle state shallower than the current candidate one. @@ -472,13 +434,13 @@ constraint: idx = constraint_idx; /* - * If either the candidate state is state 0 or its target residency is - * low enough, there is basically nothing more to do, but if the sleep - * length is not updated, the subsequent wakeup will be counted as an - * "intercept" which may be problematic in the cases when timer wakeups - * are dominant. Namely, it may effectively prevent deeper idle states - * from being selected at one point even if no imminent timers are - * scheduled. + * If the tick has not been stopped and either the candidate state is + * state 0 or its target residency is low enough, there is basically + * nothing more to do, but if the sleep length is not updated, the + * subsequent wakeup will be counted as an "intercept". That may be + * problematic in the cases when timer wakeups are dominant because it + * may effectively prevent deeper idle states from being selected at one + * point even if no imminent timers are scheduled. 
* * However, frequent timers in the RESIDENCY_THRESHOLD_NS range on one * CPU are unlikely (user space has a default 50 us slack value for @@ -494,7 +456,8 @@ constraint: * shallow idle states regardless of the wakeup type, so the sleep * length need not be known in that case. */ - if ((!idx || drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) && + if (!tick_nohz_tick_stopped() && (!idx || + drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) && (2 * cpu_data->short_idles >= cpu_data->total || latency_req < LATENCY_THRESHOLD_NS)) goto out_tick; @@ -502,6 +465,30 @@ constraint: duration_ns = tick_nohz_get_sleep_length(&delta_tick); cpu_data->sleep_length_ns = duration_ns; + /* + * If the tick has been stopped and the closest timer is too far away, + * update the selection to prevent the CPU from getting stuck in a + * shallow idle state for too long. + */ + if (tick_nohz_tick_stopped() && duration_ns > SAFE_TIMER_RANGE_NS && + drv->states[idx].target_residency_ns < TICK_NSEC) { + /* + * Look for the deepest enabled idle state with exit latency + * within the PM QoS limit and with target residency within + * duration_ns. 
+ */ + for (i = constraint_idx; i > idx; i--) { + if (dev->states_usage[i].disable) + continue; + + if (drv->states[i].target_residency_ns <= duration_ns) { + idx = i; + break; + } + } + return idx; + } + if (!idx) goto out_tick; diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index f49c939d636f..f49354e37777 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -983,6 +983,43 @@ static struct cpuidle_state mtl_l_cstates[] __initdata = { .enter = NULL } }; +static struct cpuidle_state ptl_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 10, + .target_residency = 10, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6S", + .desc = "MWAIT 0x21", + .flags = MWAIT2flg(0x21) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, + .target_residency = 300, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C10", + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 370, + .target_residency = 2500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + static struct cpuidle_state gmt_cstates[] __initdata = { { .name = "C1", @@ -1561,6 +1598,10 @@ static const struct idle_cpu idle_cpu_mtl_l __initconst = { .state_table = mtl_l_cstates, }; +static const struct idle_cpu idle_cpu_ptl __initconst = { + .state_table = ptl_cstates, +}; + static const struct idle_cpu idle_cpu_gmt __initconst = { .state_table = gmt_cstates, }; @@ -1669,6 +1710,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_VFM(INTEL_ALDERLAKE, &idle_cpu_adl), X86_MATCH_VFM(INTEL_ALDERLAKE_L, &idle_cpu_adl_l), 
X86_MATCH_VFM(INTEL_METEORLAKE_L, &idle_cpu_mtl_l), + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &idle_cpu_ptl), X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &idle_cpu_gmt), X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &idle_cpu_spr), X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &idle_cpu_spr), diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 866641666e41..da3f5eba4341 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -2742,8 +2742,8 @@ struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table, break; } } - break; } + break; } if (IS_ERR(dest_opp)) { diff --git a/drivers/opp/debugfs.c b/drivers/opp/debugfs.c index 8fc6238b1728..61506d30d5ff 100644 --- a/drivers/opp/debugfs.c +++ b/drivers/opp/debugfs.c @@ -130,22 +130,24 @@ void opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table) { struct dentry *pdentry = opp_table->dentry; struct dentry *d; - unsigned long id; - char name[25]; /* 20 chars for 64 bit value + 5 (opp:\0) */ + char name[36]; /* "opp:"(4) + u64(20) + "-"(1) + u32(10) + NUL(1) */ /* * Get directory name for OPP. * - * - Normally rate is unique to each OPP, use it to get unique opp-name. + * - Normally rate is unique to each OPP, use it to get unique opp-name, + * together with performance level if available. * - For some devices rate isn't available or there are multiple, use * index instead for them. 
*/ - if (likely(opp_table->clk_count == 1 && opp->rates[0])) - id = opp->rates[0]; - else - id = _get_opp_count(opp_table); - - snprintf(name, sizeof(name), "opp:%lu", id); + if (likely(opp_table->clk_count == 1 && opp->rates[0])) { + if (opp->level == OPP_LEVEL_UNSET) + snprintf(name, sizeof(name), "opp:%lu", opp->rates[0]); + else + snprintf(name, sizeof(name), "opp:%lu-%u", opp->rates[0], opp->level); + } else { + snprintf(name, sizeof(name), "opp:%u", _get_opp_count(opp_table)); + } /* Create per-opp directory */ d = debugfs_create_dir(name, pdentry); diff --git a/kernel/power/user.c b/kernel/power/user.c index 4401cfe26e5c..be77f3556bd7 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -322,11 +322,14 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = snapshot_write_finalize(&data->handle); if (error) break; - if (data->mode != O_WRONLY || !data->frozen || - !snapshot_image_loaded(&data->handle)) { + if (data->mode != O_WRONLY || !data->frozen) { error = -EPERM; break; } + if (!snapshot_image_loaded(&data->handle)) { + error = -ENODATA; + break; + } error = hibernation_restore(data->platform_support); break;