mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 06:44:00 -04:00
Merge tag 'sched-core-2026-04-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"Fair scheduling updates:
- Skip SCHED_IDLE rq for SCHED_IDLE tasks (Christian Loehle)
- Remove superfluous rcu_read_lock() in the wakeup path (K Prateek Nayak)
- Simplify the entry condition for update_idle_cpu_scan() (K Prateek Nayak)
- Simplify SIS_UTIL handling in select_idle_cpu() (K Prateek Nayak)
- Avoid overflow in enqueue_entity() (K Prateek Nayak)
- Update overutilized detection (Vincent Guittot)
- Prevent negative lag increase during delayed dequeue (Vincent Guittot)
- Clear buddies for preempt_short (Vincent Guittot)
- Implement more complex proportional newidle balance (Peter Zijlstra)
- Increase weight bits for avg_vruntime (Peter Zijlstra)
- Use full weight to __calc_delta() (Peter Zijlstra)
RT and DL scheduling updates:
- Fix incorrect schedstats for rt and dl thread (Dengjun Su)
- Skip group schedulable check with rt_group_sched=0 (Michal Koutný)
- Move group schedulability check to sched_rt_global_validate()
(Michal Koutný)
- Add reporting of runtime left & abs deadline to sched_getattr()
for DEADLINE tasks (Tommaso Cucinotta)
Scheduling topology updates by K Prateek Nayak:
- Compute sd_weight considering cpuset partitions
- Extract "imb_numa_nr" calculation into a separate helper
- Allocate per-CPU sched_domain_shared in s_data
- Switch to assigning "sd->shared" from s_data
- Remove sched_domain_shared allocation with sd_data
Energy-aware scheduling updates:
- Filter false overloaded_group case for EAS (Vincent Guittot)
- PM: EM: Switch to rcu_dereference_all() in wakeup path
(Dietmar Eggemann)
Infrastructure updates:
- Replace use of system_unbound_wq with system_dfl_wq (Marco Crivellari)
Proxy scheduling updates by John Stultz:
- Make class_schedulers avoid pushing current, and get rid of proxy_tag_curr()
- Minimise repeated sched_proxy_exec() checking
- Fix potentially missing balancing with Proxy Exec
- Fix and improve task::blocked_on et al handling
- Add assert_balance_callbacks_empty() helper
- Add logic to zap balancing callbacks if we pick again
- Move attach_one_task() and attach_task() helpers to sched.h
- Handle blocked-waiter migration (and return migration)
- Add K Prateek Nayak to scheduler reviewers for proxy execution
Misc cleanups and fixes by John Stultz, Joseph Salisbury, Peter
Zijlstra, K Prateek Nayak, Michal Koutný, Randy Dunlap, Shrikanth
Hegde, Vincent Guittot, Zhan Xusheng and Xie Yuanbin"
* tag 'sched-core-2026-04-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (46 commits)
sched/eevdf: Clear buddies for preempt_short
sched/rt: Cleanup global RT bandwidth functions
sched/rt: Move group schedulability check to sched_rt_global_validate()
sched/rt: Skip group schedulable check with rt_group_sched=0
sched/fair: Avoid overflow in enqueue_entity()
sched: Use u64 for bandwidth ratio calculations
sched/fair: Prevent negative lag increase during delayed dequeue
sched/fair: Use sched_energy_enabled()
sched: Handle blocked-waiter migration (and return migration)
sched: Move attach_one_task and attach_task helpers to sched.h
sched: Add logic to zap balance callbacks if we pick again
sched: Add assert_balance_callbacks_empty helper
sched/locking: Add special p->blocked_on==PROXY_WAKING value for proxy return-migration
sched: Fix modifying donor->blocked on without proper locking
locking: Add task::blocked_lock to serialize blocked_on state
sched: Fix potentially missing balancing with Proxy Exec
sched: Minimise repeated sched_proxy_exec() checking
sched: Make class_schedulers avoid pushing current, and get rid of proxy_tag_curr()
MAINTAINERS: Add K Prateek Nayak to scheduler reviewers
sched/core: Get this cpu once in ttwu_queue_cond()
...
This commit is contained in:
@@ -23708,6 +23708,7 @@ R: Steven Rostedt <rostedt@goodmis.org> (SCHED_FIFO/SCHED_RR)
|
||||
R: Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH)
|
||||
R: Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING)
|
||||
R: Valentin Schneider <vschneid@redhat.com> (TOPOLOGY)
|
||||
R: K Prateek Nayak <kprateek.nayak@amd.com>
|
||||
L: linux-kernel@vger.kernel.org
|
||||
S: Maintained
|
||||
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core
|
||||
|
||||
@@ -136,9 +136,6 @@ static inline void mm_reset_untag_mask(struct mm_struct *mm)
|
||||
}
|
||||
#endif
|
||||
|
||||
#define enter_lazy_tlb enter_lazy_tlb
|
||||
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
|
||||
|
||||
extern void mm_init_global_asid(struct mm_struct *mm);
|
||||
extern void mm_free_global_asid(struct mm_struct *mm);
|
||||
|
||||
|
||||
@@ -172,6 +172,28 @@ struct tlb_state_shared {
|
||||
};
|
||||
DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
|
||||
|
||||
/*
|
||||
* Please ignore the name of this function. It should be called
|
||||
* switch_to_kernel_thread().
|
||||
*
|
||||
* enter_lazy_tlb() is a hint from the scheduler that we are entering a
|
||||
* kernel thread or other context without an mm. Acceptable implementations
|
||||
* include doing nothing whatsoever, switching to init_mm, or various clever
|
||||
* lazy tricks to try to minimize TLB flushes.
|
||||
*
|
||||
* The scheduler reserves the right to call enter_lazy_tlb() several times
|
||||
* in a row. It will notify us that we're going back to a real mm by
|
||||
* calling switch_mm_irqs_off().
|
||||
*/
|
||||
#define enter_lazy_tlb enter_lazy_tlb
|
||||
static __always_inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
|
||||
{
|
||||
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
|
||||
return;
|
||||
|
||||
this_cpu_write(cpu_tlbstate_shared.is_lazy, true);
|
||||
}
|
||||
|
||||
bool nmi_uaccess_okay(void);
|
||||
#define nmi_uaccess_okay nmi_uaccess_okay
|
||||
|
||||
@@ -480,6 +502,10 @@ static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
#else /* !MODULE */
|
||||
#define enter_lazy_tlb enter_lazy_tlb
|
||||
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
|
||||
__compiletime_error("enter_lazy_tlb() should not be used in modules");
|
||||
#endif /* !MODULE */
|
||||
|
||||
static inline void __native_tlb_flush_global(unsigned long cr4)
|
||||
|
||||
@@ -971,27 +971,6 @@ reload_tlb:
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Please ignore the name of this function. It should be called
|
||||
* switch_to_kernel_thread().
|
||||
*
|
||||
* enter_lazy_tlb() is a hint from the scheduler that we are entering a
|
||||
* kernel thread or other context without an mm. Acceptable implementations
|
||||
* include doing nothing whatsoever, switching to init_mm, or various clever
|
||||
* lazy tricks to try to minimize TLB flushes.
|
||||
*
|
||||
* The scheduler reserves the right to call enter_lazy_tlb() several times
|
||||
* in a row. It will notify us that we're going back to a real mm by
|
||||
* calling switch_mm_irqs_off().
|
||||
*/
|
||||
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
|
||||
{
|
||||
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
|
||||
return;
|
||||
|
||||
this_cpu_write(cpu_tlbstate_shared.is_lazy, true);
|
||||
}
|
||||
|
||||
/*
|
||||
* Using a temporary mm allows to set temporary mappings that are not accessible
|
||||
* by other CPUs. Such mappings are needed to perform sensitive memory writes
|
||||
|
||||
@@ -248,7 +248,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
|
||||
struct em_perf_state *ps;
|
||||
int i;
|
||||
|
||||
WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n");
|
||||
lockdep_assert(rcu_read_lock_any_held());
|
||||
|
||||
if (!sum_util)
|
||||
return 0;
|
||||
@@ -267,7 +267,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
|
||||
* Find the lowest performance state of the Energy Model above the
|
||||
* requested performance.
|
||||
*/
|
||||
em_table = rcu_dereference(pd->em_table);
|
||||
em_table = rcu_dereference_all(pd->em_table);
|
||||
i = em_pd_get_efficient_state(em_table->state, pd, max_util);
|
||||
ps = &em_table->state[i];
|
||||
|
||||
|
||||
@@ -1239,6 +1239,7 @@ struct task_struct {
|
||||
#endif
|
||||
|
||||
struct mutex *blocked_on; /* lock we're blocked on */
|
||||
raw_spinlock_t blocked_lock;
|
||||
|
||||
#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
|
||||
/*
|
||||
@@ -2180,61 +2181,85 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock) __must_hold(lock);
|
||||
})
|
||||
|
||||
#ifndef CONFIG_PREEMPT_RT
|
||||
|
||||
/*
|
||||
* With proxy exec, if a task has been proxy-migrated, it may be a donor
|
||||
* on a cpu that it can't actually run on. Thus we need a special state
|
||||
* to denote that the task is being woken, but that it needs to be
|
||||
* evaluated for return-migration before it is run. So if the task is
|
||||
* blocked_on PROXY_WAKING, return migrate it before running it.
|
||||
*/
|
||||
#define PROXY_WAKING ((struct mutex *)(-1L))
|
||||
|
||||
static inline struct mutex *__get_task_blocked_on(struct task_struct *p)
|
||||
{
|
||||
struct mutex *m = p->blocked_on;
|
||||
|
||||
if (m)
|
||||
lockdep_assert_held_once(&m->wait_lock);
|
||||
return m;
|
||||
lockdep_assert_held_once(&p->blocked_lock);
|
||||
return p->blocked_on == PROXY_WAKING ? NULL : p->blocked_on;
|
||||
}
|
||||
|
||||
static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
|
||||
{
|
||||
struct mutex *blocked_on = READ_ONCE(p->blocked_on);
|
||||
|
||||
WARN_ON_ONCE(!m);
|
||||
/* The task should only be setting itself as blocked */
|
||||
WARN_ON_ONCE(p != current);
|
||||
/* Currently we serialize blocked_on under the mutex::wait_lock */
|
||||
lockdep_assert_held_once(&m->wait_lock);
|
||||
/* Currently we serialize blocked_on under the task::blocked_lock */
|
||||
lockdep_assert_held_once(&p->blocked_lock);
|
||||
/*
|
||||
* Check to ensure we don't overwrite an existing mutex value
|
||||
* with a different mutex. Note, setting it to the same
|
||||
* lock repeatedly is ok.
|
||||
*/
|
||||
WARN_ON_ONCE(blocked_on && blocked_on != m);
|
||||
WRITE_ONCE(p->blocked_on, m);
|
||||
}
|
||||
|
||||
static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m)
|
||||
{
|
||||
guard(raw_spinlock_irqsave)(&m->wait_lock);
|
||||
__set_task_blocked_on(p, m);
|
||||
WARN_ON_ONCE(p->blocked_on && p->blocked_on != m);
|
||||
p->blocked_on = m;
|
||||
}
|
||||
|
||||
static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m)
|
||||
{
|
||||
if (m) {
|
||||
struct mutex *blocked_on = READ_ONCE(p->blocked_on);
|
||||
|
||||
/* Currently we serialize blocked_on under the mutex::wait_lock */
|
||||
lockdep_assert_held_once(&m->wait_lock);
|
||||
/*
|
||||
* There may be cases where we re-clear already cleared
|
||||
* blocked_on relationships, but make sure we are not
|
||||
* clearing the relationship with a different lock.
|
||||
*/
|
||||
WARN_ON_ONCE(blocked_on && blocked_on != m);
|
||||
}
|
||||
WRITE_ONCE(p->blocked_on, NULL);
|
||||
/* Currently we serialize blocked_on under the task::blocked_lock */
|
||||
lockdep_assert_held_once(&p->blocked_lock);
|
||||
/*
|
||||
* There may be cases where we re-clear already cleared
|
||||
* blocked_on relationships, but make sure we are not
|
||||
* clearing the relationship with a different lock.
|
||||
*/
|
||||
WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m && p->blocked_on != PROXY_WAKING);
|
||||
p->blocked_on = NULL;
|
||||
}
|
||||
|
||||
static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m)
|
||||
{
|
||||
guard(raw_spinlock_irqsave)(&m->wait_lock);
|
||||
guard(raw_spinlock_irqsave)(&p->blocked_lock);
|
||||
__clear_task_blocked_on(p, m);
|
||||
}
|
||||
|
||||
static inline void __set_task_blocked_on_waking(struct task_struct *p, struct mutex *m)
|
||||
{
|
||||
/* Currently we serialize blocked_on under the task::blocked_lock */
|
||||
lockdep_assert_held_once(&p->blocked_lock);
|
||||
|
||||
if (!sched_proxy_exec()) {
|
||||
__clear_task_blocked_on(p, m);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Don't set PROXY_WAKING if blocked_on was already cleared */
|
||||
if (!p->blocked_on)
|
||||
return;
|
||||
/*
|
||||
* There may be cases where we set PROXY_WAKING on tasks that were
|
||||
* already set to waking, but make sure we are not changing
|
||||
* the relationship with a different lock.
|
||||
*/
|
||||
WARN_ON_ONCE(m && p->blocked_on != m && p->blocked_on != PROXY_WAKING);
|
||||
p->blocked_on = PROXY_WAKING;
|
||||
}
|
||||
|
||||
static inline void set_task_blocked_on_waking(struct task_struct *p, struct mutex *m)
|
||||
{
|
||||
guard(raw_spinlock_irqsave)(&p->blocked_lock);
|
||||
__set_task_blocked_on_waking(p, m);
|
||||
}
|
||||
|
||||
#else
|
||||
static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
|
||||
{
|
||||
@@ -2243,6 +2268,14 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mute
|
||||
static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void __set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m)
|
||||
{
|
||||
}
|
||||
#endif /* !CONFIG_PREEMPT_RT */
|
||||
|
||||
static __always_inline bool need_resched(void)
|
||||
|
||||
@@ -95,6 +95,7 @@ struct sched_domain {
|
||||
unsigned int newidle_call;
|
||||
unsigned int newidle_success;
|
||||
unsigned int newidle_ratio;
|
||||
u64 newidle_stamp;
|
||||
u64 max_newidle_lb_cost;
|
||||
unsigned long last_decay_max_lb_cost;
|
||||
|
||||
@@ -141,18 +142,30 @@ struct sched_domain {
|
||||
|
||||
unsigned int span_weight;
|
||||
/*
|
||||
* Span of all CPUs in this domain.
|
||||
* See sched_domain_span(), on why flex arrays are broken.
|
||||
*
|
||||
* NOTE: this field is variable length. (Allocated dynamically
|
||||
* by attaching extra space to the end of the structure,
|
||||
* depending on how many CPUs the kernel has booted up with)
|
||||
*/
|
||||
unsigned long span[];
|
||||
*/
|
||||
};
|
||||
|
||||
static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
|
||||
{
|
||||
return to_cpumask(sd->span);
|
||||
/*
|
||||
* Turns out that C flexible arrays are fundamentally broken since it
|
||||
* is allowed for offsetof(*sd, span) < sizeof(*sd), this means that
|
||||
* structure initialization *sd = { ... }; which writes every byte
|
||||
* inside sizeof(*type), will over-write the start of the flexible
|
||||
* array.
|
||||
*
|
||||
* Luckily, the way we allocate sched_domain is by:
|
||||
*
|
||||
* sizeof(*sd) + cpumask_size()
|
||||
*
|
||||
* this means that we have sufficient space for the whole flex array
|
||||
* *outside* of sizeof(*sd). So use that, and avoid using sd->span.
|
||||
*/
|
||||
unsigned long *bitmap = (void *)sd + sizeof(*sd);
|
||||
return to_cpumask(bitmap);
|
||||
}
|
||||
|
||||
extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
|
||||
@@ -171,7 +184,6 @@ typedef int (*sched_domain_flags_f)(void);
|
||||
|
||||
struct sd_data {
|
||||
struct sched_domain *__percpu *sd;
|
||||
struct sched_domain_shared *__percpu *sds;
|
||||
struct sched_group *__percpu *sg;
|
||||
struct sched_group_capacity *__percpu *sgc;
|
||||
};
|
||||
|
||||
@@ -406,7 +406,7 @@ do { \
|
||||
schedule())
|
||||
|
||||
/**
|
||||
* wait_var_event_killable - wait for a variable to be updated and notified
|
||||
* wait_var_event_interruptible - wait for a variable to be updated and notified
|
||||
* @var: the address of variable being waited on
|
||||
* @condition: the condition to wait for
|
||||
*
|
||||
@@ -492,7 +492,7 @@ do { \
|
||||
* wait_var_event_mutex - wait for a variable to be updated under a mutex
|
||||
* @var: the address of the variable being waited on
|
||||
* @condition: condition to wait for
|
||||
* @mutex: the mutex which protects updates to the variable
|
||||
* @lock: the mutex which protects updates to the variable
|
||||
*
|
||||
* Wait for a condition which can only be reliably tested while holding
|
||||
* a mutex. The variables assessed in the condition will normally be
|
||||
|
||||
@@ -149,4 +149,7 @@ struct clone_args {
|
||||
SCHED_FLAG_KEEP_ALL | \
|
||||
SCHED_FLAG_UTIL_CLAMP)
|
||||
|
||||
/* Only for sched_getattr() own flag param, if task is SCHED_DEADLINE */
|
||||
#define SCHED_GETATTR_FLAG_DL_DYNAMIC 0x01
|
||||
|
||||
#endif /* _UAPI_LINUX_SCHED_H */
|
||||
|
||||
@@ -169,6 +169,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
|
||||
.journal_info = NULL,
|
||||
INIT_CPU_TIMERS(init_task)
|
||||
.pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
|
||||
.blocked_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.blocked_lock),
|
||||
.timer_slack_ns = 50000, /* 50 usec default slack */
|
||||
.thread_pid = &init_struct_pid,
|
||||
.thread_node = LIST_HEAD_INIT(init_signals.thread_head),
|
||||
|
||||
@@ -2113,6 +2113,7 @@ __latent_entropy struct task_struct *copy_process(
|
||||
ftrace_graph_init_task(p);
|
||||
|
||||
rt_mutex_init_task(p);
|
||||
raw_spin_lock_init(&p->blocked_lock);
|
||||
|
||||
lockdep_assert_irqs_enabled();
|
||||
#ifdef CONFIG_PROVE_LOCKING
|
||||
|
||||
@@ -53,13 +53,13 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
|
||||
lockdep_assert_held(&lock->wait_lock);
|
||||
|
||||
/* Current thread can't be already blocked (since it's executing!) */
|
||||
DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task));
|
||||
DEBUG_LOCKS_WARN_ON(get_task_blocked_on(task));
|
||||
}
|
||||
|
||||
void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
|
||||
struct task_struct *task)
|
||||
{
|
||||
struct mutex *blocked_on = __get_task_blocked_on(task);
|
||||
struct mutex *blocked_on = get_task_blocked_on(task);
|
||||
|
||||
DEBUG_LOCKS_WARN_ON(waiter->task != task);
|
||||
DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock);
|
||||
|
||||
@@ -674,6 +674,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
|
||||
goto err_early_kill;
|
||||
}
|
||||
|
||||
raw_spin_lock(&current->blocked_lock);
|
||||
__set_task_blocked_on(current, lock);
|
||||
set_current_state(state);
|
||||
trace_contention_begin(lock, LCB_F_MUTEX);
|
||||
@@ -687,8 +688,9 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
|
||||
* the handoff.
|
||||
*/
|
||||
if (__mutex_trylock(lock))
|
||||
goto acquired;
|
||||
break;
|
||||
|
||||
raw_spin_unlock(&current->blocked_lock);
|
||||
/*
|
||||
* Check for signals and kill conditions while holding
|
||||
* wait_lock. This ensures the lock cancellation is ordered
|
||||
@@ -711,12 +713,14 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
|
||||
|
||||
first = lock->first_waiter == &waiter;
|
||||
|
||||
raw_spin_lock_irqsave(&lock->wait_lock, flags);
|
||||
raw_spin_lock(&current->blocked_lock);
|
||||
/*
|
||||
* As we likely have been woken up by a task
|
||||
* that has cleared our blocked_on state, re-set
|
||||
* it to the lock we are trying to acquire.
|
||||
*/
|
||||
set_task_blocked_on(current, lock);
|
||||
__set_task_blocked_on(current, lock);
|
||||
set_current_state(state);
|
||||
/*
|
||||
* Here we order against unlock; we must either see it change
|
||||
@@ -727,25 +731,33 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
|
||||
break;
|
||||
|
||||
if (first) {
|
||||
trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
|
||||
bool opt_acquired;
|
||||
|
||||
/*
|
||||
* mutex_optimistic_spin() can call schedule(), so
|
||||
* clear blocked on so we don't become unselectable
|
||||
* we need to release these locks before calling it,
|
||||
* and clear blocked on so we don't become unselectable
|
||||
* to run.
|
||||
*/
|
||||
clear_task_blocked_on(current, lock);
|
||||
if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
|
||||
__clear_task_blocked_on(current, lock);
|
||||
raw_spin_unlock(&current->blocked_lock);
|
||||
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
|
||||
|
||||
trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
|
||||
opt_acquired = mutex_optimistic_spin(lock, ww_ctx, &waiter);
|
||||
|
||||
raw_spin_lock_irqsave(&lock->wait_lock, flags);
|
||||
raw_spin_lock(&current->blocked_lock);
|
||||
__set_task_blocked_on(current, lock);
|
||||
|
||||
if (opt_acquired)
|
||||
break;
|
||||
set_task_blocked_on(current, lock);
|
||||
trace_contention_begin(lock, LCB_F_MUTEX);
|
||||
}
|
||||
|
||||
raw_spin_lock_irqsave(&lock->wait_lock, flags);
|
||||
}
|
||||
raw_spin_lock_irqsave(&lock->wait_lock, flags);
|
||||
acquired:
|
||||
__clear_task_blocked_on(current, lock);
|
||||
__set_current_state(TASK_RUNNING);
|
||||
raw_spin_unlock(&current->blocked_lock);
|
||||
|
||||
if (ww_ctx) {
|
||||
/*
|
||||
@@ -773,11 +785,11 @@ skip_wait:
|
||||
return 0;
|
||||
|
||||
err:
|
||||
__clear_task_blocked_on(current, lock);
|
||||
clear_task_blocked_on(current, lock);
|
||||
__set_current_state(TASK_RUNNING);
|
||||
__mutex_remove_waiter(lock, &waiter);
|
||||
err_early_kill:
|
||||
WARN_ON(__get_task_blocked_on(current));
|
||||
WARN_ON(get_task_blocked_on(current));
|
||||
trace_contention_end(lock, ret);
|
||||
raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
|
||||
debug_mutex_free_waiter(&waiter);
|
||||
@@ -993,7 +1005,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
|
||||
next = waiter->task;
|
||||
|
||||
debug_mutex_wake_waiter(lock, waiter);
|
||||
__clear_task_blocked_on(next, lock);
|
||||
set_task_blocked_on_waking(next, lock);
|
||||
wake_q_add(&wake_q, next);
|
||||
}
|
||||
|
||||
|
||||
@@ -48,6 +48,12 @@ static inline struct task_struct *__mutex_owner(struct mutex *lock)
|
||||
return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS);
|
||||
}
|
||||
|
||||
static inline struct mutex *get_task_blocked_on(struct task_struct *p)
|
||||
{
|
||||
guard(raw_spinlock_irqsave)(&p->blocked_lock);
|
||||
return __get_task_blocked_on(p);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_DEBUG_MUTEXES
|
||||
extern void debug_mutex_lock_common(struct mutex *lock,
|
||||
struct mutex_waiter *waiter);
|
||||
|
||||
@@ -290,11 +290,11 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
|
||||
debug_mutex_wake_waiter(lock, waiter);
|
||||
#endif
|
||||
/*
|
||||
* When waking up the task to die, be sure to clear the
|
||||
* blocked_on pointer. Otherwise we can see circular
|
||||
* blocked_on relationships that can't resolve.
|
||||
* When waking up the task to die, be sure to set the
|
||||
* blocked_on to PROXY_WAKING. Otherwise we can see
|
||||
* circular blocked_on relationships that can't resolve.
|
||||
*/
|
||||
__clear_task_blocked_on(waiter->task, lock);
|
||||
set_task_blocked_on_waking(waiter->task, lock);
|
||||
wake_q_add(wake_q, waiter->task);
|
||||
}
|
||||
|
||||
@@ -345,15 +345,15 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
|
||||
*/
|
||||
if (owner != current) {
|
||||
/*
|
||||
* When waking up the task to wound, be sure to clear the
|
||||
* blocked_on pointer. Otherwise we can see circular
|
||||
* blocked_on relationships that can't resolve.
|
||||
* When waking up the task to wound, be sure to set the
|
||||
* blocked_on to PROXY_WAKING. Otherwise we can see
|
||||
* circular blocked_on relationships that can't resolve.
|
||||
*
|
||||
* NOTE: We pass NULL here instead of lock, because we
|
||||
* are waking the mutex owner, who may be currently
|
||||
* blocked on a different mutex.
|
||||
*/
|
||||
__clear_task_blocked_on(owner, NULL);
|
||||
set_task_blocked_on_waking(owner, NULL);
|
||||
wake_q_add(wake_q, owner);
|
||||
}
|
||||
return true;
|
||||
|
||||
@@ -687,11 +687,6 @@ bool raw_spin_rq_trylock(struct rq *rq)
|
||||
}
|
||||
}
|
||||
|
||||
void raw_spin_rq_unlock(struct rq *rq)
|
||||
{
|
||||
raw_spin_unlock(rq_lockp(rq));
|
||||
}
|
||||
|
||||
/*
|
||||
* double_rq_lock - safely lock two runqueues
|
||||
*/
|
||||
@@ -3905,6 +3900,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
|
||||
|
||||
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
|
||||
{
|
||||
int this_cpu = smp_processor_id();
|
||||
|
||||
/* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */
|
||||
if (!scx_allow_ttwu_queue(p))
|
||||
return false;
|
||||
@@ -3929,10 +3926,10 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
|
||||
* If the CPU does not share cache, then queue the task on the
|
||||
* remote rqs wakelist to avoid accessing remote data.
|
||||
*/
|
||||
if (!cpus_share_cache(smp_processor_id(), cpu))
|
||||
if (!cpus_share_cache(this_cpu, cpu))
|
||||
return true;
|
||||
|
||||
if (cpu == smp_processor_id())
|
||||
if (cpu == this_cpu)
|
||||
return false;
|
||||
|
||||
/*
|
||||
@@ -4796,7 +4793,7 @@ void sched_post_fork(struct task_struct *p)
|
||||
scx_post_fork(p);
|
||||
}
|
||||
|
||||
unsigned long to_ratio(u64 period, u64 runtime)
|
||||
u64 to_ratio(u64 period, u64 runtime)
|
||||
{
|
||||
if (runtime == RUNTIME_INF)
|
||||
return BW_UNIT;
|
||||
@@ -4971,6 +4968,34 @@ static inline void finish_task(struct task_struct *prev)
|
||||
smp_store_release(&prev->on_cpu, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Only called from __schedule context
|
||||
*
|
||||
* There are some cases where we are going to re-do the action
|
||||
* that added the balance callbacks. We may not be in a state
|
||||
* where we can run them, so just zap them so they can be
|
||||
* properly re-added on the next time around. This is similar
|
||||
* handling to running the callbacks, except we just don't call
|
||||
* them.
|
||||
*/
|
||||
static void zap_balance_callbacks(struct rq *rq)
|
||||
{
|
||||
struct balance_callback *next, *head;
|
||||
bool found = false;
|
||||
|
||||
lockdep_assert_rq_held(rq);
|
||||
|
||||
head = rq->balance_callback;
|
||||
while (head) {
|
||||
if (head == &balance_push_callback)
|
||||
found = true;
|
||||
next = head->next;
|
||||
head->next = NULL;
|
||||
head = next;
|
||||
}
|
||||
rq->balance_callback = found ? &balance_push_callback : NULL;
|
||||
}
|
||||
|
||||
static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
|
||||
{
|
||||
void (*func)(struct rq *rq);
|
||||
@@ -5740,7 +5765,7 @@ static void sched_tick_remote(struct work_struct *work)
|
||||
os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
|
||||
WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
|
||||
if (os == TICK_SCHED_REMOTE_RUNNING)
|
||||
queue_delayed_work(system_unbound_wq, dwork, HZ);
|
||||
queue_delayed_work(system_dfl_wq, dwork, HZ);
|
||||
}
|
||||
|
||||
static void sched_tick_start(int cpu)
|
||||
@@ -5759,7 +5784,7 @@ static void sched_tick_start(int cpu)
|
||||
if (os == TICK_SCHED_REMOTE_OFFLINE) {
|
||||
twork->cpu = cpu;
|
||||
INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
|
||||
queue_delayed_work(system_unbound_wq, &twork->work, HZ);
|
||||
queue_delayed_work(system_dfl_wq, &twork->work, HZ);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6557,6 +6582,8 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
|
||||
if (signal_pending_state(task_state, p)) {
|
||||
WRITE_ONCE(p->__state, TASK_RUNNING);
|
||||
*task_state_p = TASK_RUNNING;
|
||||
set_task_blocked_on_waking(p, NULL);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -6594,6 +6621,21 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SCHED_PROXY_EXEC
|
||||
static inline void proxy_set_task_cpu(struct task_struct *p, int cpu)
|
||||
{
|
||||
unsigned int wake_cpu;
|
||||
|
||||
/*
|
||||
* Since we are enqueuing a blocked task on a cpu it may
|
||||
* not be able to run on, preserve wake_cpu when we
|
||||
* __set_task_cpu so we can return the task to where it
|
||||
* was previously runnable.
|
||||
*/
|
||||
wake_cpu = p->wake_cpu;
|
||||
__set_task_cpu(p, cpu);
|
||||
p->wake_cpu = wake_cpu;
|
||||
}
|
||||
|
||||
static inline struct task_struct *proxy_resched_idle(struct rq *rq)
|
||||
{
|
||||
put_prev_set_next_task(rq, rq->donor, rq->idle);
|
||||
@@ -6602,7 +6644,7 @@ static inline struct task_struct *proxy_resched_idle(struct rq *rq)
|
||||
return rq->idle;
|
||||
}
|
||||
|
||||
static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor)
|
||||
static bool proxy_deactivate(struct rq *rq, struct task_struct *donor)
|
||||
{
|
||||
unsigned long state = READ_ONCE(donor->__state);
|
||||
|
||||
@@ -6622,17 +6664,140 @@ static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor)
|
||||
return try_to_block_task(rq, donor, &state, true);
|
||||
}
|
||||
|
||||
static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor)
|
||||
static inline void proxy_release_rq_lock(struct rq *rq, struct rq_flags *rf)
|
||||
__releases(__rq_lockp(rq))
|
||||
{
|
||||
if (!__proxy_deactivate(rq, donor)) {
|
||||
/*
|
||||
* The class scheduler may have queued a balance callback
|
||||
* from pick_next_task() called earlier.
|
||||
*
|
||||
* So here we have to zap callbacks before unlocking the rq
|
||||
* as another CPU may jump in and call sched_balance_rq
|
||||
* which can trip the warning in rq_pin_lock() if we
|
||||
* leave callbacks set.
|
||||
*
|
||||
* After we later reacquire the rq lock, we will force __schedule()
|
||||
* to pick_again, so the callbacks will get re-established.
|
||||
*/
|
||||
zap_balance_callbacks(rq);
|
||||
rq_unpin_lock(rq, rf);
|
||||
raw_spin_rq_unlock(rq);
|
||||
}
|
||||
|
||||
static inline void proxy_reacquire_rq_lock(struct rq *rq, struct rq_flags *rf)
|
||||
__acquires(__rq_lockp(rq))
|
||||
{
|
||||
raw_spin_rq_lock(rq);
|
||||
rq_repin_lock(rq, rf);
|
||||
update_rq_clock(rq);
|
||||
}
|
||||
|
||||
/*
|
||||
* If the blocked-on relationship crosses CPUs, migrate @p to the
|
||||
* owner's CPU.
|
||||
*
|
||||
* This is because we must respect the CPU affinity of execution
|
||||
* contexts (owner) but we can ignore affinity for scheduling
|
||||
* contexts (@p). So we have to move scheduling contexts towards
|
||||
* potential execution contexts.
|
||||
*
|
||||
* Note: The owner can disappear, but simply migrate to @target_cpu
|
||||
* and leave that CPU to sort things out.
|
||||
*/
|
||||
static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
|
||||
struct task_struct *p, int target_cpu)
|
||||
__must_hold(__rq_lockp(rq))
|
||||
{
|
||||
struct rq *target_rq = cpu_rq(target_cpu);
|
||||
|
||||
lockdep_assert_rq_held(rq);
|
||||
WARN_ON(p == rq->curr);
|
||||
/*
|
||||
* Since we are migrating a blocked donor, it could be rq->donor,
|
||||
* and we want to make sure there aren't any references from this
|
||||
* rq to it before we drop the lock. This avoids another cpu
|
||||
* jumping in and grabbing the rq lock and referencing rq->donor
|
||||
* or cfs_rq->curr, etc after we have migrated it to another cpu,
|
||||
* and before we pick_again in __schedule.
|
||||
*
|
||||
* So call proxy_resched_idle() to drop the rq->donor references
|
||||
* before we release the lock.
|
||||
*/
|
||||
proxy_resched_idle(rq);
|
||||
|
||||
deactivate_task(rq, p, DEQUEUE_NOCLOCK);
|
||||
proxy_set_task_cpu(p, target_cpu);
|
||||
|
||||
proxy_release_rq_lock(rq, rf);
|
||||
|
||||
attach_one_task(target_rq, p);
|
||||
|
||||
proxy_reacquire_rq_lock(rq, rf);
|
||||
}
|
||||
|
||||
static void proxy_force_return(struct rq *rq, struct rq_flags *rf,
|
||||
struct task_struct *p)
|
||||
__must_hold(__rq_lockp(rq))
|
||||
{
|
||||
struct rq *task_rq, *target_rq = NULL;
|
||||
int cpu, wake_flag = WF_TTWU;
|
||||
|
||||
lockdep_assert_rq_held(rq);
|
||||
WARN_ON(p == rq->curr);
|
||||
|
||||
if (p == rq->donor)
|
||||
proxy_resched_idle(rq);
|
||||
|
||||
proxy_release_rq_lock(rq, rf);
|
||||
/*
|
||||
* We drop the rq lock, and re-grab task_rq_lock to get
|
||||
* the pi_lock (needed for select_task_rq) as well.
|
||||
*/
|
||||
scoped_guard (task_rq_lock, p) {
|
||||
task_rq = scope.rq;
|
||||
|
||||
/*
|
||||
* XXX: For now, if deactivation failed, set donor
|
||||
* as unblocked, as we aren't doing proxy-migrations
|
||||
* yet (more logic will be needed then).
|
||||
* Since we let go of the rq lock, the task may have been
|
||||
* woken or migrated to another rq before we got the
|
||||
* task_rq_lock. So re-check we're on the same RQ. If
|
||||
* not, the task has already been migrated and that CPU
|
||||
* will handle any futher migrations.
|
||||
*/
|
||||
donor->blocked_on = NULL;
|
||||
if (task_rq != rq)
|
||||
break;
|
||||
|
||||
/*
|
||||
* Similarly, if we've been dequeued, someone else will
|
||||
* wake us
|
||||
*/
|
||||
if (!task_on_rq_queued(p))
|
||||
break;
|
||||
|
||||
/*
|
||||
* Since we should only be calling here from __schedule()
|
||||
* -> find_proxy_task(), no one else should have
|
||||
* assigned current out from under us. But check and warn
|
||||
* if we see this, then bail.
|
||||
*/
|
||||
if (task_current(task_rq, p) || task_on_cpu(task_rq, p)) {
|
||||
WARN_ONCE(1, "%s rq: %i current/on_cpu task %s %d on_cpu: %i\n",
|
||||
__func__, cpu_of(task_rq),
|
||||
p->comm, p->pid, p->on_cpu);
|
||||
break;
|
||||
}
|
||||
|
||||
update_rq_clock(task_rq);
|
||||
deactivate_task(task_rq, p, DEQUEUE_NOCLOCK);
|
||||
cpu = select_task_rq(p, p->wake_cpu, &wake_flag);
|
||||
set_task_cpu(p, cpu);
|
||||
target_rq = cpu_rq(cpu);
|
||||
clear_task_blocked_on(p, NULL);
|
||||
}
|
||||
return NULL;
|
||||
|
||||
if (target_rq)
|
||||
attach_one_task(target_rq, p);
|
||||
|
||||
proxy_reacquire_rq_lock(rq, rf);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -6646,31 +6811,41 @@ static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *d
|
||||
* p->pi_lock
|
||||
* rq->lock
|
||||
* mutex->wait_lock
|
||||
* p->blocked_lock
|
||||
*
|
||||
* Returns the task that is going to be used as execution context (the one
|
||||
* that is actually going to be run on cpu_of(rq)).
|
||||
*/
|
||||
static struct task_struct *
|
||||
find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
|
||||
__must_hold(__rq_lockp(rq))
|
||||
{
|
||||
struct task_struct *owner = NULL;
|
||||
bool curr_in_chain = false;
|
||||
int this_cpu = cpu_of(rq);
|
||||
struct task_struct *p;
|
||||
struct mutex *mutex;
|
||||
int owner_cpu;
|
||||
|
||||
/* Follow blocked_on chain. */
|
||||
for (p = donor; task_is_blocked(p); p = owner) {
|
||||
mutex = p->blocked_on;
|
||||
/* Something changed in the chain, so pick again */
|
||||
if (!mutex)
|
||||
return NULL;
|
||||
for (p = donor; (mutex = p->blocked_on); p = owner) {
|
||||
/* if its PROXY_WAKING, do return migration or run if current */
|
||||
if (mutex == PROXY_WAKING) {
|
||||
if (task_current(rq, p)) {
|
||||
clear_task_blocked_on(p, PROXY_WAKING);
|
||||
return p;
|
||||
}
|
||||
goto force_return;
|
||||
}
|
||||
|
||||
/*
|
||||
* By taking mutex->wait_lock we hold off concurrent mutex_unlock()
|
||||
* and ensure @owner sticks around.
|
||||
*/
|
||||
guard(raw_spinlock)(&mutex->wait_lock);
|
||||
guard(raw_spinlock)(&p->blocked_lock);
|
||||
|
||||
/* Check again that p is blocked with wait_lock held */
|
||||
/* Check again that p is blocked with blocked_lock held */
|
||||
if (mutex != __get_task_blocked_on(p)) {
|
||||
/*
|
||||
* Something changed in the blocked_on chain and
|
||||
@@ -6681,20 +6856,39 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (task_current(rq, p))
|
||||
curr_in_chain = true;
|
||||
|
||||
owner = __mutex_owner(mutex);
|
||||
if (!owner) {
|
||||
__clear_task_blocked_on(p, mutex);
|
||||
return p;
|
||||
/*
|
||||
* If there is no owner, either clear blocked_on
|
||||
* and return p (if it is current and safe to
|
||||
* just run on this rq), or return-migrate the task.
|
||||
*/
|
||||
if (task_current(rq, p)) {
|
||||
__clear_task_blocked_on(p, NULL);
|
||||
return p;
|
||||
}
|
||||
goto force_return;
|
||||
}
|
||||
|
||||
if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) {
|
||||
/* XXX Don't handle blocked owners/delayed dequeue yet */
|
||||
return proxy_deactivate(rq, donor);
|
||||
if (curr_in_chain)
|
||||
return proxy_resched_idle(rq);
|
||||
goto deactivate;
|
||||
}
|
||||
|
||||
if (task_cpu(owner) != this_cpu) {
|
||||
/* XXX Don't handle migrations yet */
|
||||
return proxy_deactivate(rq, donor);
|
||||
owner_cpu = task_cpu(owner);
|
||||
if (owner_cpu != this_cpu) {
|
||||
/*
|
||||
* @owner can disappear, simply migrate to @owner_cpu
|
||||
* and leave that CPU to sort things out.
|
||||
*/
|
||||
if (curr_in_chain)
|
||||
return proxy_resched_idle(rq);
|
||||
goto migrate_task;
|
||||
}
|
||||
|
||||
if (task_on_rq_migrating(owner)) {
|
||||
@@ -6751,9 +6945,20 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
|
||||
* guarantee its existence, as per ttwu_remote().
|
||||
*/
|
||||
}
|
||||
|
||||
WARN_ON_ONCE(owner && !owner->on_rq);
|
||||
return owner;
|
||||
|
||||
deactivate:
|
||||
if (proxy_deactivate(rq, donor))
|
||||
return NULL;
|
||||
/* If deactivate fails, force return */
|
||||
p = donor;
|
||||
force_return:
|
||||
proxy_force_return(rq, rf, p);
|
||||
return NULL;
|
||||
migrate_task:
|
||||
proxy_migrate_task(rq, rf, p, owner_cpu);
|
||||
return NULL;
|
||||
}
|
||||
#else /* SCHED_PROXY_EXEC */
|
||||
static struct task_struct *
|
||||
@@ -6764,23 +6969,6 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
|
||||
}
|
||||
#endif /* SCHED_PROXY_EXEC */
|
||||
|
||||
static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner)
|
||||
{
|
||||
if (!sched_proxy_exec())
|
||||
return;
|
||||
/*
|
||||
* pick_next_task() calls set_next_task() on the chosen task
|
||||
* at some point, which ensures it is not push/pullable.
|
||||
* However, the chosen/donor task *and* the mutex owner form an
|
||||
* atomic pair wrt push/pull.
|
||||
*
|
||||
* Make sure owner we run is not pushable. Unfortunately we can
|
||||
* only deal with that by means of a dequeue/enqueue cycle. :-/
|
||||
*/
|
||||
dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE);
|
||||
enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE);
|
||||
}
|
||||
|
||||
/*
|
||||
* __schedule() is the main scheduler function.
|
||||
*
|
||||
@@ -6907,16 +7095,45 @@ static void __sched notrace __schedule(int sched_mode)
|
||||
}
|
||||
|
||||
pick_again:
|
||||
assert_balance_callbacks_empty(rq);
|
||||
next = pick_next_task(rq, rq->donor, &rf);
|
||||
rq_set_donor(rq, next);
|
||||
rq->next_class = next->sched_class;
|
||||
if (unlikely(task_is_blocked(next))) {
|
||||
next = find_proxy_task(rq, next, &rf);
|
||||
if (!next)
|
||||
goto pick_again;
|
||||
if (next == rq->idle)
|
||||
goto keep_resched;
|
||||
if (sched_proxy_exec()) {
|
||||
struct task_struct *prev_donor = rq->donor;
|
||||
|
||||
rq_set_donor(rq, next);
|
||||
if (unlikely(next->blocked_on)) {
|
||||
next = find_proxy_task(rq, next, &rf);
|
||||
if (!next) {
|
||||
zap_balance_callbacks(rq);
|
||||
goto pick_again;
|
||||
}
|
||||
if (next == rq->idle) {
|
||||
zap_balance_callbacks(rq);
|
||||
goto keep_resched;
|
||||
}
|
||||
}
|
||||
if (rq->donor == prev_donor && prev != next) {
|
||||
struct task_struct *donor = rq->donor;
|
||||
/*
|
||||
* When transitioning like:
|
||||
*
|
||||
* prev next
|
||||
* donor: B B
|
||||
* curr: A B or C
|
||||
*
|
||||
* then put_prev_set_next_task() will not have done
|
||||
* anything, since B == B. However, A might have
|
||||
* missed a RT/DL balance opportunity due to being
|
||||
* on_cpu.
|
||||
*/
|
||||
donor->sched_class->put_prev_task(rq, donor, donor);
|
||||
donor->sched_class->set_next_task(rq, donor, true);
|
||||
}
|
||||
} else {
|
||||
rq_set_donor(rq, next);
|
||||
}
|
||||
|
||||
picked:
|
||||
clear_tsk_need_resched(prev);
|
||||
clear_preempt_need_resched();
|
||||
@@ -6932,9 +7149,6 @@ keep_resched:
|
||||
*/
|
||||
RCU_INIT_POINTER(rq->curr, next);
|
||||
|
||||
if (!task_current_donor(rq, next))
|
||||
proxy_tag_curr(rq, next);
|
||||
|
||||
/*
|
||||
* The membarrier system call requires each architecture
|
||||
* to have a full memory barrier after updating
|
||||
@@ -6968,10 +7182,6 @@ keep_resched:
|
||||
/* Also unlocks the rq: */
|
||||
rq = context_switch(rq, prev, next, &rf);
|
||||
} else {
|
||||
/* In case next was already curr but just got blocked_donor */
|
||||
if (!task_current_donor(rq, next))
|
||||
proxy_tag_curr(rq, next);
|
||||
|
||||
rq_unpin_lock(rq, &rf);
|
||||
__balance_callbacks(rq, NULL);
|
||||
hrtick_schedule_exit(rq);
|
||||
|
||||
@@ -2142,10 +2142,14 @@ update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
|
||||
int flags)
|
||||
{
|
||||
struct task_struct *p = dl_task_of(dl_se);
|
||||
struct rq *rq = rq_of_dl_rq(dl_rq);
|
||||
|
||||
if (!schedstat_enabled())
|
||||
return;
|
||||
|
||||
if (p != rq->curr)
|
||||
update_stats_wait_end_dl(dl_rq, dl_se);
|
||||
|
||||
if ((flags & DEQUEUE_SLEEP)) {
|
||||
unsigned int state;
|
||||
|
||||
@@ -2801,12 +2805,26 @@ static int find_later_rq(struct task_struct *task)
|
||||
|
||||
static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
|
||||
{
|
||||
struct task_struct *p;
|
||||
struct task_struct *i, *p = NULL;
|
||||
struct rb_node *next_node;
|
||||
|
||||
if (!has_pushable_dl_tasks(rq))
|
||||
return NULL;
|
||||
|
||||
p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
|
||||
next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root);
|
||||
while (next_node) {
|
||||
i = __node_2_pdl(next_node);
|
||||
/* make sure task isn't on_cpu (possible with proxy-exec) */
|
||||
if (!task_on_cpu(rq, i)) {
|
||||
p = i;
|
||||
break;
|
||||
}
|
||||
|
||||
next_node = rb_next(next_node);
|
||||
}
|
||||
|
||||
if (!p)
|
||||
return NULL;
|
||||
|
||||
WARN_ON_ONCE(rq->cpu != task_cpu(p));
|
||||
WARN_ON_ONCE(task_current(rq, p));
|
||||
@@ -3613,13 +3631,26 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
|
||||
dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
|
||||
}
|
||||
|
||||
void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
|
||||
void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags)
|
||||
{
|
||||
struct sched_dl_entity *dl_se = &p->dl;
|
||||
struct rq *rq = task_rq(p);
|
||||
u64 adj_deadline;
|
||||
|
||||
attr->sched_priority = p->rt_priority;
|
||||
attr->sched_runtime = dl_se->dl_runtime;
|
||||
attr->sched_deadline = dl_se->dl_deadline;
|
||||
if (flags & SCHED_GETATTR_FLAG_DL_DYNAMIC) {
|
||||
guard(raw_spinlock_irq)(&rq->__lock);
|
||||
update_rq_clock(rq);
|
||||
if (task_current(rq, p))
|
||||
update_curr_dl(rq);
|
||||
|
||||
attr->sched_runtime = dl_se->runtime;
|
||||
adj_deadline = dl_se->deadline - rq_clock(rq) + ktime_get_ns();
|
||||
attr->sched_deadline = adj_deadline;
|
||||
} else {
|
||||
attr->sched_runtime = dl_se->dl_runtime;
|
||||
attr->sched_deadline = dl_se->dl_deadline;
|
||||
}
|
||||
attr->sched_period = dl_se->dl_period;
|
||||
attr->sched_flags &= ~SCHED_DL_FLAGS;
|
||||
attr->sched_flags |= dl_se->flags;
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
*/
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/nmi.h>
|
||||
#include <linux/log2.h>
|
||||
#include "sched.h"
|
||||
|
||||
/*
|
||||
@@ -901,11 +902,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
|
||||
|
||||
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
|
||||
{
|
||||
s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread;
|
||||
s64 left_vruntime = -1, right_vruntime = -1, left_deadline = -1, spread;
|
||||
s64 zero_vruntime = -1, sum_w_vruntime = -1;
|
||||
u64 avruntime;
|
||||
struct sched_entity *last, *first, *root;
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
unsigned int sum_shift;
|
||||
unsigned long flags;
|
||||
u64 sum_weight;
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
SEQ_printf(m, "\n");
|
||||
@@ -926,6 +930,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
|
||||
if (last)
|
||||
right_vruntime = last->vruntime;
|
||||
zero_vruntime = cfs_rq->zero_vruntime;
|
||||
sum_w_vruntime = cfs_rq->sum_w_vruntime;
|
||||
sum_weight = cfs_rq->sum_weight;
|
||||
sum_shift = cfs_rq->sum_shift;
|
||||
avruntime = avg_vruntime(cfs_rq);
|
||||
raw_spin_rq_unlock_irqrestore(rq, flags);
|
||||
|
||||
@@ -935,6 +942,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
|
||||
SPLIT_NS(left_vruntime));
|
||||
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime",
|
||||
SPLIT_NS(zero_vruntime));
|
||||
SEQ_printf(m, " .%-30s: %Ld (%d bits)\n", "sum_w_vruntime",
|
||||
sum_w_vruntime, ilog2(abs(sum_w_vruntime)));
|
||||
SEQ_printf(m, " .%-30s: %Lu\n", "sum_weight",
|
||||
sum_weight);
|
||||
SEQ_printf(m, " .%-30s: %u\n", "sum_shift", sum_shift);
|
||||
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
|
||||
SPLIT_NS(avruntime));
|
||||
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
|
||||
|
||||
@@ -2837,7 +2837,7 @@ static void scx_watchdog_workfn(struct work_struct *work)
|
||||
|
||||
cond_resched();
|
||||
}
|
||||
queue_delayed_work(system_unbound_wq, to_delayed_work(work),
|
||||
queue_delayed_work(system_dfl_wq, to_delayed_work(work),
|
||||
READ_ONCE(scx_watchdog_timeout) / 2);
|
||||
}
|
||||
|
||||
@@ -5164,7 +5164,7 @@ static void scx_enable_workfn(struct kthread_work *work)
|
||||
|
||||
WRITE_ONCE(scx_watchdog_timeout, timeout);
|
||||
WRITE_ONCE(scx_watchdog_timestamp, jiffies);
|
||||
queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
|
||||
queue_delayed_work(system_dfl_wq, &scx_watchdog_work,
|
||||
READ_ONCE(scx_watchdog_timeout) / 2);
|
||||
|
||||
/*
|
||||
|
||||
@@ -225,6 +225,7 @@ void __init sched_init_granularity(void)
|
||||
update_sysctl();
|
||||
}
|
||||
|
||||
#ifndef CONFIG_64BIT
|
||||
#define WMULT_CONST (~0U)
|
||||
#define WMULT_SHIFT 32
|
||||
|
||||
@@ -283,6 +284,12 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
|
||||
|
||||
return mul_u64_u32_shr(delta_exec, fact, shift);
|
||||
}
|
||||
#else
|
||||
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
|
||||
{
|
||||
return (delta_exec * weight) / lw->weight;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* delta /= w
|
||||
@@ -665,25 +672,83 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
* Since zero_vruntime closely tracks the per-task service, these
|
||||
* deltas: (v_i - v0), will be in the order of the maximal (virtual) lag
|
||||
* induced in the system due to quantisation.
|
||||
*
|
||||
* Also, we use scale_load_down() to reduce the size.
|
||||
*
|
||||
* As measured, the max (key * weight) value was ~44 bits for a kernel build.
|
||||
*/
|
||||
static inline unsigned long avg_vruntime_weight(struct cfs_rq *cfs_rq, unsigned long w)
|
||||
{
|
||||
#ifdef CONFIG_64BIT
|
||||
if (cfs_rq->sum_shift)
|
||||
w = max(2UL, w >> cfs_rq->sum_shift);
|
||||
#endif
|
||||
return w;
|
||||
}
|
||||
|
||||
static inline void
|
||||
__sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
{
|
||||
unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
|
||||
s64 w_vruntime, key = entity_key(cfs_rq, se);
|
||||
|
||||
w_vruntime = key * weight;
|
||||
WARN_ON_ONCE((w_vruntime >> 63) != (w_vruntime >> 62));
|
||||
|
||||
cfs_rq->sum_w_vruntime += w_vruntime;
|
||||
cfs_rq->sum_weight += weight;
|
||||
}
|
||||
|
||||
static void
|
||||
sum_w_vruntime_add_paranoid(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
{
|
||||
unsigned long weight;
|
||||
s64 key, tmp;
|
||||
|
||||
again:
|
||||
weight = avg_vruntime_weight(cfs_rq, se->load.weight);
|
||||
key = entity_key(cfs_rq, se);
|
||||
|
||||
if (check_mul_overflow(key, weight, &key))
|
||||
goto overflow;
|
||||
|
||||
if (check_add_overflow(cfs_rq->sum_w_vruntime, key, &tmp))
|
||||
goto overflow;
|
||||
|
||||
cfs_rq->sum_w_vruntime = tmp;
|
||||
cfs_rq->sum_weight += weight;
|
||||
return;
|
||||
|
||||
overflow:
|
||||
/*
|
||||
* There's gotta be a limit -- if we're still failing at this point
|
||||
* there's really nothing much to be done about things.
|
||||
*/
|
||||
BUG_ON(cfs_rq->sum_shift >= 10);
|
||||
cfs_rq->sum_shift++;
|
||||
|
||||
/*
|
||||
* Note: \Sum (k_i * (w_i >> 1)) != (\Sum (k_i * w_i)) >> 1
|
||||
*/
|
||||
cfs_rq->sum_w_vruntime = 0;
|
||||
cfs_rq->sum_weight = 0;
|
||||
|
||||
for (struct rb_node *node = cfs_rq->tasks_timeline.rb_leftmost;
|
||||
node; node = rb_next(node))
|
||||
__sum_w_vruntime_add(cfs_rq, __node_2_se(node));
|
||||
|
||||
goto again;
|
||||
}
|
||||
|
||||
static void
|
||||
sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
{
|
||||
unsigned long weight = scale_load_down(se->load.weight);
|
||||
s64 key = entity_key(cfs_rq, se);
|
||||
if (sched_feat(PARANOID_AVG))
|
||||
return sum_w_vruntime_add_paranoid(cfs_rq, se);
|
||||
|
||||
cfs_rq->sum_w_vruntime += key * weight;
|
||||
cfs_rq->sum_weight += weight;
|
||||
__sum_w_vruntime_add(cfs_rq, se);
|
||||
}
|
||||
|
||||
static void
|
||||
sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
{
|
||||
unsigned long weight = scale_load_down(se->load.weight);
|
||||
unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
|
||||
s64 key = entity_key(cfs_rq, se);
|
||||
|
||||
cfs_rq->sum_w_vruntime -= key * weight;
|
||||
@@ -725,7 +790,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
|
||||
s64 runtime = cfs_rq->sum_w_vruntime;
|
||||
|
||||
if (curr) {
|
||||
unsigned long w = scale_load_down(curr->load.weight);
|
||||
unsigned long w = avg_vruntime_weight(cfs_rq, curr->load.weight);
|
||||
|
||||
runtime += entity_key(cfs_rq, curr) * w;
|
||||
weight += w;
|
||||
@@ -735,7 +800,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
|
||||
if (runtime < 0)
|
||||
runtime -= (weight - 1);
|
||||
|
||||
delta = div_s64(runtime, weight);
|
||||
delta = div64_long(runtime, weight);
|
||||
} else if (curr) {
|
||||
/*
|
||||
* When there is but one element, it is the average.
|
||||
@@ -764,17 +829,44 @@ static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq);
|
||||
*
|
||||
* -r_max < lag < max(r_max, q)
|
||||
*/
|
||||
static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avruntime)
|
||||
{
|
||||
u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC;
|
||||
s64 vlag, limit;
|
||||
|
||||
WARN_ON_ONCE(!se->on_rq);
|
||||
|
||||
vlag = avg_vruntime(cfs_rq) - se->vruntime;
|
||||
vlag = avruntime - se->vruntime;
|
||||
limit = calc_delta_fair(max_slice, se);
|
||||
|
||||
se->vlag = clamp(vlag, -limit, limit);
|
||||
return clamp(vlag, -limit, limit);
|
||||
}
|
||||
|
||||
/*
|
||||
* Delayed dequeue aims to reduce the negative lag of a dequeued task. While
|
||||
* updating the lag of an entity, check that negative lag didn't increase
|
||||
* during the delayed dequeue period which would be unfair.
|
||||
* Similarly, check that the entity didn't gain positive lag when DELAY_ZERO
|
||||
* is set.
|
||||
*
|
||||
* Return true if the lag has been adjusted.
|
||||
*/
|
||||
static __always_inline
|
||||
bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
{
|
||||
s64 vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq));
|
||||
bool ret;
|
||||
|
||||
WARN_ON_ONCE(!se->on_rq);
|
||||
|
||||
if (se->sched_delayed) {
|
||||
/* previous vlag < 0 otherwise se would not be delayed */
|
||||
vlag = max(vlag, se->vlag);
|
||||
if (sched_feat(DELAY_ZERO))
|
||||
vlag = min(vlag, 0);
|
||||
}
|
||||
ret = (vlag == se->vlag);
|
||||
se->vlag = vlag;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -801,7 +893,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
|
||||
long load = cfs_rq->sum_weight;
|
||||
|
||||
if (curr && curr->on_rq) {
|
||||
unsigned long weight = scale_load_down(curr->load.weight);
|
||||
unsigned long weight = avg_vruntime_weight(cfs_rq, curr->load.weight);
|
||||
|
||||
avg += entity_key(cfs_rq, curr) * weight;
|
||||
load += weight;
|
||||
@@ -1024,7 +1116,7 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
|
||||
/*
|
||||
* Picking the ->next buddy will affect latency but not fairness.
|
||||
*/
|
||||
if (sched_feat(PICK_BUDDY) &&
|
||||
if (sched_feat(PICK_BUDDY) && protect &&
|
||||
cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
|
||||
/* ->next will never be delayed */
|
||||
WARN_ON_ONCE(cfs_rq->next->sched_delayed);
|
||||
@@ -3841,23 +3933,125 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
se_weight(se) * -se->avg.load_sum);
|
||||
}
|
||||
|
||||
static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
|
||||
static void
|
||||
rescale_entity(struct sched_entity *se, unsigned long weight, bool rel_vprot)
|
||||
{
|
||||
unsigned long old_weight = se->load.weight;
|
||||
|
||||
/*
|
||||
* VRUNTIME
|
||||
* --------
|
||||
*
|
||||
* COROLLARY #1: The virtual runtime of the entity needs to be
|
||||
* adjusted if re-weight at !0-lag point.
|
||||
*
|
||||
* Proof: For contradiction assume this is not true, so we can
|
||||
* re-weight without changing vruntime at !0-lag point.
|
||||
*
|
||||
* Weight VRuntime Avg-VRuntime
|
||||
* before w v V
|
||||
* after w' v' V'
|
||||
*
|
||||
* Since lag needs to be preserved through re-weight:
|
||||
*
|
||||
* lag = (V - v)*w = (V'- v')*w', where v = v'
|
||||
* ==> V' = (V - v)*w/w' + v (1)
|
||||
*
|
||||
* Let W be the total weight of the entities before reweight,
|
||||
* since V' is the new weighted average of entities:
|
||||
*
|
||||
* V' = (WV + w'v - wv) / (W + w' - w) (2)
|
||||
*
|
||||
* by using (1) & (2) we obtain:
|
||||
*
|
||||
* (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
|
||||
* ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
|
||||
* ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
|
||||
* ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
|
||||
*
|
||||
* Since we are doing at !0-lag point which means V != v, we
|
||||
* can simplify (3):
|
||||
*
|
||||
* ==> W / (W + w' - w) = w / w'
|
||||
* ==> Ww' = Ww + ww' - ww
|
||||
* ==> W * (w' - w) = w * (w' - w)
|
||||
* ==> W = w (re-weight indicates w' != w)
|
||||
*
|
||||
* So the cfs_rq contains only one entity, hence vruntime of
|
||||
* the entity @v should always equal to the cfs_rq's weighted
|
||||
* average vruntime @V, which means we will always re-weight
|
||||
* at 0-lag point, thus breach assumption. Proof completed.
|
||||
*
|
||||
*
|
||||
* COROLLARY #2: Re-weight does NOT affect weighted average
|
||||
* vruntime of all the entities.
|
||||
*
|
||||
* Proof: According to corollary #1, Eq. (1) should be:
|
||||
*
|
||||
* (V - v)*w = (V' - v')*w'
|
||||
* ==> v' = V' - (V - v)*w/w' (4)
|
||||
*
|
||||
* According to the weighted average formula, we have:
|
||||
*
|
||||
* V' = (WV - wv + w'v') / (W - w + w')
|
||||
* = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
|
||||
* = (WV - wv + w'V' - Vw + wv) / (W - w + w')
|
||||
* = (WV + w'V' - Vw) / (W - w + w')
|
||||
*
|
||||
* ==> V'*(W - w + w') = WV + w'V' - Vw
|
||||
* ==> V' * (W - w) = (W - w) * V (5)
|
||||
*
|
||||
* If the entity is the only one in the cfs_rq, then reweight
|
||||
* always occurs at 0-lag point, so V won't change. Or else
|
||||
* there are other entities, hence W != w, then Eq. (5) turns
|
||||
* into V' = V. So V won't change in either case, proof done.
|
||||
*
|
||||
*
|
||||
* So according to corollary #1 & #2, the effect of re-weight
|
||||
* on vruntime should be:
|
||||
*
|
||||
* v' = V' - (V - v) * w / w' (4)
|
||||
* = V - (V - v) * w / w'
|
||||
* = V - vl * w / w'
|
||||
* = V - vl'
|
||||
*/
|
||||
se->vlag = div64_long(se->vlag * old_weight, weight);
|
||||
|
||||
/*
|
||||
* DEADLINE
|
||||
* --------
|
||||
*
|
||||
* When the weight changes, the virtual time slope changes and
|
||||
* we should adjust the relative virtual deadline accordingly.
|
||||
*
|
||||
* d' = v' + (d - v)*w/w'
|
||||
* = V' - (V - v)*w/w' + (d - v)*w/w'
|
||||
* = V - (V - v)*w/w' + (d - v)*w/w'
|
||||
* = V + (d - V)*w/w'
|
||||
*/
|
||||
if (se->rel_deadline)
|
||||
se->deadline = div64_long(se->deadline * old_weight, weight);
|
||||
|
||||
if (rel_vprot)
|
||||
se->vprot = div64_long(se->vprot * old_weight, weight);
|
||||
}
|
||||
|
||||
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
|
||||
unsigned long weight)
|
||||
{
|
||||
bool curr = cfs_rq->curr == se;
|
||||
bool rel_vprot = false;
|
||||
u64 vprot;
|
||||
u64 avruntime = 0;
|
||||
|
||||
if (se->on_rq) {
|
||||
/* commit outstanding execution time */
|
||||
update_curr(cfs_rq);
|
||||
update_entity_lag(cfs_rq, se);
|
||||
se->deadline -= se->vruntime;
|
||||
avruntime = avg_vruntime(cfs_rq);
|
||||
se->vlag = entity_lag(cfs_rq, se, avruntime);
|
||||
se->deadline -= avruntime;
|
||||
se->rel_deadline = 1;
|
||||
if (curr && protect_slice(se)) {
|
||||
vprot = se->vprot - se->vruntime;
|
||||
se->vprot -= avruntime;
|
||||
rel_vprot = true;
|
||||
}
|
||||
|
||||
@@ -3868,30 +4062,23 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
|
||||
}
|
||||
dequeue_load_avg(cfs_rq, se);
|
||||
|
||||
/*
|
||||
* Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
|
||||
* we need to scale se->vlag when w_i changes.
|
||||
*/
|
||||
se->vlag = div_s64(se->vlag * se->load.weight, weight);
|
||||
if (se->rel_deadline)
|
||||
se->deadline = div_s64(se->deadline * se->load.weight, weight);
|
||||
|
||||
if (rel_vprot)
|
||||
vprot = div_s64(vprot * se->load.weight, weight);
|
||||
rescale_entity(se, weight, rel_vprot);
|
||||
|
||||
update_load_set(&se->load, weight);
|
||||
|
||||
do {
|
||||
u32 divider = get_pelt_divider(&se->avg);
|
||||
|
||||
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
|
||||
} while (0);
|
||||
|
||||
enqueue_load_avg(cfs_rq, se);
|
||||
if (se->on_rq) {
|
||||
place_entity(cfs_rq, se, 0);
|
||||
if (rel_vprot)
|
||||
se->vprot = se->vruntime + vprot;
|
||||
se->vprot += avruntime;
|
||||
se->deadline += avruntime;
|
||||
se->rel_deadline = 0;
|
||||
se->vruntime = avruntime - se->vlag;
|
||||
|
||||
update_load_add(&cfs_rq->load, se->load.weight);
|
||||
if (!curr)
|
||||
__enqueue_entity(cfs_rq, se);
|
||||
@@ -5165,6 +5352,7 @@ static void
|
||||
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
{
|
||||
u64 vslice, vruntime = avg_vruntime(cfs_rq);
|
||||
bool update_zero = false;
|
||||
s64 lag = 0;
|
||||
|
||||
if (!se->custom_slice)
|
||||
@@ -5181,7 +5369,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
*/
|
||||
if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
|
||||
struct sched_entity *curr = cfs_rq->curr;
|
||||
unsigned long load;
|
||||
long load, weight;
|
||||
|
||||
lag = se->vlag;
|
||||
|
||||
@@ -5239,17 +5427,44 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
*/
|
||||
load = cfs_rq->sum_weight;
|
||||
if (curr && curr->on_rq)
|
||||
load += scale_load_down(curr->load.weight);
|
||||
load += avg_vruntime_weight(cfs_rq, curr->load.weight);
|
||||
|
||||
lag *= load + scale_load_down(se->load.weight);
|
||||
weight = avg_vruntime_weight(cfs_rq, se->load.weight);
|
||||
lag *= load + weight;
|
||||
if (WARN_ON_ONCE(!load))
|
||||
load = 1;
|
||||
lag = div_s64(lag, load);
|
||||
lag = div64_long(lag, load);
|
||||
|
||||
/*
|
||||
* A heavy entity (relative to the tree) will pull the
|
||||
* avg_vruntime close to its vruntime position on enqueue. But
|
||||
* the zero_vruntime point is only updated at the next
|
||||
* update_deadline()/place_entity()/update_entity_lag().
|
||||
*
|
||||
* Specifically (see the comment near avg_vruntime_weight()):
|
||||
*
|
||||
* sum_w_vruntime = \Sum (v_i - v0) * w_i
|
||||
*
|
||||
* Note that if v0 is near a light entity, both terms will be
|
||||
* small for the light entity, while in that case both terms
|
||||
* are large for the heavy entity, leading to risk of
|
||||
* overflow.
|
||||
*
|
||||
* OTOH if v0 is near the heavy entity, then the difference is
|
||||
* larger for the light entity, but the factor is small, while
|
||||
* for the heavy entity the difference is small but the factor
|
||||
* is large. Avoiding the multiplication overflow.
|
||||
*/
|
||||
if (weight > load)
|
||||
update_zero = true;
|
||||
}
|
||||
|
||||
se->vruntime = vruntime - lag;
|
||||
|
||||
if (se->rel_deadline) {
|
||||
if (update_zero)
|
||||
update_zero_vruntime(cfs_rq, -lag);
|
||||
|
||||
if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
|
||||
se->deadline += se->vruntime;
|
||||
se->rel_deadline = 0;
|
||||
return;
|
||||
@@ -5399,13 +5614,6 @@ static void clear_delayed(struct sched_entity *se)
|
||||
}
|
||||
}
|
||||
|
||||
static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
|
||||
{
|
||||
clear_delayed(se);
|
||||
if (sched_feat(DELAY_ZERO) && se->vlag > 0)
|
||||
se->vlag = 0;
|
||||
}
|
||||
|
||||
static bool
|
||||
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
{
|
||||
@@ -5431,6 +5639,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
if (sched_feat(DELAY_DEQUEUE) && delay &&
|
||||
!entity_eligible(cfs_rq, se)) {
|
||||
update_load_avg(cfs_rq, se, 0);
|
||||
update_entity_lag(cfs_rq, se);
|
||||
set_delayed(se);
|
||||
return false;
|
||||
}
|
||||
@@ -5470,7 +5679,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
update_cfs_group(se);
|
||||
|
||||
if (flags & DEQUEUE_DELAYED)
|
||||
finish_delayed_dequeue_entity(se);
|
||||
clear_delayed(se);
|
||||
|
||||
if (cfs_rq->nr_queued == 0) {
|
||||
update_idle_cfs_rq_clock_pelt(cfs_rq);
|
||||
@@ -6866,16 +7075,15 @@ static inline void hrtick_update(struct rq *rq)
|
||||
|
||||
static inline bool cpu_overutilized(int cpu)
|
||||
{
|
||||
unsigned long rq_util_min, rq_util_max;
|
||||
unsigned long rq_util_max;
|
||||
|
||||
if (!sched_energy_enabled())
|
||||
return false;
|
||||
|
||||
rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
|
||||
rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
|
||||
|
||||
/* Return true only if the utilization doesn't fit CPU's capacity */
|
||||
return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
|
||||
return !util_fits_cpu(cpu_util_cfs(cpu), 0, rq_util_max, cpu);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -6913,9 +7121,15 @@ static int sched_idle_rq(struct rq *rq)
|
||||
rq->nr_running);
|
||||
}
|
||||
|
||||
static int sched_idle_cpu(int cpu)
|
||||
static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
return sched_idle_rq(cpu_rq(cpu));
|
||||
return sched_idle_rq(rq) && !task_has_idle_policy(p);
|
||||
}
|
||||
|
||||
static int choose_idle_cpu(int cpu, struct task_struct *p)
|
||||
{
|
||||
return available_idle_cpu(cpu) ||
|
||||
choose_sched_idle_rq(cpu_rq(cpu), p);
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -6931,18 +7145,14 @@ requeue_delayed_entity(struct sched_entity *se)
|
||||
WARN_ON_ONCE(!se->sched_delayed);
|
||||
WARN_ON_ONCE(!se->on_rq);
|
||||
|
||||
if (sched_feat(DELAY_ZERO)) {
|
||||
update_entity_lag(cfs_rq, se);
|
||||
if (se->vlag > 0) {
|
||||
cfs_rq->nr_queued--;
|
||||
if (se != cfs_rq->curr)
|
||||
__dequeue_entity(cfs_rq, se);
|
||||
se->vlag = 0;
|
||||
place_entity(cfs_rq, se, 0);
|
||||
if (se != cfs_rq->curr)
|
||||
__enqueue_entity(cfs_rq, se);
|
||||
cfs_rq->nr_queued++;
|
||||
}
|
||||
if (update_entity_lag(cfs_rq, se)) {
|
||||
cfs_rq->nr_queued--;
|
||||
if (se != cfs_rq->curr)
|
||||
__dequeue_entity(cfs_rq, se);
|
||||
place_entity(cfs_rq, se, 0);
|
||||
if (se != cfs_rq->curr)
|
||||
__enqueue_entity(cfs_rq, se);
|
||||
cfs_rq->nr_queued++;
|
||||
}
|
||||
|
||||
update_load_avg(cfs_rq, se, 0);
|
||||
@@ -7475,7 +7685,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
|
||||
if (!sched_core_cookie_match(rq, p))
|
||||
continue;
|
||||
|
||||
if (sched_idle_cpu(i))
|
||||
if (choose_sched_idle_rq(rq, p))
|
||||
return i;
|
||||
|
||||
if (available_idle_cpu(i)) {
|
||||
@@ -7566,8 +7776,7 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
|
||||
|
||||
static inline int __select_idle_cpu(int cpu, struct task_struct *p)
|
||||
{
|
||||
if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
|
||||
sched_cpu_cookie_match(cpu_rq(cpu), p))
|
||||
if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p))
|
||||
return cpu;
|
||||
|
||||
return -1;
|
||||
@@ -7640,7 +7849,8 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
|
||||
if (!available_idle_cpu(cpu)) {
|
||||
idle = false;
|
||||
if (*idle_cpu == -1) {
|
||||
if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
|
||||
if (choose_sched_idle_rq(cpu_rq(cpu), p) &&
|
||||
cpumask_test_cpu(cpu, cpus)) {
|
||||
*idle_cpu = cpu;
|
||||
break;
|
||||
}
|
||||
@@ -7675,7 +7885,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
|
||||
*/
|
||||
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
|
||||
continue;
|
||||
if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
|
||||
if (choose_idle_cpu(cpu, p))
|
||||
return cpu;
|
||||
}
|
||||
|
||||
@@ -7714,21 +7924,26 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
|
||||
{
|
||||
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
|
||||
int i, cpu, idle_cpu = -1, nr = INT_MAX;
|
||||
struct sched_domain_shared *sd_share;
|
||||
|
||||
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
|
||||
|
||||
if (sched_feat(SIS_UTIL)) {
|
||||
sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, target));
|
||||
if (sd_share) {
|
||||
/* because !--nr is the condition to stop scan */
|
||||
nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
|
||||
/* overloaded LLC is unlikely to have idle cpu/core */
|
||||
if (nr == 1)
|
||||
return -1;
|
||||
}
|
||||
/*
|
||||
* Increment because !--nr is the condition to stop scan.
|
||||
*
|
||||
* Since "sd" is "sd_llc" for target CPU dereferenced in the
|
||||
* caller, it is safe to directly dereference "sd->shared".
|
||||
* Topology bits always ensure it is assigned for "sd_llc" and it
|
||||
* cannot disappear as long as we have a RCU protected
|
||||
* reference to one of the associated "sd" here.
|
||||
*/
|
||||
nr = READ_ONCE(sd->shared->nr_idle_scan) + 1;
|
||||
/* overloaded LLC is unlikely to have idle cpu/core */
|
||||
if (nr == 1)
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (!cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr))
|
||||
return -1;
|
||||
|
||||
if (static_branch_unlikely(&sched_cluster_active)) {
|
||||
struct sched_group *sg = sd->groups;
|
||||
|
||||
@@ -7797,7 +8012,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
|
||||
for_each_cpu_wrap(cpu, cpus, target) {
|
||||
unsigned long cpu_cap = capacity_of(cpu);
|
||||
|
||||
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
|
||||
if (!choose_idle_cpu(cpu, p))
|
||||
continue;
|
||||
|
||||
fits = util_fits_cpu(task_util, util_min, util_max, cpu);
|
||||
@@ -7868,7 +8083,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
|
||||
*/
|
||||
lockdep_assert_irqs_disabled();
|
||||
|
||||
if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
|
||||
if (choose_idle_cpu(target, p) &&
|
||||
asym_fits_cpu(task_util, util_min, util_max, target))
|
||||
return target;
|
||||
|
||||
@@ -7876,7 +8091,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
|
||||
* If the previous CPU is cache affine and idle, don't be stupid:
|
||||
*/
|
||||
if (prev != target && cpus_share_cache(prev, target) &&
|
||||
(available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
|
||||
choose_idle_cpu(prev, p) &&
|
||||
asym_fits_cpu(task_util, util_min, util_max, prev)) {
|
||||
|
||||
if (!static_branch_unlikely(&sched_cluster_active) ||
|
||||
@@ -7908,7 +8123,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
|
||||
if (recent_used_cpu != prev &&
|
||||
recent_used_cpu != target &&
|
||||
cpus_share_cache(recent_used_cpu, target) &&
|
||||
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
|
||||
choose_idle_cpu(recent_used_cpu, p) &&
|
||||
cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
|
||||
asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
|
||||
|
||||
@@ -8408,10 +8623,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
|
||||
struct perf_domain *pd;
|
||||
struct energy_env eenv;
|
||||
|
||||
rcu_read_lock();
|
||||
pd = rcu_dereference_all(rd->pd);
|
||||
if (!pd)
|
||||
goto unlock;
|
||||
return target;
|
||||
|
||||
/*
|
||||
* Energy-aware wake-up happens on the lowest sched_domain starting
|
||||
@@ -8421,13 +8635,13 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
|
||||
while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
|
||||
sd = sd->parent;
|
||||
if (!sd)
|
||||
goto unlock;
|
||||
return target;
|
||||
|
||||
target = prev_cpu;
|
||||
|
||||
sync_entity_load_avg(&p->se);
|
||||
if (!task_util_est(p) && p_util_min == 0)
|
||||
goto unlock;
|
||||
return target;
|
||||
|
||||
eenv_task_busy_time(&eenv, p, prev_cpu);
|
||||
|
||||
@@ -8522,7 +8736,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
|
||||
prev_cpu);
|
||||
/* CPU utilization has changed */
|
||||
if (prev_delta < base_energy)
|
||||
goto unlock;
|
||||
return target;
|
||||
prev_delta -= base_energy;
|
||||
prev_actual_cap = cpu_actual_cap;
|
||||
best_delta = min(best_delta, prev_delta);
|
||||
@@ -8546,7 +8760,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
|
||||
max_spare_cap_cpu);
|
||||
/* CPU utilization has changed */
|
||||
if (cur_delta < base_energy)
|
||||
goto unlock;
|
||||
return target;
|
||||
cur_delta -= base_energy;
|
||||
|
||||
/*
|
||||
@@ -8563,7 +8777,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
|
||||
best_actual_cap = cpu_actual_cap;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
if ((best_fits > prev_fits) ||
|
||||
((best_fits > 0) && (best_delta < prev_delta)) ||
|
||||
@@ -8571,11 +8784,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
|
||||
target = best_energy_cpu;
|
||||
|
||||
return target;
|
||||
|
||||
unlock:
|
||||
rcu_read_unlock();
|
||||
|
||||
return target;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -8620,7 +8828,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
|
||||
want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
|
||||
}
|
||||
|
||||
rcu_read_lock();
|
||||
for_each_domain(cpu, tmp) {
|
||||
/*
|
||||
* If both 'cpu' and 'prev_cpu' are part of this domain,
|
||||
@@ -8646,14 +8853,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
|
||||
break;
|
||||
}
|
||||
|
||||
if (unlikely(sd)) {
|
||||
/* Slow path */
|
||||
new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
|
||||
} else if (wake_flags & WF_TTWU) { /* XXX always ? */
|
||||
/* Fast path */
|
||||
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
/* Slow path */
|
||||
if (unlikely(sd))
|
||||
return sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
|
||||
|
||||
/* Fast path */
|
||||
if (wake_flags & WF_TTWU)
|
||||
return select_idle_sibling(p, prev_cpu, new_cpu);
|
||||
|
||||
return new_cpu;
|
||||
}
|
||||
@@ -8944,8 +9150,10 @@ pick:
|
||||
return;
|
||||
|
||||
preempt:
|
||||
if (preempt_action == PREEMPT_WAKEUP_SHORT)
|
||||
if (preempt_action == PREEMPT_WAKEUP_SHORT) {
|
||||
cancel_protect_slice(se);
|
||||
clear_buddies(cfs_rq, se);
|
||||
}
|
||||
|
||||
resched_curr_lazy(rq);
|
||||
}
|
||||
@@ -9792,32 +10000,6 @@ next:
|
||||
return detached;
|
||||
}
|
||||
|
||||
/*
|
||||
* attach_task() -- attach the task detached by detach_task() to its new rq.
|
||||
*/
|
||||
static void attach_task(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
lockdep_assert_rq_held(rq);
|
||||
|
||||
WARN_ON_ONCE(task_rq(p) != rq);
|
||||
activate_task(rq, p, ENQUEUE_NOCLOCK);
|
||||
wakeup_preempt(rq, p, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* attach_one_task() -- attaches the task returned from detach_one_task() to
|
||||
* its new rq.
|
||||
*/
|
||||
static void attach_one_task(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
struct rq_flags rf;
|
||||
|
||||
rq_lock(rq, &rf);
|
||||
update_rq_clock(rq);
|
||||
attach_task(rq, p);
|
||||
rq_unlock(rq, &rf);
|
||||
}
|
||||
|
||||
/*
|
||||
* attach_tasks() -- attaches all tasks detached by detach_tasks() to their
|
||||
* new rq.
|
||||
@@ -10055,6 +10237,7 @@ struct sg_lb_stats {
|
||||
unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
|
||||
unsigned int group_smt_balance; /* Task on busy SMT be moved */
|
||||
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
|
||||
unsigned int group_overutilized; /* At least one CPU is overutilized in the group */
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
unsigned int nr_numa_running;
|
||||
unsigned int nr_preferred_running;
|
||||
@@ -10287,6 +10470,13 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
|
||||
static inline bool
|
||||
group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
|
||||
{
|
||||
/*
|
||||
* With EAS and uclamp, 1 CPU in the group must be overutilized to
|
||||
* consider the group overloaded.
|
||||
*/
|
||||
if (sched_energy_enabled() && !sgs->group_overutilized)
|
||||
return false;
|
||||
|
||||
if (sgs->sum_nr_running <= sgs->group_weight)
|
||||
return false;
|
||||
|
||||
@@ -10470,14 +10660,12 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
|
||||
* @group: sched_group whose statistics are to be updated.
|
||||
* @sgs: variable to hold the statistics for this group.
|
||||
* @sg_overloaded: sched_group is overloaded
|
||||
* @sg_overutilized: sched_group is overutilized
|
||||
*/
|
||||
static inline void update_sg_lb_stats(struct lb_env *env,
|
||||
struct sd_lb_stats *sds,
|
||||
struct sched_group *group,
|
||||
struct sg_lb_stats *sgs,
|
||||
bool *sg_overloaded,
|
||||
bool *sg_overutilized)
|
||||
bool *sg_overloaded)
|
||||
{
|
||||
int i, nr_running, local_group, sd_flags = env->sd->flags;
|
||||
bool balancing_at_rd = !env->sd->parent;
|
||||
@@ -10499,7 +10687,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
|
||||
sgs->sum_nr_running += nr_running;
|
||||
|
||||
if (cpu_overutilized(i))
|
||||
*sg_overutilized = 1;
|
||||
sgs->group_overutilized = 1;
|
||||
|
||||
/*
|
||||
* No need to call idle_cpu() if nr_running is not 0
|
||||
@@ -11075,6 +11263,7 @@ static void update_idle_cpu_scan(struct lb_env *env,
|
||||
unsigned long sum_util)
|
||||
{
|
||||
struct sched_domain_shared *sd_share;
|
||||
struct sched_domain *sd = env->sd;
|
||||
int llc_weight, pct;
|
||||
u64 x, y, tmp;
|
||||
/*
|
||||
@@ -11088,11 +11277,7 @@ static void update_idle_cpu_scan(struct lb_env *env,
|
||||
if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
|
||||
return;
|
||||
|
||||
llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
|
||||
if (env->sd->span_weight != llc_weight)
|
||||
return;
|
||||
|
||||
sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, env->dst_cpu));
|
||||
sd_share = sd->shared;
|
||||
if (!sd_share)
|
||||
return;
|
||||
|
||||
@@ -11126,10 +11311,11 @@ static void update_idle_cpu_scan(struct lb_env *env,
|
||||
*/
|
||||
/* equation [3] */
|
||||
x = sum_util;
|
||||
llc_weight = sd->span_weight;
|
||||
do_div(x, llc_weight);
|
||||
|
||||
/* equation [4] */
|
||||
pct = env->sd->imbalance_pct;
|
||||
pct = sd->imbalance_pct;
|
||||
tmp = x * x * pct * pct;
|
||||
do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
|
||||
tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
|
||||
@@ -11170,13 +11356,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
|
||||
update_group_capacity(env->sd, env->dst_cpu);
|
||||
}
|
||||
|
||||
update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
|
||||
update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded);
|
||||
|
||||
if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
|
||||
sds->busiest = sg;
|
||||
sds->busiest_stat = *sgs;
|
||||
}
|
||||
|
||||
sg_overutilized |= sgs->group_overutilized;
|
||||
|
||||
/* Now, start updating sd_lb_stats */
|
||||
sds->total_load += sgs->group_load;
|
||||
sds->total_capacity += sgs->group_capacity;
|
||||
@@ -12297,7 +12485,30 @@ static inline void update_newidle_stats(struct sched_domain *sd, unsigned int su
|
||||
sd->newidle_success += success;
|
||||
|
||||
if (sd->newidle_call >= 1024) {
|
||||
sd->newidle_ratio = sd->newidle_success;
|
||||
u64 now = sched_clock();
|
||||
s64 delta = now - sd->newidle_stamp;
|
||||
sd->newidle_stamp = now;
|
||||
int ratio = 0;
|
||||
|
||||
if (delta < 0)
|
||||
delta = 0;
|
||||
|
||||
if (sched_feat(NI_RATE)) {
|
||||
/*
|
||||
* ratio delta freq
|
||||
*
|
||||
* 1024 - 4 s - 128 Hz
|
||||
* 512 - 2 s - 256 Hz
|
||||
* 256 - 1 s - 512 Hz
|
||||
* 128 - .5 s - 1024 Hz
|
||||
* 64 - .25 s - 2048 Hz
|
||||
*/
|
||||
ratio = delta >> 22;
|
||||
}
|
||||
|
||||
ratio += sd->newidle_success;
|
||||
|
||||
sd->newidle_ratio = min(1024, ratio);
|
||||
sd->newidle_call /= 2;
|
||||
sd->newidle_success /= 2;
|
||||
}
|
||||
@@ -12344,7 +12555,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
|
||||
{
|
||||
int continue_balancing = 1;
|
||||
int cpu = rq->cpu;
|
||||
int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
|
||||
int busy = idle != CPU_IDLE && !sched_idle_rq(rq);
|
||||
unsigned long interval;
|
||||
struct sched_domain *sd;
|
||||
/* Earliest time when we have to do rebalance again */
|
||||
@@ -12382,7 +12593,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
|
||||
* state even if we migrated tasks. Update it.
|
||||
*/
|
||||
idle = idle_cpu(cpu);
|
||||
busy = !idle && !sched_idle_cpu(cpu);
|
||||
busy = !idle && !sched_idle_rq(rq);
|
||||
}
|
||||
sd->last_balance = jiffies;
|
||||
interval = get_sd_balance_interval(sd, busy);
|
||||
@@ -12427,14 +12638,14 @@ static inline int on_null_domain(struct rq *rq)
|
||||
*/
|
||||
static inline int find_new_ilb(void)
|
||||
{
|
||||
int this_cpu = smp_processor_id();
|
||||
const struct cpumask *hk_mask;
|
||||
int ilb_cpu;
|
||||
|
||||
hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
|
||||
|
||||
for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
|
||||
|
||||
if (ilb_cpu == smp_processor_id())
|
||||
if (ilb_cpu == this_cpu)
|
||||
continue;
|
||||
|
||||
if (idle_cpu(ilb_cpu))
|
||||
@@ -13004,7 +13215,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
|
||||
if (sd->flags & SD_BALANCE_NEWIDLE) {
|
||||
unsigned int weight = 1;
|
||||
|
||||
if (sched_feat(NI_RANDOM)) {
|
||||
if (sched_feat(NI_RANDOM) && sd->newidle_ratio < 1024) {
|
||||
/*
|
||||
* Throw a 1k sided dice; and only run
|
||||
* newidle_balance according to the success
|
||||
@@ -14030,7 +14241,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
|
||||
tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
|
||||
}
|
||||
if (ng) {
|
||||
gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
|
||||
gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
|
||||
gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
|
||||
}
|
||||
print_numa_stats(m, node, tsf, tpf, gsf, gpf);
|
||||
|
||||
@@ -58,6 +58,8 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
|
||||
SCHED_FEAT(DELAY_DEQUEUE, true)
|
||||
SCHED_FEAT(DELAY_ZERO, true)
|
||||
|
||||
SCHED_FEAT(PARANOID_AVG, false)
|
||||
|
||||
/*
|
||||
* Allow wakeup-time preemption of the current task:
|
||||
*/
|
||||
@@ -131,3 +133,4 @@ SCHED_FEAT(LATENCY_WARN, false)
|
||||
* Do newidle balancing proportional to its success rate using randomization.
|
||||
*/
|
||||
SCHED_FEAT(NI_RANDOM, true)
|
||||
SCHED_FEAT(NI_RATE, true)
|
||||
|
||||
@@ -1302,13 +1302,18 @@ update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
|
||||
int flags)
|
||||
{
|
||||
struct task_struct *p = NULL;
|
||||
struct rq *rq = rq_of_rt_rq(rt_rq);
|
||||
|
||||
if (!schedstat_enabled())
|
||||
return;
|
||||
|
||||
if (rt_entity_is_task(rt_se))
|
||||
if (rt_entity_is_task(rt_se)) {
|
||||
p = rt_task_of(rt_se);
|
||||
|
||||
if (p != rq->curr)
|
||||
update_stats_wait_end_rt(rt_rq, rt_se);
|
||||
}
|
||||
|
||||
if ((flags & DEQUEUE_SLEEP) && p) {
|
||||
unsigned int state;
|
||||
|
||||
@@ -1853,13 +1858,22 @@ static int find_lowest_rq(struct task_struct *task)
|
||||
|
||||
static struct task_struct *pick_next_pushable_task(struct rq *rq)
|
||||
{
|
||||
struct task_struct *p;
|
||||
struct plist_head *head = &rq->rt.pushable_tasks;
|
||||
struct task_struct *i, *p = NULL;
|
||||
|
||||
if (!has_pushable_tasks(rq))
|
||||
return NULL;
|
||||
|
||||
p = plist_first_entry(&rq->rt.pushable_tasks,
|
||||
struct task_struct, pushable_tasks);
|
||||
plist_for_each_entry(i, head, pushable_tasks) {
|
||||
/* make sure task isn't on_cpu (possible with proxy-exec) */
|
||||
if (!task_on_cpu(rq, i)) {
|
||||
p = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!p)
|
||||
return NULL;
|
||||
|
||||
BUG_ON(rq->cpu != task_cpu(p));
|
||||
BUG_ON(task_current(rq, p));
|
||||
@@ -2652,7 +2666,7 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
|
||||
{
|
||||
struct rt_schedulable_data *d = data;
|
||||
struct task_group *child;
|
||||
unsigned long total, sum = 0;
|
||||
u64 total, sum = 0;
|
||||
u64 period, runtime;
|
||||
|
||||
period = ktime_to_ns(tg->rt_bandwidth.rt_period);
|
||||
@@ -2676,9 +2690,6 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
|
||||
tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
|
||||
return -EBUSY;
|
||||
|
||||
if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group))
|
||||
return -EBUSY;
|
||||
|
||||
total = to_ratio(period, runtime);
|
||||
|
||||
/*
|
||||
@@ -2818,19 +2829,6 @@ long sched_group_rt_period(struct task_group *tg)
|
||||
return rt_period_us;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SYSCTL
|
||||
static int sched_rt_global_constraints(void)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
mutex_lock(&rt_constraints_mutex);
|
||||
ret = __rt_schedulable(NULL, 0, 0);
|
||||
mutex_unlock(&rt_constraints_mutex);
|
||||
|
||||
return ret;
|
||||
}
|
||||
#endif /* CONFIG_SYSCTL */
|
||||
|
||||
int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
|
||||
{
|
||||
/* Don't accept real-time tasks when there is no way for them to run */
|
||||
@@ -2840,14 +2838,6 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
|
||||
return 1;
|
||||
}
|
||||
|
||||
#else /* !CONFIG_RT_GROUP_SCHED: */
|
||||
|
||||
#ifdef CONFIG_SYSCTL
|
||||
static int sched_rt_global_constraints(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_SYSCTL */
|
||||
#endif /* !CONFIG_RT_GROUP_SCHED */
|
||||
|
||||
#ifdef CONFIG_SYSCTL
|
||||
@@ -2859,11 +2849,14 @@ static int sched_rt_global_validate(void)
|
||||
NSEC_PER_USEC > max_rt_runtime)))
|
||||
return -EINVAL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
if (!rt_group_sched_enabled())
|
||||
return 0;
|
||||
|
||||
static void sched_rt_do_global(void)
|
||||
{
|
||||
scoped_guard(mutex, &rt_constraints_mutex)
|
||||
return __rt_schedulable(NULL, 0, 0);
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
|
||||
@@ -2889,11 +2882,6 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff
|
||||
if (ret)
|
||||
goto undo;
|
||||
|
||||
ret = sched_rt_global_constraints();
|
||||
if (ret)
|
||||
goto undo;
|
||||
|
||||
sched_rt_do_global();
|
||||
sched_dl_do_global();
|
||||
}
|
||||
if (0) {
|
||||
|
||||
@@ -356,7 +356,7 @@ extern int sched_dl_global_validate(void);
|
||||
extern void sched_dl_do_global(void);
|
||||
extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
|
||||
extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
|
||||
extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
|
||||
extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags);
|
||||
extern bool __checkparam_dl(const struct sched_attr *attr);
|
||||
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
|
||||
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
|
||||
@@ -684,8 +684,9 @@ struct cfs_rq {
|
||||
|
||||
s64 sum_w_vruntime;
|
||||
u64 sum_weight;
|
||||
|
||||
u64 zero_vruntime;
|
||||
unsigned int sum_shift;
|
||||
|
||||
#ifdef CONFIG_SCHED_CORE
|
||||
unsigned int forceidle_seq;
|
||||
u64 zero_vruntime_fi;
|
||||
@@ -1611,15 +1612,18 @@ extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
|
||||
extern bool raw_spin_rq_trylock(struct rq *rq)
|
||||
__cond_acquires(true, __rq_lockp(rq));
|
||||
|
||||
extern void raw_spin_rq_unlock(struct rq *rq)
|
||||
__releases(__rq_lockp(rq));
|
||||
|
||||
/*
 * raw_spin_rq_lock() -- acquire @rq's lock.
 *
 * Thin wrapper around raw_spin_rq_lock_nested() using subclass 0.
 */
static inline void raw_spin_rq_lock(struct rq *rq)
|
||||
__acquires(__rq_lockp(rq))
|
||||
{
|
||||
raw_spin_rq_lock_nested(rq, 0);
|
||||
}
|
||||
|
||||
static inline void raw_spin_rq_unlock(struct rq *rq)
|
||||
__releases(__rq_lockp(rq))
|
||||
{
|
||||
raw_spin_unlock(rq_lockp(rq));
|
||||
}
|
||||
|
||||
static inline void raw_spin_rq_lock_irq(struct rq *rq)
|
||||
__acquires(__rq_lockp(rq))
|
||||
{
|
||||
@@ -1858,6 +1862,13 @@ static inline void scx_rq_clock_update(struct rq *rq, u64 clock) {}
|
||||
static inline void scx_rq_clock_invalidate(struct rq *rq) {}
|
||||
#endif /* !CONFIG_SCHED_CLASS_EXT */
|
||||
|
||||
/*
 * assert_balance_callbacks_empty() -- warn once if @rq->balance_callback is
 * set to anything other than &balance_push_callback.
 *
 * The IS_ENABLED(CONFIG_PROVE_LOCKING) term comes first in the condition, so
 * the check can only fire when that option is enabled.
 */
static inline void assert_balance_callbacks_empty(struct rq *rq)
|
||||
{
|
||||
WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_LOCKING) &&
|
||||
rq->balance_callback &&
|
||||
rq->balance_callback != &balance_push_callback);
|
||||
}
|
||||
|
||||
/*
|
||||
* Lockdep annotation that avoids accidental unlocks; it's like a
|
||||
* sticky/continuous lockdep_assert_held().
|
||||
@@ -1874,7 +1885,7 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
|
||||
|
||||
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
|
||||
rf->clock_update_flags = 0;
|
||||
WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
|
||||
assert_balance_callbacks_empty(rq);
|
||||
}
|
||||
|
||||
static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
|
||||
@@ -2854,7 +2865,7 @@ static inline void idle_set_state(struct rq *rq,
|
||||
|
||||
static inline struct cpuidle_state *idle_get_state(struct rq *rq)
|
||||
{
|
||||
WARN_ON_ONCE(!rcu_read_lock_held());
|
||||
lockdep_assert(rcu_read_lock_any_held());
|
||||
|
||||
return rq->idle_state;
|
||||
}
|
||||
@@ -2901,7 +2912,7 @@ extern void init_cfs_throttle_work(struct task_struct *p);
|
||||
#define MAX_BW_BITS (64 - BW_SHIFT)
|
||||
#define MAX_BW ((1ULL << MAX_BW_BITS) - 1)
|
||||
|
||||
extern unsigned long to_ratio(u64 period, u64 runtime);
|
||||
extern u64 to_ratio(u64 period, u64 runtime);
|
||||
|
||||
extern void init_entity_runnable_average(struct sched_entity *se);
|
||||
extern void post_init_entity_util_avg(struct task_struct *p);
|
||||
@@ -3006,6 +3017,29 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
|
||||
|
||||
extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
|
||||
|
||||
/*
|
||||
* attach_task() -- attach the task detached by detach_task() to its new rq.
|
||||
*/
|
||||
static inline void attach_task(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
lockdep_assert_rq_held(rq);
|
||||
|
||||
WARN_ON_ONCE(task_rq(p) != rq);
|
||||
activate_task(rq, p, ENQUEUE_NOCLOCK);
|
||||
wakeup_preempt(rq, p, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* attach_one_task() -- attaches the task returned from detach_one_task() to
|
||||
* its new rq.
|
||||
*/
|
||||
static inline void attach_one_task(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
/* Scope-based guard: takes @rq's lock here; presumably dropped on return. */
guard(rq_lock)(rq);
|
||||
/*
 * attach_task() activates @p with ENQUEUE_NOCLOCK, so the rq clock is
 * updated here first (presumably NOCLOCK skips the update -- confirm).
 */
update_rq_clock(rq);
|
||||
attach_task(rq, p);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PREEMPT_RT
|
||||
# define SCHED_NR_MIGRATE_BREAK 8
|
||||
#else
|
||||
|
||||
@@ -911,10 +911,10 @@ err_size:
|
||||
return -E2BIG;
|
||||
}
|
||||
|
||||
static void get_params(struct task_struct *p, struct sched_attr *attr)
|
||||
static void get_params(struct task_struct *p, struct sched_attr *attr, unsigned int flags)
|
||||
{
|
||||
if (task_has_dl_policy(p)) {
|
||||
__getparam_dl(p, attr);
|
||||
__getparam_dl(p, attr, flags);
|
||||
} else if (task_has_rt_policy(p)) {
|
||||
attr->sched_priority = p->rt_priority;
|
||||
} else {
|
||||
@@ -980,7 +980,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
|
||||
return -ESRCH;
|
||||
|
||||
if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
|
||||
get_params(p, &attr);
|
||||
get_params(p, &attr, 0);
|
||||
|
||||
return sched_setattr(p, &attr);
|
||||
}
|
||||
@@ -1065,7 +1065,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
|
||||
int retval;
|
||||
|
||||
if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE ||
|
||||
usize < SCHED_ATTR_SIZE_VER0 || flags))
|
||||
usize < SCHED_ATTR_SIZE_VER0))
|
||||
return -EINVAL;
|
||||
|
||||
scoped_guard (rcu) {
|
||||
@@ -1073,6 +1073,12 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
|
||||
if (!p)
|
||||
return -ESRCH;
|
||||
|
||||
if (flags) {
|
||||
if (!task_has_dl_policy(p) ||
|
||||
flags != SCHED_GETATTR_FLAG_DL_DYNAMIC)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
retval = security_task_getscheduler(p);
|
||||
if (retval)
|
||||
return retval;
|
||||
@@ -1080,7 +1086,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
|
||||
kattr.sched_policy = p->policy;
|
||||
if (p->sched_reset_on_fork)
|
||||
kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
|
||||
get_params(p, &kattr);
|
||||
get_params(p, &kattr, flags);
|
||||
kattr.sched_flags &= SCHED_FLAG_ALL;
|
||||
|
||||
#ifdef CONFIG_UCLAMP_TASK
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
*/
|
||||
|
||||
#include <linux/sched/isolation.h>
|
||||
#include <linux/sched/clock.h>
|
||||
#include <linux/bsearch.h>
|
||||
#include "sched.h"
|
||||
|
||||
@@ -272,7 +273,7 @@ void rebuild_sched_domains_energy(void)
|
||||
static int sched_energy_aware_handler(const struct ctl_table *table, int write,
|
||||
void *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
int ret, state;
|
||||
int ret;
|
||||
|
||||
if (write && !capable(CAP_SYS_ADMIN))
|
||||
return -EPERM;
|
||||
@@ -288,8 +289,7 @@ static int sched_energy_aware_handler(const struct ctl_table *table, int write,
|
||||
|
||||
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
||||
if (!ret && write) {
|
||||
state = static_branch_unlikely(&sched_energy_present);
|
||||
if (state != sysctl_sched_energy_aware)
|
||||
if (sysctl_sched_energy_aware != sched_energy_enabled())
|
||||
rebuild_sched_domains_energy();
|
||||
}
|
||||
|
||||
@@ -387,11 +387,11 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp)
|
||||
|
||||
static void sched_energy_set(bool has_eas)
|
||||
{
|
||||
if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
|
||||
if (!has_eas && sched_energy_enabled()) {
|
||||
if (sched_debug())
|
||||
pr_info("%s: stopping EAS\n", __func__);
|
||||
static_branch_disable_cpuslocked(&sched_energy_present);
|
||||
} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
|
||||
} else if (has_eas && !sched_energy_enabled()) {
|
||||
if (sched_debug())
|
||||
pr_info("%s: starting EAS\n", __func__);
|
||||
static_branch_enable_cpuslocked(&sched_energy_present);
|
||||
@@ -684,6 +684,9 @@ static void update_top_cache_domain(int cpu)
|
||||
if (sd) {
|
||||
id = cpumask_first(sched_domain_span(sd));
|
||||
size = cpumask_weight(sched_domain_span(sd));
|
||||
|
||||
/* If sd_llc exists, sd_llc_shared should exist too. */
|
||||
WARN_ON_ONCE(!sd->shared);
|
||||
sds = sd->shared;
|
||||
}
|
||||
|
||||
@@ -732,6 +735,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
|
||||
if (sd_parent_degenerate(tmp, parent)) {
|
||||
tmp->parent = parent->parent;
|
||||
|
||||
/* Pick reference to parent->shared. */
|
||||
if (parent->shared) {
|
||||
WARN_ON_ONCE(tmp->shared);
|
||||
tmp->shared = parent->shared;
|
||||
parent->shared = NULL;
|
||||
}
|
||||
|
||||
if (parent->parent) {
|
||||
parent->parent->child = tmp;
|
||||
parent->parent->groups->flags = tmp->flags;
|
||||
@@ -781,6 +791,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
|
||||
}
|
||||
|
||||
/*
 * struct s_data -- allocations tracked while sched domains are being built.
 *
 * Filled in by __visit_domain_allocation_hell() and released, stage by stage,
 * by __free_domain_allocs().
 */
struct s_data {
|
||||
/* per-CPU sched_domain_shared pointers; set up by __sds_alloc(), torn down by __sds_free() */
struct sched_domain_shared * __percpu *sds;
|
||||
/* per-CPU sched_domain pointers; alloc_percpu()'d, free_percpu()'d */
struct sched_domain * __percpu *sd;
|
||||
/* root domain obtained from alloc_rootdomain() */
struct root_domain *rd;
|
||||
};
|
||||
@@ -788,6 +799,7 @@ struct s_data {
|
||||
/*
 * enum s_alloc -- identifies how far domain allocation progressed.
 *
 * __visit_domain_allocation_hell() returns the stage to unwind from when an
 * allocation fails; __free_domain_allocs() switches on that value and falls
 * through the later cases, so the enumerator order must match the order of
 * the case labels there.
 */
enum s_alloc {
|
||||
sa_rootdomain,
|
||||
sa_sd,
|
||||
sa_sd_shared,
|
||||
sa_sd_storage,
|
||||
sa_none,
|
||||
};
|
||||
@@ -1534,6 +1546,9 @@ static void set_domain_attribute(struct sched_domain *sd,
|
||||
static void __sdt_free(const struct cpumask *cpu_map);
|
||||
static int __sdt_alloc(const struct cpumask *cpu_map);
|
||||
|
||||
static void __sds_free(struct s_data *d, const struct cpumask *cpu_map);
|
||||
static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map);
|
||||
|
||||
static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
|
||||
const struct cpumask *cpu_map)
|
||||
{
|
||||
@@ -1545,6 +1560,9 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
|
||||
case sa_sd:
|
||||
free_percpu(d->sd);
|
||||
fallthrough;
|
||||
case sa_sd_shared:
|
||||
__sds_free(d, cpu_map);
|
||||
fallthrough;
|
||||
case sa_sd_storage:
|
||||
__sdt_free(cpu_map);
|
||||
fallthrough;
|
||||
@@ -1560,9 +1578,11 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
|
||||
|
||||
if (__sdt_alloc(cpu_map))
|
||||
return sa_sd_storage;
|
||||
if (__sds_alloc(d, cpu_map))
|
||||
return sa_sd_shared;
|
||||
d->sd = alloc_percpu(struct sched_domain *);
|
||||
if (!d->sd)
|
||||
return sa_sd_storage;
|
||||
return sa_sd_shared;
|
||||
d->rd = alloc_rootdomain();
|
||||
if (!d->rd)
|
||||
return sa_sd;
|
||||
@@ -1575,21 +1595,25 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
|
||||
* sched_group structure so that the subsequent __free_domain_allocs()
|
||||
* will not free the data we're using.
|
||||
*/
|
||||
static void claim_allocations(int cpu, struct sched_domain *sd)
|
||||
static void claim_allocations(int cpu, struct s_data *d)
|
||||
{
|
||||
struct sd_data *sdd = sd->private;
|
||||
struct sched_domain *sd;
|
||||
|
||||
WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
|
||||
*per_cpu_ptr(sdd->sd, cpu) = NULL;
|
||||
if (atomic_read(&(*per_cpu_ptr(d->sds, cpu))->ref))
|
||||
*per_cpu_ptr(d->sds, cpu) = NULL;
|
||||
|
||||
if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
|
||||
*per_cpu_ptr(sdd->sds, cpu) = NULL;
|
||||
for (sd = *per_cpu_ptr(d->sd, cpu); sd; sd = sd->parent) {
|
||||
struct sd_data *sdd = sd->private;
|
||||
|
||||
if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
|
||||
*per_cpu_ptr(sdd->sg, cpu) = NULL;
|
||||
WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
|
||||
*per_cpu_ptr(sdd->sd, cpu) = NULL;
|
||||
|
||||
if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
|
||||
*per_cpu_ptr(sdd->sgc, cpu) = NULL;
|
||||
if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
|
||||
*per_cpu_ptr(sdd->sg, cpu) = NULL;
|
||||
|
||||
if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
|
||||
*per_cpu_ptr(sdd->sgc, cpu) = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
@@ -1642,14 +1666,19 @@ sd_init(struct sched_domain_topology_level *tl,
|
||||
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
|
||||
int sd_id, sd_weight, sd_flags = 0;
|
||||
struct cpumask *sd_span;
|
||||
u64 now = sched_clock();
|
||||
|
||||
sd_weight = cpumask_weight(tl->mask(tl, cpu));
|
||||
sd_span = sched_domain_span(sd);
|
||||
cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
|
||||
sd_weight = cpumask_weight(sd_span);
|
||||
sd_id = cpumask_first(sd_span);
|
||||
|
||||
if (tl->sd_flags)
|
||||
sd_flags = (*tl->sd_flags)();
|
||||
if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
|
||||
"wrong sd_flags in topology description\n"))
|
||||
"wrong sd_flags in topology description\n"))
|
||||
sd_flags &= TOPOLOGY_SD_FLAGS;
|
||||
sd_flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
|
||||
|
||||
*sd = (struct sched_domain){
|
||||
.min_interval = sd_weight,
|
||||
@@ -1679,6 +1708,7 @@ sd_init(struct sched_domain_topology_level *tl,
|
||||
.newidle_call = 512,
|
||||
.newidle_success = 256,
|
||||
.newidle_ratio = 512,
|
||||
.newidle_stamp = now,
|
||||
|
||||
.max_newidle_lb_cost = 0,
|
||||
.last_decay_max_lb_cost = jiffies,
|
||||
@@ -1686,12 +1716,6 @@ sd_init(struct sched_domain_topology_level *tl,
|
||||
.name = tl->name,
|
||||
};
|
||||
|
||||
sd_span = sched_domain_span(sd);
|
||||
cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
|
||||
sd_id = cpumask_first(sd_span);
|
||||
|
||||
sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
|
||||
|
||||
WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) ==
|
||||
(SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY),
|
||||
"CPU capacity asymmetry not supported on SMT\n");
|
||||
@@ -1727,16 +1751,6 @@ sd_init(struct sched_domain_topology_level *tl,
|
||||
sd->cache_nice_tries = 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* For all levels sharing cache; connect a sched_domain_shared
|
||||
* instance.
|
||||
*/
|
||||
if (sd->flags & SD_SHARE_LLC) {
|
||||
sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
|
||||
atomic_inc(&sd->shared->ref);
|
||||
atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
|
||||
}
|
||||
|
||||
sd->private = sdd;
|
||||
|
||||
return sd;
|
||||
@@ -2372,10 +2386,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
|
||||
if (!sdd->sd)
|
||||
return -ENOMEM;
|
||||
|
||||
sdd->sds = alloc_percpu(struct sched_domain_shared *);
|
||||
if (!sdd->sds)
|
||||
return -ENOMEM;
|
||||
|
||||
sdd->sg = alloc_percpu(struct sched_group *);
|
||||
if (!sdd->sg)
|
||||
return -ENOMEM;
|
||||
@@ -2386,7 +2396,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
|
||||
|
||||
for_each_cpu(j, cpu_map) {
|
||||
struct sched_domain *sd;
|
||||
struct sched_domain_shared *sds;
|
||||
struct sched_group *sg;
|
||||
struct sched_group_capacity *sgc;
|
||||
|
||||
@@ -2397,13 +2406,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
|
||||
|
||||
*per_cpu_ptr(sdd->sd, j) = sd;
|
||||
|
||||
sds = kzalloc_node(sizeof(struct sched_domain_shared),
|
||||
GFP_KERNEL, cpu_to_node(j));
|
||||
if (!sds)
|
||||
return -ENOMEM;
|
||||
|
||||
*per_cpu_ptr(sdd->sds, j) = sds;
|
||||
|
||||
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
|
||||
GFP_KERNEL, cpu_to_node(j));
|
||||
if (!sg)
|
||||
@@ -2445,8 +2447,6 @@ static void __sdt_free(const struct cpumask *cpu_map)
|
||||
kfree(*per_cpu_ptr(sdd->sd, j));
|
||||
}
|
||||
|
||||
if (sdd->sds)
|
||||
kfree(*per_cpu_ptr(sdd->sds, j));
|
||||
if (sdd->sg)
|
||||
kfree(*per_cpu_ptr(sdd->sg, j));
|
||||
if (sdd->sgc)
|
||||
@@ -2454,8 +2454,6 @@ static void __sdt_free(const struct cpumask *cpu_map)
|
||||
}
|
||||
free_percpu(sdd->sd);
|
||||
sdd->sd = NULL;
|
||||
free_percpu(sdd->sds);
|
||||
sdd->sds = NULL;
|
||||
free_percpu(sdd->sg);
|
||||
sdd->sg = NULL;
|
||||
free_percpu(sdd->sgc);
|
||||
@@ -2463,6 +2461,42 @@ static void __sdt_free(const struct cpumask *cpu_map)
|
||||
}
|
||||
}
|
||||
|
||||
static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map)
|
||||
{
|
||||
int j;
|
||||
|
||||
d->sds = alloc_percpu(struct sched_domain_shared *);
|
||||
if (!d->sds)
|
||||
return -ENOMEM;
|
||||
|
||||
for_each_cpu(j, cpu_map) {
|
||||
struct sched_domain_shared *sds;
|
||||
|
||||
sds = kzalloc_node(sizeof(struct sched_domain_shared),
|
||||
GFP_KERNEL, cpu_to_node(j));
|
||||
if (!sds)
|
||||
return -ENOMEM;
|
||||
|
||||
*per_cpu_ptr(d->sds, j) = sds;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void __sds_free(struct s_data *d, const struct cpumask *cpu_map)
|
||||
{
|
||||
int j;
|
||||
|
||||
if (!d->sds)
|
||||
return;
|
||||
|
||||
for_each_cpu(j, cpu_map)
|
||||
kfree(*per_cpu_ptr(d->sds, j));
|
||||
|
||||
free_percpu(d->sds);
|
||||
d->sds = NULL;
|
||||
}
|
||||
|
||||
static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
|
||||
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
|
||||
struct sched_domain *child, int cpu)
|
||||
@@ -2548,6 +2582,74 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculate an allowed NUMA imbalance such that LLCs do not get
|
||||
* imbalanced.
|
||||
*/
|
||||
static void adjust_numa_imbalance(struct sched_domain *sd_llc)
|
||||
{
|
||||
struct sched_domain *parent;
|
||||
unsigned int imb_span = 1;
|
||||
unsigned int imb = 0;
|
||||
unsigned int nr_llcs;
|
||||
|
||||
WARN_ON(!(sd_llc->flags & SD_SHARE_LLC));
|
||||
WARN_ON(!sd_llc->parent);
|
||||
|
||||
/*
|
||||
* For a single LLC per node, allow an
|
||||
* imbalance up to 12.5% of the node. This is
|
||||
* arbitrary cutoff based two factors -- SMT and
|
||||
* memory channels. For SMT-2, the intent is to
|
||||
* avoid premature sharing of HT resources but
|
||||
* SMT-4 or SMT-8 *may* benefit from a different
|
||||
* cutoff. For memory channels, this is a very
|
||||
* rough estimate of how many channels may be
|
||||
* active and is based on recent CPUs with
|
||||
* many cores.
|
||||
*
|
||||
* For multiple LLCs, allow an imbalance
|
||||
* until multiple tasks would share an LLC
|
||||
* on one node while LLCs on another node
|
||||
* remain idle. This assumes that there are
|
||||
* enough logical CPUs per LLC to avoid SMT
|
||||
* factors and that there is a correlation
|
||||
* between LLCs and memory channels.
|
||||
*/
|
||||
nr_llcs = sd_llc->parent->span_weight / sd_llc->span_weight;
|
||||
if (nr_llcs == 1)
|
||||
imb = sd_llc->parent->span_weight >> 3;
|
||||
else
|
||||
imb = nr_llcs;
|
||||
|
||||
imb = max(1U, imb);
|
||||
sd_llc->parent->imb_numa_nr = imb;
|
||||
|
||||
/*
|
||||
* Set span based on the first NUMA domain.
|
||||
*
|
||||
* NUMA systems always add a NODE domain before
|
||||
* iterating the NUMA domains. Since this is before
|
||||
* degeneration, start from sd_llc's parent's
|
||||
* parent which is the lowest an SD_NUMA domain can
|
||||
* be relative to sd_llc.
|
||||
*/
|
||||
parent = sd_llc->parent->parent;
|
||||
while (parent && !(parent->flags & SD_NUMA))
|
||||
parent = parent->parent;
|
||||
|
||||
imb_span = parent ? parent->span_weight : sd_llc->parent->span_weight;
|
||||
|
||||
/* Update the upper remainder of the topology */
|
||||
parent = sd_llc->parent;
|
||||
while (parent) {
|
||||
int factor = max(1U, (parent->span_weight / imb_span));
|
||||
|
||||
parent->imb_numa_nr = imb * factor;
|
||||
parent = parent->parent;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Build sched domains for a given set of CPUs and attach the sched domains
|
||||
* to the individual CPUs
|
||||
@@ -2605,61 +2707,28 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculate an allowed NUMA imbalance such that LLCs do not get
|
||||
* imbalanced.
|
||||
*/
|
||||
for_each_cpu(i, cpu_map) {
|
||||
unsigned int imb = 0;
|
||||
unsigned int imb_span = 1;
|
||||
sd = *per_cpu_ptr(d.sd, i);
|
||||
if (!sd)
|
||||
continue;
|
||||
|
||||
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
|
||||
struct sched_domain *child = sd->child;
|
||||
/* First, find the topmost SD_SHARE_LLC domain */
|
||||
while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
|
||||
sd = sd->parent;
|
||||
|
||||
if (!(sd->flags & SD_SHARE_LLC) && child &&
|
||||
(child->flags & SD_SHARE_LLC)) {
|
||||
struct sched_domain __rcu *top_p;
|
||||
unsigned int nr_llcs;
|
||||
if (sd->flags & SD_SHARE_LLC) {
|
||||
int sd_id = cpumask_first(sched_domain_span(sd));
|
||||
|
||||
/*
|
||||
* For a single LLC per node, allow an
|
||||
* imbalance up to 12.5% of the node. This is
|
||||
* arbitrary cutoff based two factors -- SMT and
|
||||
* memory channels. For SMT-2, the intent is to
|
||||
* avoid premature sharing of HT resources but
|
||||
* SMT-4 or SMT-8 *may* benefit from a different
|
||||
* cutoff. For memory channels, this is a very
|
||||
* rough estimate of how many channels may be
|
||||
* active and is based on recent CPUs with
|
||||
* many cores.
|
||||
*
|
||||
* For multiple LLCs, allow an imbalance
|
||||
* until multiple tasks would share an LLC
|
||||
* on one node while LLCs on another node
|
||||
* remain idle. This assumes that there are
|
||||
* enough logical CPUs per LLC to avoid SMT
|
||||
* factors and that there is a correlation
|
||||
* between LLCs and memory channels.
|
||||
*/
|
||||
nr_llcs = sd->span_weight / child->span_weight;
|
||||
if (nr_llcs == 1)
|
||||
imb = sd->span_weight >> 3;
|
||||
else
|
||||
imb = nr_llcs;
|
||||
imb = max(1U, imb);
|
||||
sd->imb_numa_nr = imb;
|
||||
sd->shared = *per_cpu_ptr(d.sds, sd_id);
|
||||
atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
|
||||
atomic_inc(&sd->shared->ref);
|
||||
|
||||
/* Set span based on the first NUMA domain. */
|
||||
top_p = sd->parent;
|
||||
while (top_p && !(top_p->flags & SD_NUMA)) {
|
||||
top_p = top_p->parent;
|
||||
}
|
||||
imb_span = top_p ? top_p->span_weight : sd->span_weight;
|
||||
} else {
|
||||
int factor = max(1U, (sd->span_weight / imb_span));
|
||||
|
||||
sd->imb_numa_nr = imb * factor;
|
||||
}
|
||||
/*
|
||||
* In presence of higher domains, adjust the
|
||||
* NUMA imbalance stats for the hierarchy.
|
||||
*/
|
||||
if (IS_ENABLED(CONFIG_NUMA) && sd->parent)
|
||||
adjust_numa_imbalance(sd);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2668,10 +2737,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
|
||||
if (!cpumask_test_cpu(i, cpu_map))
|
||||
continue;
|
||||
|
||||
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
|
||||
claim_allocations(i, sd);
|
||||
claim_allocations(i, &d);
|
||||
|
||||
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent)
|
||||
init_sched_groups_capacity(i, sd);
|
||||
}
|
||||
}
|
||||
|
||||
/* Attach the domains */
|
||||
|
||||
Reference in New Issue
Block a user