Merge tag 'kthread-for-7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/frederic/linux-dynticks

Pull kthread updates from Frederic Weisbecker:
 "The kthread code provides an infrastructure which manages the
  preferred affinity of unbound kthreads (node or custom cpumask)
  against housekeeping (CPU isolation) constraints and CPU hotplug
  events.

  One crucial missing piece is the handling of cpuset: when an isolated
  partition is created, deleted, or its CPUs updated, all the unbound
  kthreads in the top cpuset become indifferently affine to _all_ the
  non-isolated CPUs, possibly breaking their preferred affinity along
  the way.

  Solve this by performing the kthreads affinity update from cpuset in
  the consolidated kthread code instead, so that preferred affinities
  are honoured and applied against the updated cpuset isolated
  partitions.

  The dispatch of the new isolated cpumasks to timers, workqueues and
  kthreads is performed by housekeeping, as per Tejun's nice
  suggestion.

  As a welcome side effect, HK_TYPE_DOMAIN then integrates both the set
  from boot defined domain isolation (through isolcpus=) and cpuset
  isolated partitions. Housekeeping cpumasks are now modifiable with a
  specific RCU-based synchronization. A big step toward making
  nohz_full= also mutable through cpuset in the future"

* tag 'kthread-for-7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/frederic/linux-dynticks: (33 commits)
  doc: Add housekeeping documentation
  kthread: Document kthread_affine_preferred()
  kthread: Comment on the purpose and placement of kthread_affine_node() call
  kthread: Honour kthreads preferred affinity after cpuset changes
  sched/arm64: Move fallback task cpumask to HK_TYPE_DOMAIN
  sched: Switch the fallback task allowed cpumask to HK_TYPE_DOMAIN
  kthread: Rely on HK_TYPE_DOMAIN for preferred affinity management
  kthread: Include kthreadd to the managed affinity list
  kthread: Include unbound kthreads in the managed affinity list
  kthread: Refine naming of affinity related fields
  PCI: Remove superfluous HK_TYPE_WQ check
  sched/isolation: Remove HK_TYPE_TICK test from cpu_is_isolated()
  cpuset: Remove cpuset_cpu_is_isolated()
  timers/migration: Remove superfluous cpuset isolation test
  cpuset: Propagate cpuset isolation update to timers through housekeeping
  cpuset: Propagate cpuset isolation update to workqueue through housekeeping
  PCI: Flush PCI probe workqueue on cpuset isolated partition change
  sched/isolation: Flush vmstat workqueues on cpuset isolated partition change
  sched/isolation: Flush memcg workqueues on cpuset isolated partition change
  cpuset: Update HK_TYPE_DOMAIN cpumask from cpuset
  ...
This commit is contained in:
Linus Torvalds
2026-02-09 19:57:30 -08:00
28 changed files with 554 additions and 222 deletions

View File

@@ -35,8 +35,8 @@ static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
static LIST_HEAD(kthreads_hotplug);
static DEFINE_MUTEX(kthreads_hotplug_lock);
static LIST_HEAD(kthread_affinity_list);
static DEFINE_MUTEX(kthread_affinity_lock);
struct kthread_create_info
{
@@ -69,7 +69,7 @@ struct kthread {
/* To store the full name if task comm is truncated. */
char *full_name;
struct task_struct *task;
struct list_head hotplug_node;
struct list_head affinity_node;
struct cpumask *preferred_affinity;
};
@@ -128,7 +128,7 @@ bool set_kthread_struct(struct task_struct *p)
init_completion(&kthread->exited);
init_completion(&kthread->parked);
INIT_LIST_HEAD(&kthread->hotplug_node);
INIT_LIST_HEAD(&kthread->affinity_node);
p->vfork_done = &kthread->exited;
kthread->task = p;
@@ -323,10 +323,10 @@ void __noreturn kthread_exit(long result)
{
struct kthread *kthread = to_kthread(current);
kthread->result = result;
if (!list_empty(&kthread->hotplug_node)) {
mutex_lock(&kthreads_hotplug_lock);
list_del(&kthread->hotplug_node);
mutex_unlock(&kthreads_hotplug_lock);
if (!list_empty(&kthread->affinity_node)) {
mutex_lock(&kthread_affinity_lock);
list_del(&kthread->affinity_node);
mutex_unlock(&kthread_affinity_lock);
if (kthread->preferred_affinity) {
kfree(kthread->preferred_affinity);
@@ -362,17 +362,20 @@ static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpum
{
const struct cpumask *pref;
guard(rcu)();
if (kthread->preferred_affinity) {
pref = kthread->preferred_affinity;
} else {
if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE))
return;
pref = cpumask_of_node(kthread->node);
if (kthread->node == NUMA_NO_NODE)
pref = housekeeping_cpumask(HK_TYPE_DOMAIN);
else
pref = cpumask_of_node(kthread->node);
}
cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD));
cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_DOMAIN));
if (cpumask_empty(cpumask))
cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD));
cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
}
static void kthread_affine_node(void)
@@ -380,32 +383,29 @@ static void kthread_affine_node(void)
struct kthread *kthread = to_kthread(current);
cpumask_var_t affinity;
WARN_ON_ONCE(kthread_is_per_cpu(current));
if (WARN_ON_ONCE(kthread_is_per_cpu(current)))
return;
if (kthread->node == NUMA_NO_NODE) {
housekeeping_affine(current, HK_TYPE_KTHREAD);
} else {
if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
WARN_ON_ONCE(1);
return;
}
mutex_lock(&kthreads_hotplug_lock);
WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
/*
* The node cpumask is racy when read from kthread() but:
* - a racing CPU going down will either fail on the subsequent
* call to set_cpus_allowed_ptr() or be migrated to housekeepers
* afterwards by the scheduler.
* - a racing CPU going up will be handled by kthreads_online_cpu()
*/
kthread_fetch_affinity(kthread, affinity);
set_cpus_allowed_ptr(current, affinity);
mutex_unlock(&kthreads_hotplug_lock);
free_cpumask_var(affinity);
if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
WARN_ON_ONCE(1);
return;
}
mutex_lock(&kthread_affinity_lock);
WARN_ON_ONCE(!list_empty(&kthread->affinity_node));
list_add_tail(&kthread->affinity_node, &kthread_affinity_list);
/*
* The node cpumask is racy when read from kthread() but:
* - a racing CPU going down will either fail on the subsequent
* call to set_cpus_allowed_ptr() or be migrated to housekeepers
* afterwards by the scheduler.
* - a racing CPU going up will be handled by kthreads_online_cpu()
*/
kthread_fetch_affinity(kthread, affinity);
set_cpus_allowed_ptr(current, affinity);
mutex_unlock(&kthread_affinity_lock);
free_cpumask_var(affinity);
}
static int kthread(void *_create)
@@ -453,6 +453,10 @@ static int kthread(void *_create)
self->started = 1;
/*
* Apply default node affinity if no call to kthread_bind[_mask]() nor
* kthread_affine_preferred() was issued before the first wake-up.
*/
if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity)
kthread_affine_node();
@@ -820,12 +824,13 @@ int kthreadd(void *unused)
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, comm);
ignore_signals(tsk);
set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD));
set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;
cgroup_init_kthreadd();
kthread_affine_node();
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
if (list_empty(&kthread_create_list))
@@ -851,6 +856,18 @@ int kthreadd(void *unused)
return 0;
}
/**
* kthread_affine_preferred - Define a kthread's preferred affinity
* @p: thread created by kthread_create().
* @mask: preferred mask of CPUs (might not be online, must be possible) for @p
* to run on.
*
* Similar to kthread_bind_mask() except that the affinity is not a requirement
* but rather a preference that can be constrained by CPU isolation or CPU hotplug.
* Must be called before the first wakeup of the kthread.
*
* Returns 0 if the affinity has been applied.
*/
int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
{
struct kthread *kthread = to_kthread(p);
@@ -873,16 +890,16 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
goto out;
}
mutex_lock(&kthreads_hotplug_lock);
mutex_lock(&kthread_affinity_lock);
cpumask_copy(kthread->preferred_affinity, mask);
WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
WARN_ON_ONCE(!list_empty(&kthread->affinity_node));
list_add_tail(&kthread->affinity_node, &kthread_affinity_list);
kthread_fetch_affinity(kthread, affinity);
scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
set_cpus_allowed_force(p, affinity);
mutex_unlock(&kthreads_hotplug_lock);
mutex_unlock(&kthread_affinity_lock);
out:
free_cpumask_var(affinity);
@@ -890,6 +907,67 @@ out:
}
EXPORT_SYMBOL_GPL(kthread_affine_preferred);
static int kthreads_update_affinity(bool force)
{
cpumask_var_t affinity;
struct kthread *k;
int ret;
guard(mutex)(&kthread_affinity_lock);
if (list_empty(&kthread_affinity_list))
return 0;
if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
return -ENOMEM;
ret = 0;
list_for_each_entry(k, &kthread_affinity_list, affinity_node) {
if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) ||
kthread_is_per_cpu(k->task))) {
ret = -EINVAL;
continue;
}
/*
* Unbound kthreads without preferred affinity are already affine
* to housekeeping, whether those CPUs are online or not. So no need
* to handle newly online CPUs for them. However housekeeping changes
* have to be applied.
*
* But kthreads with a preferred affinity or node are different:
* if none of their preferred CPUs are online and part of
* housekeeping at the same time, they must be affine to housekeeping.
* But as soon as one of their preferred CPU becomes online, they must
* be affine to them.
*/
if (force || k->preferred_affinity || k->node != NUMA_NO_NODE) {
kthread_fetch_affinity(k, affinity);
set_cpus_allowed_ptr(k->task, affinity);
}
}
free_cpumask_var(affinity);
return ret;
}
/**
* kthreads_update_housekeeping - Update kthreads affinity on cpuset change
*
* When cpuset changes a partition type to/from "isolated" or updates related
* cpumasks, propagate the housekeeping cpumask change to preferred kthreads
* affinity.
*
* Returns 0 if successful, -ENOMEM if temporary mask couldn't
* be allocated or -EINVAL in case of internal error.
*/
int kthreads_update_housekeeping(void)
{
return kthreads_update_affinity(true);
}
/*
* Re-affine kthreads according to their preferences
* and the newly online CPU. The CPU down part is handled
@@ -899,33 +977,7 @@ EXPORT_SYMBOL_GPL(kthread_affine_preferred);
*/
static int kthreads_online_cpu(unsigned int cpu)
{
cpumask_var_t affinity;
struct kthread *k;
int ret;
guard(mutex)(&kthreads_hotplug_lock);
if (list_empty(&kthreads_hotplug))
return 0;
if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
return -ENOMEM;
ret = 0;
list_for_each_entry(k, &kthreads_hotplug, hotplug_node) {
if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) ||
kthread_is_per_cpu(k->task))) {
ret = -EINVAL;
continue;
}
kthread_fetch_affinity(k, affinity);
set_cpus_allowed_ptr(k->task, affinity);
}
free_cpumask_var(affinity);
return ret;
return kthreads_update_affinity(false);
}
static int kthreads_init(void)