Merge branch 'sched/urgent' into sched/core, to pick up fixes and refresh the branch

Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Ingo Molnar
2023-10-07 11:32:24 +02:00
950 changed files with 8580 additions and 5108 deletions

View File

@@ -8501,7 +8501,7 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
tname = btf_name_by_offset(btf, walk_type->name_off);
ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix);
if (ret < 0)
if (ret >= sizeof(safe_tname))
return false;
safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info));

View File

@@ -785,7 +785,8 @@ found:
* to descendants
* @cgrp: The cgroup which descendants to traverse
* @link: A link for which to replace BPF program
* @type: Type of attach operation
* @new_prog: &struct bpf_prog for the target BPF program with its refcnt
* incremented
*
* Must be called with cgroup_mutex held.
*/
@@ -1334,7 +1335,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
* @sk: The socket sending or receiving traffic
* @skb: The skb that is being sent or received
* @type: The type of program to be executed
* @atype: The type of program to be executed
*
* If no socket is passed, or the socket is not of type INET or INET6,
* this function does nothing and returns 0.
@@ -1424,7 +1425,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
/**
* __cgroup_bpf_run_filter_sk() - Run a program on a sock
* @sk: sock structure to manipulate
* @type: The type of program to be executed
* @atype: The type of program to be executed
*
* socket passed is expected to be of type INET or INET6.
*
@@ -1449,7 +1450,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
* provided by user sockaddr
* @sk: sock struct that will use sockaddr
* @uaddr: sockaddr struct provided by user
* @type: The type of program to be executed
* @atype: The type of program to be executed
* @t_ctx: Pointer to attach type specific context
* @flags: Pointer to u32 which contains higher bits of BPF program
* return value (OR'ed together).
@@ -1496,7 +1497,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
* @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
* sk with connection information (IP addresses, etc.) May not contain
* cgroup info if it is a req sock.
* @type: The type of program to be executed
* @atype: The type of program to be executed
*
* socket passed is expected to be of type INET or INET6.
*
@@ -1670,7 +1671,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
* @ppos: value-result argument: value is position at which read from or write
* to sysctl is happening, result is new position if program overrode it,
* initial value otherwise
* @type: type of program to be executed
* @atype: type of program to be executed
*
* Program is run when sysctl is being accessed, either read or written, and
* can allow or deny such access.

View File

@@ -459,8 +459,7 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c)
* Typical case will be between 11K and 116K closer to 11K.
* bpf progs can and should share bpf_mem_cache when possible.
*/
static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
static void init_refill_work(struct bpf_mem_cache *c)
{
init_irq_work(&c->refill_work, bpf_mem_refill);
if (c->unit_size <= 256) {
@@ -476,7 +475,10 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
c->high_watermark = max(96 * 256 / c->unit_size, 3);
}
c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
}
static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
{
/* To avoid consuming memory assume that 1st run of bpf
* prog won't be doing more than 4 map_update_elem from
* irq disabled region
@@ -484,6 +486,31 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false);
}
/*
 * Verify that the first object on @c's free list has the size the cache
 * expects.  @idx is only used to identify the cache in the warning text.
 *
 * Returns 0 when the check passes or is skipped (per-cpu cache, or empty
 * free list) and -EINVAL when ksize() disagrees with c->unit_size.
 */
static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx)
{
	struct llist_node *head;
	unsigned int actual;

	/* For per-cpu allocator, the size of free objects in free list doesn't
	 * match with unit_size and now there is no way to get the size of
	 * per-cpu pointer saved in free object, so just skip the checking.
	 */
	if (c->percpu_size)
		return 0;

	/* Nothing to inspect when the free list is empty. */
	head = c->free_llist.first;
	if (!head)
		return 0;

	actual = ksize(head);
	if (actual == c->unit_size)
		return 0;

	WARN_ONCE(1, "bpf_mem_cache[%u]: unexpected object size %u, expect %u\n",
		  idx, actual, c->unit_size);
	return -EINVAL;
}
/* When size != 0 bpf_mem_cache for each cpu.
* This is typical bpf hash map use case when all elements have equal size.
*
@@ -494,10 +521,10 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
int cpu, i, err, unit_size, percpu_size = 0;
struct bpf_mem_caches *cc, __percpu *pcc;
struct bpf_mem_cache *c, __percpu *pc;
struct obj_cgroup *objcg = NULL;
int cpu, i, unit_size, percpu_size = 0;
if (size) {
pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
@@ -521,6 +548,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
c->objcg = objcg;
c->percpu_size = percpu_size;
c->tgt = c;
init_refill_work(c);
prefill_mem_cache(c, cpu);
}
ma->cache = pc;
@@ -534,6 +562,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
if (!pcc)
return -ENOMEM;
err = 0;
#ifdef CONFIG_MEMCG_KMEM
objcg = get_obj_cgroup_from_current();
#endif
@@ -544,11 +573,30 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
c->unit_size = sizes[i];
c->objcg = objcg;
c->tgt = c;
init_refill_work(c);
/* Another bpf_mem_cache will be used when allocating
* c->unit_size in bpf_mem_alloc(), so doesn't prefill
* for the bpf_mem_cache because these free objects will
* never be used.
*/
if (i != bpf_mem_cache_idx(c->unit_size))
continue;
prefill_mem_cache(c, cpu);
err = check_obj_size(c, i);
if (err)
goto out;
}
}
out:
ma->caches = pcc;
return 0;
/* refill_work is either zeroed or initialized, so it is safe to
* call irq_work_sync().
*/
if (err)
bpf_mem_alloc_destroy(ma);
return err;
}
static void drain_mem_cache(struct bpf_mem_cache *c)
@@ -916,3 +964,41 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
/* Most of the logic is taken from setup_kmalloc_cache_index_table() */
static __init int bpf_mem_cache_adjust_size(void)
{
	const unsigned int min_size = KMALLOC_MIN_SIZE;
	unsigned int sz, idx;

	/* Normally KMALLOC_MIN_SIZE is 8-bytes, but it can be
	 * up-to 256-bytes.
	 */
	idx = min_size <= 192 ? size_index[(min_size - 1) / 8] :
				fls(min_size - 1) - 1;

	/* Point every size below the minimum at the index chosen above. */
	for (sz = 8; sz < min_size && sz <= 192; sz += 8)
		size_index[(sz - 1) / 8] = idx;

	/* The minimal alignment is 64-bytes, so disable 96-bytes cache and
	 * use 128-bytes cache instead.
	 */
	if (min_size >= 64) {
		idx = size_index[(128 - 1) / 8];
		for (sz = 64 + 8; sz <= 96; sz += 8)
			size_index[(sz - 1) / 8] = idx;
	}

	/* The minimal alignment is 128-bytes, so disable 192-bytes cache and
	 * use 256-bytes cache instead.
	 */
	if (min_size >= 128) {
		idx = fls(256 - 1) - 1;
		for (sz = 128 + 8; sz <= 192; sz += 8)
			size_index[(sz - 1) / 8] = idx;
	}

	return 0;
}
subsys_initcall(bpf_mem_cache_adjust_size);

View File

@@ -199,12 +199,14 @@ static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *n
offload->netdev = netdev;
ondev = bpf_offload_find_netdev(offload->netdev);
/* When program is offloaded require presence of "true"
* bpf_offload_netdev, avoid the one created for !ondev case below.
*/
if (bpf_prog_is_offloaded(prog->aux) && (!ondev || !ondev->offdev)) {
err = -EINVAL;
goto err_free;
}
if (!ondev) {
if (bpf_prog_is_offloaded(prog->aux)) {
err = -EINVAL;
goto err_free;
}
/* When only binding to the device, explicitly
* create an entry in the hashtable.
*/

View File

@@ -98,7 +98,12 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
int err = 0;
void *ptr;
raw_spin_lock_irqsave(&qs->lock, flags);
if (in_nmi()) {
if (!raw_spin_trylock_irqsave(&qs->lock, flags))
return -EBUSY;
} else {
raw_spin_lock_irqsave(&qs->lock, flags);
}
if (queue_stack_map_is_empty(qs)) {
memset(value, 0, qs->map.value_size);
@@ -128,7 +133,12 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
void *ptr;
u32 index;
raw_spin_lock_irqsave(&qs->lock, flags);
if (in_nmi()) {
if (!raw_spin_trylock_irqsave(&qs->lock, flags))
return -EBUSY;
} else {
raw_spin_lock_irqsave(&qs->lock, flags);
}
if (queue_stack_map_is_empty(qs)) {
memset(value, 0, qs->map.value_size);
@@ -193,7 +203,12 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
if (flags & BPF_NOEXIST || flags > BPF_EXIST)
return -EINVAL;
raw_spin_lock_irqsave(&qs->lock, irq_flags);
if (in_nmi()) {
if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags))
return -EBUSY;
} else {
raw_spin_lock_irqsave(&qs->lock, irq_flags);
}
if (queue_stack_map_is_full(qs)) {
if (!replace) {

View File

@@ -739,6 +739,17 @@ subsys_initcall(crash_notes_memory_init);
#undef pr_fmt
#define pr_fmt(fmt) "crash hp: " fmt
/*
* Unlike kexec/kdump loading/unloading/jumping/shrinking, which happen
* rarely, many crash hotplug events may be notified within a short
* period, e.g. when a memory board is hot-added and its memory regions
* are onlined. So the mutex __crash_hotplug_lock is used to serialize
* the crash hotplug handling specifically.
*/
DEFINE_MUTEX(__crash_hotplug_lock);
#define crash_hotplug_lock() mutex_lock(&__crash_hotplug_lock)
#define crash_hotplug_unlock() mutex_unlock(&__crash_hotplug_lock)
/*
* This routine is utilized when the crash_hotplug sysfs node is read.
* It reflects the kernel's ability/permission to update the crash
@@ -748,9 +759,11 @@ int crash_check_update_elfcorehdr(void)
{
int rc = 0;
crash_hotplug_lock();
/* Obtain lock while reading crash information */
if (!kexec_trylock()) {
pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
crash_hotplug_unlock();
return 0;
}
if (kexec_crash_image) {
@@ -761,6 +774,7 @@ int crash_check_update_elfcorehdr(void)
}
/* Release lock now that update complete */
kexec_unlock();
crash_hotplug_unlock();
return rc;
}
@@ -783,9 +797,11 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
{
struct kimage *image;
crash_hotplug_lock();
/* Obtain lock while changing crash information */
if (!kexec_trylock()) {
pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
crash_hotplug_unlock();
return;
}
@@ -852,6 +868,7 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
out:
/* Release lock now that update complete */
kexec_unlock();
crash_hotplug_unlock();
}
static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)

View File

@@ -399,14 +399,13 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
}
mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area),
default_nareas), SMP_CACHE_BYTES);
nareas), SMP_CACHE_BYTES);
if (!mem->areas) {
pr_warn("%s: Failed to allocate mem->areas.\n", __func__);
return;
}
swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false,
default_nareas);
swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false, nareas);
add_mem_pool(&io_tlb_default_mem, mem);
if (flags & SWIOTLB_VERBOSE)
@@ -729,9 +728,6 @@ static void swiotlb_dyn_alloc(struct work_struct *work)
}
add_mem_pool(mem, pool);
/* Pairs with smp_rmb() in is_swiotlb_buffer(). */
smp_wmb();
}
/**
@@ -1152,9 +1148,26 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags);
found:
dev->dma_uses_io_tlb = true;
/* Pairs with smp_rmb() in is_swiotlb_buffer() */
smp_wmb();
WRITE_ONCE(dev->dma_uses_io_tlb, true);
/*
* The general barrier orders reads and writes against a presumed store
* of the SWIOTLB buffer address by a device driver (to a driver private
* data structure). It serves two purposes.
*
* First, the store to dev->dma_uses_io_tlb must be ordered before the
* presumed store. This guarantees that the returned buffer address
* cannot be passed to another CPU before updating dev->dma_uses_io_tlb.
*
* Second, the load from mem->pools must be ordered before the same
* presumed store. This guarantees that the returned buffer address
* cannot be observed by another CPU before an update of the RCU list
* that was made by swiotlb_dyn_alloc() on a third CPU (cf. multicopy
* atomicity).
*
* See also the comment in is_swiotlb_buffer().
*/
smp_mb();
*retpool = pool;
return index;

View File

@@ -697,6 +697,7 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint,
if (!fmt) {
__warn(file, line, __builtin_return_address(0), taint,
NULL, NULL);
warn_rcu_exit(rcu);
return;
}

View File

@@ -609,7 +609,7 @@ int pidfd_create(struct pid *pid, unsigned int flags)
}
/**
* pidfd_open() - Open new pid file descriptor.
* sys_pidfd_open() - Open new pid file descriptor.
*
* @pid: pid for which to retrieve a pidfd
* @flags: flags to pass

View File

@@ -786,9 +786,9 @@ int hibernate(void)
unlock_device_hotplug();
if (snapshot_test) {
pm_pr_dbg("Checking hibernation image\n");
error = swsusp_check(snapshot_test);
error = swsusp_check(false);
if (!error)
error = load_image_and_restore(snapshot_test);
error = load_image_and_restore(false);
}
thaw_processes();
@@ -945,14 +945,14 @@ static int software_resume(void)
pm_pr_dbg("Looking for hibernation image.\n");
mutex_lock(&system_transition_mutex);
error = swsusp_check(false);
error = swsusp_check(true);
if (error)
goto Unlock;
/* The snapshot device should not be opened while we're running */
if (!hibernate_acquire()) {
error = -EBUSY;
swsusp_close(false);
swsusp_close(true);
goto Unlock;
}
@@ -973,7 +973,7 @@ static int software_resume(void)
goto Close_Finish;
}
error = load_image_and_restore(false);
error = load_image_and_restore(true);
thaw_processes();
Finish:
pm_notifier_call_chain(PM_POST_RESTORE);
@@ -987,7 +987,7 @@ static int software_resume(void)
pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
return error;
Close_Finish:
swsusp_close(false);
swsusp_close(true);
goto Finish;
}

View File

@@ -168,11 +168,11 @@ extern int swsusp_swap_in_use(void);
#define SF_HW_SIG 8
/* kernel/power/hibernate.c */
int swsusp_check(bool snapshot_test);
int swsusp_check(bool exclusive);
extern void swsusp_free(void);
extern int swsusp_read(unsigned int *flags_p);
extern int swsusp_write(unsigned int flags);
void swsusp_close(bool snapshot_test);
void swsusp_close(bool exclusive);
#ifdef CONFIG_SUSPEND
extern int swsusp_unmark(void);
#endif

View File

@@ -1513,12 +1513,13 @@ end:
static void *swsusp_holder;
/**
* swsusp_check - Check for swsusp signature in the resume device
* swsusp_check - Check for swsusp signature in the resume device
* @exclusive: Open the resume device exclusively.
*/
int swsusp_check(bool snapshot_test)
int swsusp_check(bool exclusive)
{
void *holder = snapshot_test ? &swsusp_holder : NULL;
void *holder = exclusive ? &swsusp_holder : NULL;
int error;
hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, BLK_OPEN_READ,
@@ -1563,17 +1564,18 @@ put:
}
/**
* swsusp_close - close swap device.
* swsusp_close - close swap device.
* @exclusive: Close the resume device which is exclusively opened.
*/
void swsusp_close(bool snapshot_test)
void swsusp_close(bool exclusive)
{
if (IS_ERR(hib_resume_bdev)) {
pr_debug("Image device not initialised\n");
return;
}
blkdev_put(hib_resume_bdev, snapshot_test ? &swsusp_holder : NULL);
blkdev_put(hib_resume_bdev, exclusive ? &swsusp_holder : NULL);
}
/**

View File

@@ -9185,7 +9185,7 @@ void __init init_idle(struct task_struct *idle, int cpu)
* PF_KTHREAD should already be set at this point; regardless, make it
* look like a proper per-CPU kthread.
*/
idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY;
kthread_set_per_cpu(idle, cpu);
#ifdef CONFIG_SMP

View File

@@ -350,7 +350,8 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
* Except when the rq is capped by uclamp_max.
*/
if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) {
sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq &&
!sg_policy->need_freq_update) {
next_f = sg_policy->next_freq;
/* Restore cached freq as next_freq has changed */

View File

@@ -101,6 +101,7 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
if (lowest_mask) {
cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
/*
* We have to ensure that we have at least one bit

View File

@@ -651,6 +651,10 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
}
/*
* Specifically: avg_vruntime() + 0 must result in entity_eligible() := true
* For this to be so, the result of this function must have a left bias.
*/
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
@@ -664,8 +668,12 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
load += weight;
}
if (load)
if (load) {
/* sign flips effective floor / ceil */
if (avg < 0)
avg -= (load - 1);
avg = div_s64(avg, load);
}
return cfs_rq->min_vruntime + avg;
}
@@ -4889,10 +4897,12 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u64 vslice = calc_delta_fair(se->slice, se);
u64 vruntime = avg_vruntime(cfs_rq);
u64 vslice, vruntime = avg_vruntime(cfs_rq);
s64 lag = 0;
se->slice = sysctl_sched_base_slice;
vslice = calc_delta_fair(se->slice, se);
/*
* Due to how V is constructed as the weighted average of entities,
* adding tasks with positive lag, or removing tasks with negative lag
@@ -6599,6 +6609,7 @@ dequeue_throttle:
/* Working cpumask for: load_balance, load_balance_newidle. */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
#ifdef CONFIG_NO_HZ_COMMON
@@ -9563,7 +9574,7 @@ static inline long sibling_imbalance(struct lb_env *env,
imbalance /= ncores_local + ncores_busiest;
/* Take advantage of resource in an empty sched group */
if (imbalance == 0 && local->sum_nr_running == 0 &&
if (imbalance <= 1 && local->sum_nr_running == 0 &&
busiest->sum_nr_running > 1)
imbalance = 2;
@@ -9751,6 +9762,15 @@ static bool update_sd_pick_busiest(struct lb_env *env,
break;
case group_smt_balance:
/*
* Check if we have spare CPUs on either SMT group to
* choose between has_spare and fully_busy handling.
*/
if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0)
goto has_spare;
fallthrough;
case group_fully_busy:
/*
* Select the fully busy group with highest avg_load. In
@@ -9790,6 +9810,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
else
return true;
}
has_spare:
/*
* Select not overloaded group with lowest number of idle cpus
@@ -10901,6 +10922,7 @@ static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
{
struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask);
struct sched_group *sg = env->sd->groups;
int cpu, idle_smt = -1;
@@ -10924,8 +10946,9 @@ static int should_we_balance(struct lb_env *env)
return 1;
}
cpumask_copy(swb_cpus, group_balance_mask(sg));
/* Try to find first idle CPU */
for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
for_each_cpu_and(cpu, swb_cpus, env->cpus) {
if (!idle_cpu(cpu))
continue;
@@ -10937,6 +10960,14 @@ static int should_we_balance(struct lb_env *env)
if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
if (idle_smt == -1)
idle_smt = cpu;
/*
* If the core is not idle, and the first idle SMT sibling has
* been found, then there is no need to check the other
* SMT siblings for idleness:
*/
#ifdef CONFIG_SCHED_SMT
cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu));
#endif
continue;
}
@@ -12902,6 +12933,8 @@ __init void init_sched_fair_class(void)
for_each_possible_cpu(i) {
zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
GFP_KERNEL, cpu_to_node(i));
#ifdef CONFIG_CFS_BANDWIDTH
INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));

View File

@@ -373,6 +373,7 @@ EXPORT_SYMBOL_GPL(play_idle_precise);
void cpu_startup_entry(enum cpuhp_state state)
{
current->flags |= PF_IDLE;
arch_cpu_idle_prepare();
cpuhp_online_idle(state);
while (1)

View File

@@ -78,6 +78,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
* task_work_cancel_match - cancel a pending work added by task_work_add()
* @task: the task which should execute the work
* @match: match function to call
* @data: data to be passed in to match function
*
* RETURNS:
* The found work or NULL if not found.

View File

@@ -2853,6 +2853,17 @@ static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u3
return arr.mods_cnt;
}
/*
 * Check that each of the @cnt addresses in @addrs passes
 * within_error_injection_list().
 *
 * Returns 0 when all of them do (including when @cnt is 0), or -EINVAL as
 * soon as one address fails the check.
 */
static int addrs_check_error_injection_list(unsigned long *addrs, u32 cnt)
{
	u32 idx = 0;

	while (idx < cnt) {
		if (!within_error_injection_list(addrs[idx]))
			return -EINVAL;
		idx++;
	}

	return 0;
}
int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
struct bpf_kprobe_multi_link *link = NULL;
@@ -2930,6 +2941,11 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
goto error;
}
if (prog->kprobe_override && addrs_check_error_injection_list(addrs, cnt)) {
err = -EINVAL;
goto error;
}
link = kzalloc(sizeof(*link), GFP_KERNEL);
if (!link) {
err = -ENOMEM;
@@ -3207,8 +3223,10 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
rcu_read_lock();
task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
rcu_read_unlock();
if (!task)
if (!task) {
err = -ESRCH;
goto error_path_put;
}
}
err = -ENOMEM;

View File

@@ -354,6 +354,11 @@ static void rb_init_page(struct buffer_data_page *bpage)
local_set(&bpage->commit, 0);
}
/* Return the commit counter of the data page attached to @bpage, read with local_read(). */
static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
{
return local_read(&bpage->page->commit);
}
static void free_buffer_page(struct buffer_page *bpage)
{
free_page((unsigned long)bpage->page);
@@ -1132,6 +1137,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
if (full) {
poll_wait(filp, &work->full_waiters, poll_table);
work->full_waiters_pending = true;
if (!cpu_buffer->shortest_full ||
cpu_buffer->shortest_full > full)
cpu_buffer->shortest_full = full;
} else {
poll_wait(filp, &work->waiters, poll_table);
work->waiters_pending = true;
@@ -2003,7 +2011,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
* Increment overrun to account for the lost events.
*/
local_add(page_entries, &cpu_buffer->overrun);
local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
local_sub(rb_page_commit(to_remove_page), &cpu_buffer->entries_bytes);
local_inc(&cpu_buffer->pages_lost);
}
@@ -2198,6 +2206,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
err = -ENOMEM;
goto out_err;
}
cond_resched();
}
cpus_read_lock();
@@ -2365,11 +2375,6 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->reader_page->read);
}
static __always_inline unsigned rb_page_commit(struct buffer_page *bpage)
{
return local_read(&bpage->page->commit);
}
static struct ring_buffer_event *
rb_iter_head_event(struct ring_buffer_iter *iter)
{
@@ -2388,6 +2393,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
*/
commit = rb_page_commit(iter_head_page);
smp_rmb();
/* An event needs to be at least 8 bytes in size */
if (iter->head > commit - 8)
goto reset;
event = __rb_page_index(iter_head_page, iter->head);
length = rb_event_length(event);
@@ -2510,7 +2520,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
* the counters.
*/
local_add(entries, &cpu_buffer->overrun);
local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes);
local_inc(&cpu_buffer->pages_lost);
/*
@@ -2653,9 +2663,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
event = __rb_page_index(tail_page, tail);
/* account for padding bytes */
local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
/*
* Save the original length to the meta data.
* This will be used by the reader to add lost event
@@ -2669,7 +2676,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
* write counter enough to allow another writer to slip
* in on this page.
* We put in a discarded commit instead, to make sure
* that this space is not used again.
* that this space is not used again, and this space will
* not be accounted into 'entries_bytes'.
*
* If we are less than the minimum size, we don't need to
* worry about it.
@@ -2694,6 +2702,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* time delta must be non zero */
event->time_delta = 1;
/* account for padding bytes */
local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
/* Make sure the padding is visible before the tail_page->write update */
smp_wmb();
@@ -4208,7 +4219,7 @@ u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu)
EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
/**
* ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
* ring_buffer_bytes_cpu - get the number of bytes unconsumed in a cpu buffer
* @buffer: The ring buffer
* @cpu: The per CPU buffer to read from.
*/
@@ -4716,6 +4727,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
length = rb_event_length(event);
cpu_buffer->reader_page->read += length;
cpu_buffer->read_bytes += length;
}
static void rb_advance_iter(struct ring_buffer_iter *iter)
@@ -5809,7 +5821,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
} else {
/* update the entry counter */
cpu_buffer->read += rb_page_entries(reader);
cpu_buffer->read_bytes += BUF_PAGE_SIZE;
cpu_buffer->read_bytes += rb_page_commit(reader);
/* swap the pages */
rb_init_page(bpage);

View File

@@ -1772,7 +1772,7 @@ static void trace_create_maxlat_file(struct trace_array *tr,
init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq);
tr->d_max_latency = trace_create_file("tracing_max_latency",
TRACE_MODE_WRITE,
d_tracer, &tr->max_latency,
d_tracer, tr,
&tracing_max_lat_fops);
}
@@ -1805,7 +1805,7 @@ void latency_fsnotify(struct trace_array *tr)
#define trace_create_maxlat_file(tr, d_tracer) \
trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \
d_tracer, &tr->max_latency, &tracing_max_lat_fops)
d_tracer, tr, &tracing_max_lat_fops)
#endif
@@ -4973,6 +4973,33 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp)
return 0;
}
/*
 * Open handler for files whose inode private pointer is a trace_event_file.
 * Runs tracing_check_open_get_tr() on the file's trace array and, only if
 * that succeeds, publishes the private pointer through filp->private_data.
 * tracing_release_file_tr() does the matching trace_array_put().
 *
 * Returns 0 on success or the error from tracing_check_open_get_tr().
 */
int tracing_open_file_tr(struct inode *inode, struct file *filp)
{
	struct trace_event_file *event_file = inode->i_private;
	int err = tracing_check_open_get_tr(event_file->tr);

	if (!err)
		filp->private_data = inode->i_private;

	return err;
}
/*
 * Release handler for files whose inode private pointer is a
 * trace_event_file: calls trace_array_put() on the file's trace array.
 * Always returns 0.
 */
int tracing_release_file_tr(struct inode *inode, struct file *filp)
{
	struct trace_event_file *event_file = inode->i_private;
	struct trace_array *tr = event_file->tr;

	trace_array_put(tr);

	return 0;
}
static int tracing_mark_open(struct inode *inode, struct file *filp)
{
stream_open(inode, filp);
@@ -6691,14 +6718,18 @@ static ssize_t
tracing_max_lat_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos);
struct trace_array *tr = filp->private_data;
return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos);
}
static ssize_t
tracing_max_lat_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos);
struct trace_array *tr = filp->private_data;
return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos);
}
#endif
@@ -7752,18 +7783,20 @@ static const struct file_operations tracing_thresh_fops = {
#ifdef CONFIG_TRACER_MAX_TRACE
static const struct file_operations tracing_max_lat_fops = {
.open = tracing_open_generic,
.open = tracing_open_generic_tr,
.read = tracing_max_lat_read,
.write = tracing_max_lat_write,
.llseek = generic_file_llseek,
.release = tracing_release_generic_tr,
};
#endif
static const struct file_operations set_tracer_fops = {
.open = tracing_open_generic,
.open = tracing_open_generic_tr,
.read = tracing_set_trace_read,
.write = tracing_set_trace_write,
.llseek = generic_file_llseek,
.release = tracing_release_generic_tr,
};
static const struct file_operations tracing_pipe_fops = {
@@ -8956,12 +8989,33 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
return cnt;
}
/*
 * Open handler for option files; the inode private pointer is a
 * trace_option_dentry.  Runs tracing_check_open_get_tr() on its trace array
 * and, on success, stores the private pointer in filp->private_data.
 *
 * Returns 0 on success or the error from tracing_check_open_get_tr().
 */
static int tracing_open_options(struct inode *inode, struct file *filp)
{
	struct trace_option_dentry *opt_entry = inode->i_private;
	int err = tracing_check_open_get_tr(opt_entry->tr);

	if (err)
		return err;

	filp->private_data = inode->i_private;
	return 0;
}
/*
 * Release handler for option files: calls trace_array_put() on the trace
 * array of the trace_option_dentry kept in file->private_data.
 * Always returns 0.
 */
static int tracing_release_options(struct inode *inode, struct file *file)
{
	struct trace_option_dentry *opt_entry = file->private_data;

	trace_array_put(opt_entry->tr);

	return 0;
}
static const struct file_operations trace_options_fops = {
.open = tracing_open_generic,
.open = tracing_open_options,
.read = trace_options_read,
.write = trace_options_write,
.llseek = generic_file_llseek,
.release = tracing_release_options,
};
/*
@@ -9739,8 +9793,8 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
tr, &tracing_mark_fops);
file = __find_event_file(tr, "ftrace", "print");
if (file && file->dir)
trace_create_file("trigger", TRACE_MODE_WRITE, file->dir,
if (file && file->ef)
eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef,
file, &event_trigger_fops);
tr->trace_marker_file = file;

View File

@@ -610,6 +610,8 @@ void tracing_reset_all_online_cpus(void);
void tracing_reset_all_online_cpus_unlocked(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
int tracing_open_generic_tr(struct inode *inode, struct file *filp);
int tracing_open_file_tr(struct inode *inode, struct file *filp);
int tracing_release_file_tr(struct inode *inode, struct file *filp);
bool tracing_is_disabled(void);
bool tracer_tracing_is_on(struct trace_array *tr);
void tracer_tracing_on(struct trace_array *tr);

View File

@@ -992,19 +992,6 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
static void remove_event_file_dir(struct trace_event_file *file)
{
struct dentry *dir = file->dir;
struct dentry *child;
if (dir) {
spin_lock(&dir->d_lock); /* probably unneeded */
list_for_each_entry(child, &dir->d_subdirs, d_child) {
if (d_really_is_positive(child)) /* probably unneeded */
d_inode(child)->i_private = NULL;
}
spin_unlock(&dir->d_lock);
tracefs_remove(dir);
}
eventfs_remove(file->ef);
list_del(&file->list);
remove_subsystem(file->system);
@@ -2103,9 +2090,10 @@ static const struct file_operations ftrace_set_event_notrace_pid_fops = {
};
static const struct file_operations ftrace_enable_fops = {
.open = tracing_open_generic,
.open = tracing_open_file_tr,
.read = event_enable_read,
.write = event_enable_write,
.release = tracing_release_file_tr,
.llseek = default_llseek,
};
@@ -2122,9 +2110,10 @@ static const struct file_operations ftrace_event_id_fops = {
};
static const struct file_operations ftrace_event_filter_fops = {
.open = tracing_open_generic,
.open = tracing_open_file_tr,
.read = event_filter_read,
.write = event_filter_write,
.release = tracing_release_file_tr,
.llseek = default_llseek,
};
@@ -2297,6 +2286,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
{
struct event_subsystem *system, *iter;
struct trace_subsystem_dir *dir;
struct eventfs_file *ef;
int res;
/* First see if we did not already create this dir */
@@ -2329,13 +2319,14 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
} else
__get_system(system);
dir->ef = eventfs_add_subsystem_dir(name, parent);
if (IS_ERR(dir->ef)) {
ef = eventfs_add_subsystem_dir(name, parent);
if (IS_ERR(ef)) {
pr_warn("Failed to create system directory %s\n", name);
__put_system(system);
goto out_free;
}
dir->ef = ef;
dir->tr = tr;
dir->ref_count = 1;
dir->nr_events = 1;
@@ -2415,6 +2406,7 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
struct trace_event_call *call = file->event_call;
struct eventfs_file *ef_subsystem = NULL;
struct trace_array *tr = file->tr;
struct eventfs_file *ef;
const char *name;
int ret;
@@ -2431,12 +2423,14 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
return -ENOMEM;
name = trace_event_name(call);
file->ef = eventfs_add_dir(name, ef_subsystem);
if (IS_ERR(file->ef)) {
ef = eventfs_add_dir(name, ef_subsystem);
if (IS_ERR(ef)) {
pr_warn("Could not create tracefs '%s' directory\n", name);
return -1;
}
file->ef = ef;
if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
eventfs_add_file("enable", TRACE_MODE_WRITE, file->ef, file,
&ftrace_enable_fops);
@@ -2776,6 +2770,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
update_event_fields(call, map[i]);
}
}
cond_resched();
}
up_write(&trace_event_sem);
}

View File

@@ -328,7 +328,8 @@ event_inject_read(struct file *file, char __user *buf, size_t size,
}
const struct file_operations event_inject_fops = {
.open = tracing_open_generic,
.open = tracing_open_file_tr,
.read = event_inject_read,
.write = event_inject_write,
.release = tracing_release_file_tr,
};

View File

@@ -337,7 +337,7 @@ static void print_synth_event_num_val(struct trace_seq *s,
break;
default:
trace_seq_printf(s, print_fmt, name, val, space);
trace_seq_printf(s, print_fmt, name, val->as_u64, space);
break;
}
}

View File

@@ -127,8 +127,13 @@ struct user_event_enabler {
/* Bit 7 is for freeing status of enablement */
#define ENABLE_VAL_FREEING_BIT 7
/* Only duplicate the bit value */
#define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK
/* Bit 8 is for marking 32-bit on 64-bit */
#define ENABLE_VAL_32_ON_64_BIT 8
#define ENABLE_VAL_COMPAT_MASK (1 << ENABLE_VAL_32_ON_64_BIT)
/* Only duplicate the bit and compat values */
#define ENABLE_VAL_DUP_MASK (ENABLE_VAL_BIT_MASK | ENABLE_VAL_COMPAT_MASK)
#define ENABLE_BITOPS(e) (&(e)->values)
@@ -174,6 +179,30 @@ struct user_event_validator {
int flags;
};
/*
 * align_addr_bit - make an enable address long-aligned and fix up its bit
 * @addr:  in/out address; rounded down to a multiple of sizeof(long) when
 *         it is not already aligned
 * @bit:   in/out bit index; moved up by 32 when the endianness rules below
 *         require it
 * @flags: flag word that is tested for ENABLE_VAL_32_ON_64_BIT (only
 *         consulted on big-endian builds)
 *
 * Only 32 and 64 bit values are supported, so the only case that ever needs
 * re-aligning is a 32 bit value on a 64 bit kernel.
 */
static inline void align_addr_bit(unsigned long *addr, int *bit,
				  unsigned long *flags)
{
	if (!IS_ALIGNED(*addr, sizeof(long))) {
		/* Unaligned: step back to the start of the containing long. */
		*addr = ALIGN_DOWN(*addr, sizeof(long));
#ifdef __LITTLE_ENDIAN
		/* On LE the unaligned 32 bit value is always 32 bits in. */
		*bit += 32;
#endif
		/* On BE an unaligned value needs no bit adjustment. */
		return;
	}

#ifdef __BIG_ENDIAN
	/* Aligned 32 bit value on a 64 bit BE kernel needs a 32 bit offset. */
	if (test_bit(ENABLE_VAL_32_ON_64_BIT, flags))
		*bit += 32;
#endif
}
typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
void *tpdata, bool *faulted);
@@ -482,6 +511,7 @@ static int user_event_enabler_write(struct user_event_mm *mm,
unsigned long *ptr;
struct page *page;
void *kaddr;
int bit = ENABLE_BIT(enabler);
int ret;
lockdep_assert_held(&event_mutex);
@@ -497,6 +527,8 @@ static int user_event_enabler_write(struct user_event_mm *mm,
test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))))
return -EBUSY;
align_addr_bit(&uaddr, &bit, ENABLE_BITOPS(enabler));
ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
&page, NULL);
@@ -515,9 +547,9 @@ static int user_event_enabler_write(struct user_event_mm *mm,
/* Update bit atomically, user tracers must be atomic as well */
if (enabler->event && enabler->event->status)
set_bit(ENABLE_BIT(enabler), ptr);
set_bit(bit, ptr);
else
clear_bit(ENABLE_BIT(enabler), ptr);
clear_bit(bit, ptr);
kunmap_local(kaddr);
unpin_user_pages_dirty_lock(&page, 1, true);
@@ -849,6 +881,12 @@ static struct user_event_enabler
enabler->event = user;
enabler->addr = uaddr;
enabler->values = reg->enable_bit;
#if BITS_PER_LONG >= 64
if (reg->enable_size == 4)
set_bit(ENABLE_VAL_32_ON_64_BIT, ENABLE_BITOPS(enabler));
#endif
retry:
/* Prevents state changes from racing with new enablers */
mutex_lock(&event_mutex);
@@ -2377,7 +2415,8 @@ static long user_unreg_get(struct user_unreg __user *ureg,
}
static int user_event_mm_clear_bit(struct user_event_mm *user_mm,
unsigned long uaddr, unsigned char bit)
unsigned long uaddr, unsigned char bit,
unsigned long flags)
{
struct user_event_enabler enabler;
int result;
@@ -2385,7 +2424,7 @@ static int user_event_mm_clear_bit(struct user_event_mm *user_mm,
memset(&enabler, 0, sizeof(enabler));
enabler.addr = uaddr;
enabler.values = bit;
enabler.values = bit | flags;
retry:
/* Prevents state changes from racing with new enablers */
mutex_lock(&event_mutex);
@@ -2415,6 +2454,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
struct user_event_mm *mm = current->user_event_mm;
struct user_event_enabler *enabler, *next;
struct user_unreg reg;
unsigned long flags;
long ret;
ret = user_unreg_get(ureg, &reg);
@@ -2425,6 +2465,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
if (!mm)
return -ENOENT;
flags = 0;
ret = -ENOENT;
/*
@@ -2441,6 +2482,9 @@ static long user_events_ioctl_unreg(unsigned long uarg)
ENABLE_BIT(enabler) == reg.disable_bit) {
set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler));
/* We must keep compat flags for the clear */
flags |= enabler->values & ENABLE_VAL_COMPAT_MASK;
if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)))
user_event_enabler_destroy(enabler, true);
@@ -2454,7 +2498,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
/* Ensure bit is now cleared for user, regardless of event status */
if (!ret)
ret = user_event_mm_clear_bit(mm, reg.disable_addr,
reg.disable_bit);
reg.disable_bit, flags);
return ret;
}

View File

@@ -6535,9 +6535,6 @@ void __init workqueue_init_early(void)
BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE));
wq_update_pod_attrs_buf = alloc_workqueue_attrs();
BUG_ON(!wq_update_pod_attrs_buf);
pt->nr_pods = 1;
cpumask_copy(pt->pod_cpus[0], cpu_possible_mask);
pt->pod_node[0] = NUMA_NO_NODE;
@@ -6605,13 +6602,13 @@ static void __init wq_cpu_intensive_thresh_init(void)
unsigned long thresh;
unsigned long bogo;
pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");
BUG_ON(IS_ERR(pwq_release_worker));
/* if the user set it to a specific value, keep it */
if (wq_cpu_intensive_thresh_us != ULONG_MAX)
return;
pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");
BUG_ON(IS_ERR(pwq_release_worker));
/*
* The default of 10ms is derived from the fact that most modern (as of
* 2023) processors can do a lot in 10ms and that it's just below what