mirror of https://github.com/torvalds/linux.git
Merge tag 'v6.6-rc5' into locking/core, to pick up fixes
Signed-off-by: Ingo Molnar <mingo@kernel.org>
@@ -8501,7 +8501,7 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
tname = btf_name_by_offset(btf, walk_type->name_off);

ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix);
if (ret < 0)
if (ret >= sizeof(safe_tname))
return false;

safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info));

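Editor's note (not part of the commit): the one-line change above replaces a sign test with a truncation test. For a plain in-memory format like this, snprintf() does not return a negative value; what it can do is return a value greater than or equal to the buffer size, meaning the composed name was cut short and must not be looked up. A small userspace sketch of the same pattern (buffer size and the sample names are made up):

    #include <stdio.h>
    #include <stdbool.h>
    #include <stddef.h>

    /* Returns false when "tname + suffix" would not fit, mirroring the
     * truncation check introduced above.
     */
    static bool compose_name(char *buf, size_t len, const char *tname, const char *suffix)
    {
        int ret = snprintf(buf, len, "%s%s", tname, suffix);

        /* ret < 0 would be an output error (not possible here); ret >= len
         * means the result was truncated and the buffer holds a partial name.
         */
        return ret >= 0 && (size_t)ret < len;
    }

    int main(void)
    {
        char buf[16];

        printf("%d\n", compose_name(buf, sizeof(buf), "task_struct", "__safe_fields"));  /* 0: truncated */
        printf("%d\n", compose_name(buf, sizeof(buf), "sk_buff", "__safe"));             /* 1: fits */
        return 0;
    }
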
@@ -785,7 +785,8 @@ found:
* to descendants
* @cgrp: The cgroup which descendants to traverse
* @link: A link for which to replace BPF program
* @type: Type of attach operation
* @new_prog: &struct bpf_prog for the target BPF program with its refcnt
* incremented
*
* Must be called with cgroup_mutex held.
*/

@@ -1334,7 +1335,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
* @sk: The socket sending or receiving traffic
* @skb: The skb that is being sent or received
* @type: The type of program to be executed
* @atype: The type of program to be executed
*
* If no socket is passed, or the socket is not of type INET or INET6,
* this function does nothing and returns 0.

@@ -1424,7 +1425,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
/**
* __cgroup_bpf_run_filter_sk() - Run a program on a sock
* @sk: sock structure to manipulate
* @type: The type of program to be executed
* @atype: The type of program to be executed
*
* socket is passed is expected to be of type INET or INET6.
*

@@ -1449,7 +1450,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
* provided by user sockaddr
* @sk: sock struct that will use sockaddr
* @uaddr: sockaddr struct provided by user
* @type: The type of program to be executed
* @atype: The type of program to be executed
* @t_ctx: Pointer to attach type specific context
* @flags: Pointer to u32 which contains higher bits of BPF program
* return value (OR'ed together).

@@ -1496,7 +1497,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
* @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
* sk with connection information (IP addresses, etc.) May not contain
* cgroup info if it is a req sock.
* @type: The type of program to be executed
* @atype: The type of program to be executed
*
* socket passed is expected to be of type INET or INET6.
*

@@ -1670,7 +1671,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
* @ppos: value-result argument: value is position at which read from or write
* to sysctl is happening, result is new position if program overrode it,
* initial value otherwise
* @type: type of program to be executed
* @atype: type of program to be executed
*
* Program is run when sysctl is being accessed, either read or written, and
* can allow or deny such access.

@@ -459,8 +459,7 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c)
* Typical case will be between 11K and 116K closer to 11K.
* bpf progs can and should share bpf_mem_cache when possible.
*/

static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
static void init_refill_work(struct bpf_mem_cache *c)
{
init_irq_work(&c->refill_work, bpf_mem_refill);
if (c->unit_size <= 256) {

@@ -476,7 +475,10 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
c->high_watermark = max(96 * 256 / c->unit_size, 3);
}
c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
}

static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
{
/* To avoid consuming memory assume that 1st run of bpf
* prog won't be doing more than 4 map_update_elem from
* irq disabled region

@@ -484,6 +486,31 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false);
}

static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx)
{
struct llist_node *first;
unsigned int obj_size;

/* For per-cpu allocator, the size of free objects in free list doesn't
* match with unit_size and now there is no way to get the size of
* per-cpu pointer saved in free object, so just skip the checking.
*/
if (c->percpu_size)
return 0;

first = c->free_llist.first;
if (!first)
return 0;

obj_size = ksize(first);
if (obj_size != c->unit_size) {
WARN_ONCE(1, "bpf_mem_cache[%u]: unexpected object size %u, expect %u\n",
idx, obj_size, c->unit_size);
return -EINVAL;
}
return 0;
}

/* When size != 0 bpf_mem_cache for each cpu.
* This is typical bpf hash map use case when all elements have equal size.
*

@@ -494,10 +521,10 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
int cpu, i, err, unit_size, percpu_size = 0;
struct bpf_mem_caches *cc, __percpu *pcc;
struct bpf_mem_cache *c, __percpu *pc;
struct obj_cgroup *objcg = NULL;
int cpu, i, unit_size, percpu_size = 0;

if (size) {
pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);

@@ -521,6 +548,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
c->objcg = objcg;
c->percpu_size = percpu_size;
c->tgt = c;
init_refill_work(c);
prefill_mem_cache(c, cpu);
}
ma->cache = pc;

@@ -534,6 +562,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
if (!pcc)
return -ENOMEM;
err = 0;
#ifdef CONFIG_MEMCG_KMEM
objcg = get_obj_cgroup_from_current();
#endif

@@ -544,11 +573,30 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
c->unit_size = sizes[i];
c->objcg = objcg;
c->tgt = c;

init_refill_work(c);
/* Another bpf_mem_cache will be used when allocating
* c->unit_size in bpf_mem_alloc(), so doesn't prefill
* for the bpf_mem_cache because these free objects will
* never be used.
*/
if (i != bpf_mem_cache_idx(c->unit_size))
continue;
prefill_mem_cache(c, cpu);
err = check_obj_size(c, i);
if (err)
goto out;
}
}

out:
ma->caches = pcc;
return 0;
/* refill_work is either zeroed or initialized, so it is safe to
* call irq_work_sync().
*/
if (err)
bpf_mem_alloc_destroy(ma);
return err;
}

static void drain_mem_cache(struct bpf_mem_cache *c)

@@ -916,3 +964,35 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)

return !ret ? NULL : ret + LLIST_NODE_SZ;
}

static __init int bpf_mem_cache_adjust_size(void)
{
unsigned int size;

/* Adjusting the indexes in size_index() according to the object_size
* of underlying slab cache, so bpf_mem_alloc() will select a
* bpf_mem_cache with unit_size equal to the object_size of
* the underlying slab cache.
*
* The maximal value of KMALLOC_MIN_SIZE and __kmalloc_minalign() is
* 256-bytes, so only do adjustment for [8-bytes, 192-bytes].
*/
for (size = 192; size >= 8; size -= 8) {
unsigned int kmalloc_size, index;

kmalloc_size = kmalloc_size_roundup(size);
if (kmalloc_size == size)
continue;

if (kmalloc_size <= 192)
index = size_index[(kmalloc_size - 1) / 8];
else
index = fls(kmalloc_size - 1) - 1;
/* Only overwrite if necessary */
if (size_index[(size - 1) / 8] != index)
size_index[(size - 1) / 8] = index;
}

return 0;
}
subsys_initcall(bpf_mem_cache_adjust_size);

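Editor's note (not part of the commit): check_obj_size() and bpf_mem_cache_adjust_size() above both deal with the same fact: kmalloc() may hand back a larger object than requested. On a configuration whose minimum kmalloc size removes the 96- and 192-byte caches, a 96-byte unit_size would silently be backed by 128-byte slab objects, ksize() would then disagree with unit_size, and the new WARN_ONCE() would fire; rewriting size_index[] makes bpf_mem_alloc() pick the cache whose unit_size matches what the slab really hands out. A standalone sketch of that rounding effect, with kmalloc_size_roundup() stubbed to model such a machine (the cache list is an assumption, not the kernel's):

    #include <stdio.h>

    /* Stand-in for kmalloc_size_roundup() on a machine whose smallest kmalloc
     * cache is 64 bytes, so the 96- and 192-byte caches do not exist.
     */
    static unsigned int kmalloc_size_roundup(unsigned int size)
    {
        static const unsigned int caches[] = { 64, 128, 256, 512, 1024, 2048, 4096 };
        unsigned int i;

        for (i = 0; i < sizeof(caches) / sizeof(caches[0]); i++)
            if (size <= caches[i])
                return caches[i];
        return size;
    }

    int main(void)
    {
        unsigned int size;

        /* Walk the same [8, 192] range as the initcall above: whenever the slab
         * rounds a request up, that size must be indexed by the rounded-up value,
         * otherwise ksize() on a freed object will not match the cache's unit_size.
         */
        for (size = 8; size <= 192; size += 8) {
            unsigned int backing = kmalloc_size_roundup(size);

            if (backing != size)
                printf("request %3u -> backed by %4u-byte objects\n", size, backing);
        }
        return 0;
    }
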
@@ -253,6 +253,9 @@ int bpf_mprog_attach(struct bpf_mprog_entry *entry,
goto out;
}
idx = tidx;
} else if (bpf_mprog_total(entry) == bpf_mprog_max()) {
ret = -ERANGE;
goto out;
}
if (flags & BPF_F_BEFORE) {
tidx = bpf_mprog_pos_before(entry, &rtuple);

@@ -199,12 +199,14 @@ static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *n
offload->netdev = netdev;

ondev = bpf_offload_find_netdev(offload->netdev);
/* When program is offloaded require presence of "true"
* bpf_offload_netdev, avoid the one created for !ondev case below.
*/
if (bpf_prog_is_offloaded(prog->aux) && (!ondev || !ondev->offdev)) {
err = -EINVAL;
goto err_free;
}
if (!ondev) {
if (bpf_prog_is_offloaded(prog->aux)) {
err = -EINVAL;
goto err_free;
}

/* When only binding to the device, explicitly
* create an entry in the hashtable.
*/

@@ -98,7 +98,12 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
int err = 0;
void *ptr;

raw_spin_lock_irqsave(&qs->lock, flags);
if (in_nmi()) {
if (!raw_spin_trylock_irqsave(&qs->lock, flags))
return -EBUSY;
} else {
raw_spin_lock_irqsave(&qs->lock, flags);
}

if (queue_stack_map_is_empty(qs)) {
memset(value, 0, qs->map.value_size);

@@ -128,7 +133,12 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
void *ptr;
u32 index;

raw_spin_lock_irqsave(&qs->lock, flags);
if (in_nmi()) {
if (!raw_spin_trylock_irqsave(&qs->lock, flags))
return -EBUSY;
} else {
raw_spin_lock_irqsave(&qs->lock, flags);
}

if (queue_stack_map_is_empty(qs)) {
memset(value, 0, qs->map.value_size);

@@ -193,7 +203,12 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
if (flags & BPF_NOEXIST || flags > BPF_EXIST)
return -EINVAL;

raw_spin_lock_irqsave(&qs->lock, irq_flags);
if (in_nmi()) {
if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags))
return -EBUSY;
} else {
raw_spin_lock_irqsave(&qs->lock, irq_flags);
}

if (queue_stack_map_is_full(qs)) {
if (!replace) {

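Editor's note (not part of the commit): all three hunks above apply the same guard. A BPF program can run in NMI context (for example from a perf event), and the NMI may have interrupted code on this CPU that already holds qs->lock; spinning unconditionally there would deadlock, so the helpers now try the lock once and return -EBUSY. A kernel-style sketch of the pattern as a helper (illustrative refactoring only; the commit open-codes the check at each call site):

    #include <linux/spinlock.h>
    #include <linux/preempt.h>
    #include <linux/errno.h>

    /* Take a map lock the way the hunks above do: never spin in NMI context. */
    static int queue_stack_lock(raw_spinlock_t *lock, unsigned long *flags)
    {
        if (in_nmi()) {
            if (!raw_spin_trylock_irqsave(lock, *flags))
                return -EBUSY;
        } else {
            raw_spin_lock_irqsave(lock, *flags);
        }
        return 0;
    }

With such a helper each call site would reduce to "err = queue_stack_lock(&qs->lock, &flags); if (err) return err;"; the behaviour is the same either way.
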
@@ -4047,11 +4047,9 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
bitmap_from_u64(mask, bt_reg_mask(bt));
for_each_set_bit(i, mask, 32) {
reg = &st->frame[0]->regs[i];
if (reg->type != SCALAR_VALUE) {
bt_clear_reg(bt, i);
continue;
}
reg->precise = true;
bt_clear_reg(bt, i);
if (reg->type == SCALAR_VALUE)
reg->precise = true;
}
return 0;
}

@@ -739,6 +739,17 @@ subsys_initcall(crash_notes_memory_init);
#undef pr_fmt
#define pr_fmt(fmt) "crash hp: " fmt

/*
* Different than kexec/kdump loading/unloading/jumping/shrinking which
* usually rarely happen, there will be many crash hotplug events notified
* during one short period, e.g one memory board is hot added and memory
* regions are online. So mutex lock __crash_hotplug_lock is used to
* serialize the crash hotplug handling specifically.
*/
DEFINE_MUTEX(__crash_hotplug_lock);
#define crash_hotplug_lock() mutex_lock(&__crash_hotplug_lock)
#define crash_hotplug_unlock() mutex_unlock(&__crash_hotplug_lock)

/*
* This routine utilized when the crash_hotplug sysfs node is read.
* It reflects the kernel's ability/permission to update the crash

@@ -748,9 +759,11 @@ int crash_check_update_elfcorehdr(void)
{
int rc = 0;

crash_hotplug_lock();
/* Obtain lock while reading crash information */
if (!kexec_trylock()) {
pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
crash_hotplug_unlock();
return 0;
}
if (kexec_crash_image) {

@@ -761,6 +774,7 @@ int crash_check_update_elfcorehdr(void)
}
/* Release lock now that update complete */
kexec_unlock();
crash_hotplug_unlock();

return rc;
}

@@ -783,9 +797,11 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
{
struct kimage *image;

crash_hotplug_lock();
/* Obtain lock while changing crash information */
if (!kexec_trylock()) {
pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
crash_hotplug_unlock();
return;
}

@@ -852,6 +868,7 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
out:
/* Release lock now that update complete */
kexec_unlock();
crash_hotplug_unlock();
}

static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)

@@ -399,14 +399,13 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
}

mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area),
default_nareas), SMP_CACHE_BYTES);
nareas), SMP_CACHE_BYTES);
if (!mem->areas) {
pr_warn("%s: Failed to allocate mem->areas.\n", __func__);
return;
}

swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false,
default_nareas);
swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false, nareas);
add_mem_pool(&io_tlb_default_mem, mem);

if (flags & SWIOTLB_VERBOSE)

@@ -729,9 +728,6 @@ static void swiotlb_dyn_alloc(struct work_struct *work)
}

add_mem_pool(mem, pool);

/* Pairs with smp_rmb() in is_swiotlb_buffer(). */
smp_wmb();
}

/**

@@ -1152,9 +1148,26 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags);

found:
dev->dma_uses_io_tlb = true;
/* Pairs with smp_rmb() in is_swiotlb_buffer() */
smp_wmb();
WRITE_ONCE(dev->dma_uses_io_tlb, true);

/*
* The general barrier orders reads and writes against a presumed store
* of the SWIOTLB buffer address by a device driver (to a driver private
* data structure). It serves two purposes.
*
* First, the store to dev->dma_uses_io_tlb must be ordered before the
* presumed store. This guarantees that the returned buffer address
* cannot be passed to another CPU before updating dev->dma_uses_io_tlb.
*
* Second, the load from mem->pools must be ordered before the same
* presumed store. This guarantees that the returned buffer address
* cannot be observed by another CPU before an update of the RCU list
* that was made by swiotlb_dyn_alloc() on a third CPU (cf. multicopy
* atomicity).
*
* See also the comment in is_swiotlb_buffer().
*/
smp_mb();

*retpool = pool;
return index;

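Editor's note (not part of the commit): the last hunk replaces a write barrier before the flag update with WRITE_ONCE() plus a later full barrier, and the new comment lists the two orderings that barrier must provide. The first of them is the classic publish/observe pairing; a userspace analogue is below (an illustration of the ordering argument only, not the kernel code — the second requirement, ordering the earlier load from mem->pools before the presumed store, is what pushes the kernel to a full smp_mb() rather than a plain store-release):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Thread A stands in for swiotlb_find_slots() plus the driver storing the
     * returned address; thread B stands in for is_swiotlb_buffer().
     */
    static _Atomic bool uses_io_tlb;            /* models dev->dma_uses_io_tlb */
    static _Atomic uintptr_t driver_private;    /* models the "presumed store" */

    static void thread_a(uintptr_t tlb_addr)
    {
        atomic_store_explicit(&uses_io_tlb, true, memory_order_relaxed);
        /* Full fence: the flag store must not be reordered after the address
         * store below (the role played by the new smp_mb()).
         */
        atomic_thread_fence(memory_order_seq_cst);
        atomic_store_explicit(&driver_private, tlb_addr, memory_order_relaxed);
    }

    static bool thread_b(void)
    {
        uintptr_t addr = atomic_load_explicit(&driver_private, memory_order_relaxed);

        if (!addr)
            return false;
        /* Read fence: pairs with the fence in thread_a(), so seeing the address
         * implies uses_io_tlb is already observed as true.
         */
        atomic_thread_fence(memory_order_acquire);
        return atomic_load_explicit(&uses_io_tlb, memory_order_relaxed);
    }

    int main(void)
    {
        thread_a(0x1000);
        printf("%d\n", thread_b());   /* 1 once the address is visible */
        return 0;
    }
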
@@ -609,7 +609,7 @@ int pidfd_create(struct pid *pid, unsigned int flags)
}

/**
* pidfd_open() - Open new pid file descriptor.
* sys_pidfd_open() - Open new pid file descriptor.
*
* @pid: pid for which to retrieve a pidfd
* @flags: flags to pass

@@ -2647,7 +2647,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm,
memory_bm_free(bm, PG_UNSAFE_KEEP);

/* Make a copy of zero_bm so it can be created in safe pages */
error = memory_bm_create(&tmp, GFP_ATOMIC, PG_ANY);
error = memory_bm_create(&tmp, GFP_ATOMIC, PG_SAFE);
if (error)
goto Free;

@@ -2660,7 +2660,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm,
goto Free;

duplicate_memory_bitmap(zero_bm, &tmp);
memory_bm_free(&tmp, PG_UNSAFE_KEEP);
memory_bm_free(&tmp, PG_UNSAFE_CLEAR);
/* At this point zero_bm is in safe pages and it can be used for restoring. */

if (nr_highmem > 0) {

@@ -9307,7 +9307,7 @@ void __init init_idle(struct task_struct *idle, int cpu)
* PF_KTHREAD should already be set at this point; regardless, make it
* look like a proper per-CPU kthread.
*/
idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY;
kthread_set_per_cpu(idle, cpu);

#ifdef CONFIG_SMP

@@ -350,7 +350,8 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
* Except when the rq is capped by uclamp_max.
*/
if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) {
sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq &&
!sg_policy->need_freq_update) {
next_f = sg_policy->next_freq;

/* Restore cached freq as next_freq has changed */

@@ -101,6 +101,7 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,

if (lowest_mask) {
cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);

/*
* We have to ensure that we have at least one bit

@@ -664,6 +664,10 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
}

/*
* Specifically: avg_runtime() + 0 must result in entity_eligible() := true
* For this to be so, the result of this function must have a left bias.
*/
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;

@@ -677,8 +681,12 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
load += weight;
}

if (load)
if (load) {
/* sign flips effective floor / ceil */
if (avg < 0)
avg -= (load - 1);
avg = div_s64(avg, load);
}

return cfs_rq->min_vruntime + avg;
}

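Editor's note (not part of the commit): div_s64() truncates toward zero, so for a negative weighted sum it effectively rounds up, which breaks the "left bias" the comment above asks for (avg_vruntime() + 0 must keep entity_eligible() true). Subtracting (load - 1) first turns the truncation into a floor for negative numerators, exactly as the hunk does. A standalone arithmetic check in plain C:

    #include <stdio.h>
    #include <stdint.h>

    /* Truncating signed division, the way div_s64() behaves. */
    static int64_t div_trunc(int64_t a, int64_t b)
    {
        return a / b;
    }

    /* Floor division built the same way as the hunk above: bias negative
     * numerators down by (b - 1) before truncating.
     */
    static int64_t div_floor(int64_t a, int64_t b)
    {
        if (a < 0)
            a -= (b - 1);
        return a / b;
    }

    int main(void)
    {
        printf("trunc(-7/4) = %lld\n", (long long)div_trunc(-7, 4));   /* -1 */
        printf("floor(-7/4) = %lld\n", (long long)div_floor(-7, 4));   /* -2 */
        printf("trunc( 7/4) = %lld\n", (long long)div_trunc(7, 4));    /*  1 */
        printf("floor( 7/4) = %lld\n", (long long)div_floor(7, 4));    /*  1 */
        return 0;
    }
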
@@ -4919,10 +4927,12 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u64 vslice = calc_delta_fair(se->slice, se);
u64 vruntime = avg_vruntime(cfs_rq);
u64 vslice, vruntime = avg_vruntime(cfs_rq);
s64 lag = 0;

se->slice = sysctl_sched_base_slice;
vslice = calc_delta_fair(se->slice, se);

/*
* Due to how V is constructed as the weighted average of entities,
* adding tasks with positive lag, or removing tasks with negative lag

@@ -373,6 +373,7 @@ EXPORT_SYMBOL_GPL(play_idle_precise);

void cpu_startup_entry(enum cpuhp_state state)
{
current->flags |= PF_IDLE;
arch_cpu_idle_prepare();
cpuhp_online_idle(state);
while (1)

@@ -78,6 +78,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
* task_work_cancel_match - cancel a pending work added by task_work_add()
* @task: the task which should execute the work
* @match: match function to call
* @data: data to be passed in to match function
*
* RETURNS:
* The found work or NULL if not found.

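Editor's note (not part of the commit): the added @data line completes the parameter list for task_work_cancel_match(). For context, a kernel-style sketch of using the interface this kernel-doc describes — it mirrors how task_work_cancel() is built on top of task_work_cancel_match() in kernel/task_work.c; the wrapper name below is invented:

    #include <linux/task_work.h>
    #include <linux/sched.h>
    #include <linux/printk.h>

    /* Match a pending callback by its function pointer, passed via @data. */
    static bool match_func(struct callback_head *cb, void *data)
    {
        return cb->func == data;
    }

    static void cancel_my_work(struct task_struct *task, task_work_func_t func)
    {
        struct callback_head *work;

        work = task_work_cancel_match(task, match_func, func);
        if (work)
            pr_debug("cancelled pending work %ps\n", work->func);
    }
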
@@ -2853,6 +2853,17 @@ static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u3
return arr.mods_cnt;
}

static int addrs_check_error_injection_list(unsigned long *addrs, u32 cnt)
{
u32 i;

for (i = 0; i < cnt; i++) {
if (!within_error_injection_list(addrs[i]))
return -EINVAL;
}
return 0;
}

int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
struct bpf_kprobe_multi_link *link = NULL;

@@ -2930,6 +2941,11 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
goto error;
}

if (prog->kprobe_override && addrs_check_error_injection_list(addrs, cnt)) {
err = -EINVAL;
goto error;
}

link = kzalloc(sizeof(*link), GFP_KERNEL);
if (!link) {
err = -ENOMEM;

@@ -3207,8 +3223,10 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
rcu_read_lock();
task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
rcu_read_unlock();
if (!task)
if (!task) {
err = -ESRCH;
goto error_path_put;
}
}

err = -ENOMEM;

@@ -354,6 +354,11 @@ static void rb_init_page(struct buffer_data_page *bpage)
local_set(&bpage->commit, 0);
}

static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
{
return local_read(&bpage->page->commit);
}

static void free_buffer_page(struct buffer_page *bpage)
{
free_page((unsigned long)bpage->page);

@@ -1132,6 +1137,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
if (full) {
poll_wait(filp, &work->full_waiters, poll_table);
work->full_waiters_pending = true;
if (!cpu_buffer->shortest_full ||
cpu_buffer->shortest_full > full)
cpu_buffer->shortest_full = full;
} else {
poll_wait(filp, &work->waiters, poll_table);
work->waiters_pending = true;

@@ -2003,7 +2011,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
* Increment overrun to account for the lost events.
*/
local_add(page_entries, &cpu_buffer->overrun);
local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
local_sub(rb_page_commit(to_remove_page), &cpu_buffer->entries_bytes);
local_inc(&cpu_buffer->pages_lost);
}

@@ -2367,11 +2375,6 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->reader_page->read);
}

static __always_inline unsigned rb_page_commit(struct buffer_page *bpage)
{
return local_read(&bpage->page->commit);
}

static struct ring_buffer_event *
rb_iter_head_event(struct ring_buffer_iter *iter)
{

@@ -2517,7 +2520,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
* the counters.
*/
local_add(entries, &cpu_buffer->overrun);
local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes);
local_inc(&cpu_buffer->pages_lost);

/*

@@ -2660,9 +2663,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,

event = __rb_page_index(tail_page, tail);

/* account for padding bytes */
local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);

/*
* Save the original length to the meta data.
* This will be used by the reader to add lost event

@@ -2676,7 +2676,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
* write counter enough to allow another writer to slip
* in on this page.
* We put in a discarded commit instead, to make sure
* that this space is not used again.
* that this space is not used again, and this space will
* not be accounted into 'entries_bytes'.
*
* If we are less than the minimum size, we don't need to
* worry about it.

@@ -2701,6 +2702,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* time delta must be non zero */
event->time_delta = 1;

/* account for padding bytes */
local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);

/* Make sure the padding is visible before the tail_page->write update */
smp_wmb();

@@ -4215,7 +4219,7 @@ u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu)
EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);

/**
* ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
* ring_buffer_bytes_cpu - get the number of bytes unconsumed in a cpu buffer
* @buffer: The ring buffer
* @cpu: The per CPU buffer to read from.
*/

@@ -4723,6 +4727,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)

length = rb_event_length(event);
cpu_buffer->reader_page->read += length;
cpu_buffer->read_bytes += length;
}

static void rb_advance_iter(struct ring_buffer_iter *iter)

@@ -5816,7 +5821,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
} else {
/* update the entry counter */
cpu_buffer->read += rb_page_entries(reader);
cpu_buffer->read_bytes += BUF_PAGE_SIZE;
cpu_buffer->read_bytes += rb_page_commit(reader);

/* swap the pages */
rb_init_page(bpage);

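Editor's note (not part of the commit): the ring-buffer hunks above share one theme: entries_bytes and read_bytes should count committed event bytes, so pages that are overwritten, removed or read out are now charged by rb_page_commit() rather than by the fixed BUF_PAGE_SIZE, and the tail padding is only added once the discarded-commit case has been excluded. ring_buffer_bytes_cpu(), whose kernel-doc above now says "unconsumed", reports essentially the difference between the two counters, which is why over-charging either side skews the statistic. A toy model of that bookkeeping (made-up sizes, not the kernel's structures):

    #include <stdio.h>

    #define BUF_PAGE_SIZE 4096   /* illustrative page size */

    static long entries_bytes;   /* bytes committed by writers */
    static long read_bytes;      /* bytes handed to readers */

    static long bytes_cpu(void)  /* "unconsumed" bytes, roughly as reported */
    {
        return entries_bytes - read_bytes;
    }

    int main(void)
    {
        unsigned int commit = 1800;   /* a page that is only partly full */

        entries_bytes += commit;      /* writer commits 1800 bytes */

        /* Charging the whole page on the read side leaves a bogus negative
         * number of unconsumed bytes ...
         */
        printf("full-page charge: %ld\n", entries_bytes - (read_bytes + BUF_PAGE_SIZE));

        /* ... charging only the committed bytes brings the statistic back to
         * zero once the page has been consumed.
         */
        read_bytes += commit;
        printf("commit-sized charge: %ld\n", bytes_cpu());
        return 0;
    }
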
@@ -2770,6 +2770,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
update_event_fields(call, map[i]);
}
}
cond_resched();
}
up_write(&trace_event_sem);
}

@@ -127,8 +127,13 @@ struct user_event_enabler {
/* Bit 7 is for freeing status of enablement */
#define ENABLE_VAL_FREEING_BIT 7

/* Only duplicate the bit value */
#define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK
/* Bit 8 is for marking 32-bit on 64-bit */
#define ENABLE_VAL_32_ON_64_BIT 8

#define ENABLE_VAL_COMPAT_MASK (1 << ENABLE_VAL_32_ON_64_BIT)

/* Only duplicate the bit and compat values */
#define ENABLE_VAL_DUP_MASK (ENABLE_VAL_BIT_MASK | ENABLE_VAL_COMPAT_MASK)

#define ENABLE_BITOPS(e) (&(e)->values)

@@ -174,6 +179,30 @@ struct user_event_validator {
int flags;
};

static inline void align_addr_bit(unsigned long *addr, int *bit,
unsigned long *flags)
{
if (IS_ALIGNED(*addr, sizeof(long))) {
#ifdef __BIG_ENDIAN
/* 32 bit on BE 64 bit requires a 32 bit offset when aligned. */
if (test_bit(ENABLE_VAL_32_ON_64_BIT, flags))
*bit += 32;
#endif
return;
}

*addr = ALIGN_DOWN(*addr, sizeof(long));

/*
* We only support 32 and 64 bit values. The only time we need
* to align is a 32 bit value on a 64 bit kernel, which on LE
* is always 32 bits, and on BE requires no change when unaligned.
*/
#ifdef __LITTLE_ENDIAN
*bit += 32;
#endif
}

typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
void *tpdata, bool *faulted);

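Editor's note (not part of the commit): align_addr_bit() above converts a user-supplied (address, bit) pair that may refer to a 32-bit enable word into the long-aligned pair the bitops need. A runnable check of the little-endian, unaligned case (userspace stand-ins for IS_ALIGNED/ALIGN_DOWN; the big-endian branch guarded by ENABLE_VAL_32_ON_64_BIT is omitted and the sample address is invented):

    #include <stdio.h>

    #define IS_ALIGNED(x, a)  (((x) & ((a) - 1)) == 0)
    #define ALIGN_DOWN(x, a)  ((x) & ~((unsigned long)(a) - 1))

    /* 64-bit little-endian version of the fix-up above. */
    static void align_addr_bit(unsigned long *addr, int *bit)
    {
        if (IS_ALIGNED(*addr, sizeof(long)))
            return;                    /* aligned words need no bit change on LE */

        *addr = ALIGN_DOWN(*addr, sizeof(long));
        *bit += 32;                    /* the 32-bit word is the upper half of the long */
    }

    int main(void)
    {
        unsigned long addr = 0x1004;   /* 4-byte-aligned enable word from a 32-bit task */
        int bit = 5;

        align_addr_bit(&addr, &bit);
        printf("addr 0x%lx, bit %d\n", addr, bit);   /* addr 0x1000, bit 37 */
        return 0;
    }
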
@@ -482,6 +511,7 @@ static int user_event_enabler_write(struct user_event_mm *mm,
unsigned long *ptr;
struct page *page;
void *kaddr;
int bit = ENABLE_BIT(enabler);
int ret;

lockdep_assert_held(&event_mutex);

@@ -497,6 +527,8 @@ static int user_event_enabler_write(struct user_event_mm *mm,
test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))))
return -EBUSY;

align_addr_bit(&uaddr, &bit, ENABLE_BITOPS(enabler));

ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
&page, NULL);

@@ -515,9 +547,9 @@ static int user_event_enabler_write(struct user_event_mm *mm,

/* Update bit atomically, user tracers must be atomic as well */
if (enabler->event && enabler->event->status)
set_bit(ENABLE_BIT(enabler), ptr);
set_bit(bit, ptr);
else
clear_bit(ENABLE_BIT(enabler), ptr);
clear_bit(bit, ptr);

kunmap_local(kaddr);
unpin_user_pages_dirty_lock(&page, 1, true);

@@ -849,6 +881,12 @@ static struct user_event_enabler
enabler->event = user;
enabler->addr = uaddr;
enabler->values = reg->enable_bit;

#if BITS_PER_LONG >= 64
if (reg->enable_size == 4)
set_bit(ENABLE_VAL_32_ON_64_BIT, ENABLE_BITOPS(enabler));
#endif

retry:
/* Prevents state changes from racing with new enablers */
mutex_lock(&event_mutex);

@@ -2377,7 +2415,8 @@ static long user_unreg_get(struct user_unreg __user *ureg,
}

static int user_event_mm_clear_bit(struct user_event_mm *user_mm,
unsigned long uaddr, unsigned char bit)
unsigned long uaddr, unsigned char bit,
unsigned long flags)
{
struct user_event_enabler enabler;
int result;

@@ -2385,7 +2424,7 @@ static int user_event_mm_clear_bit(struct user_event_mm *user_mm,

memset(&enabler, 0, sizeof(enabler));
enabler.addr = uaddr;
enabler.values = bit;
enabler.values = bit | flags;
retry:
/* Prevents state changes from racing with new enablers */
mutex_lock(&event_mutex);

@@ -2415,6 +2454,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
struct user_event_mm *mm = current->user_event_mm;
struct user_event_enabler *enabler, *next;
struct user_unreg reg;
unsigned long flags;
long ret;

ret = user_unreg_get(ureg, &reg);

@@ -2425,6 +2465,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
if (!mm)
return -ENOENT;

flags = 0;
ret = -ENOENT;

/*

@@ -2441,6 +2482,9 @@ static long user_events_ioctl_unreg(unsigned long uarg)
ENABLE_BIT(enabler) == reg.disable_bit) {
set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler));

/* We must keep compat flags for the clear */
flags |= enabler->values & ENABLE_VAL_COMPAT_MASK;

if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)))
user_event_enabler_destroy(enabler, true);

@@ -2454,7 +2498,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
/* Ensure bit is now cleared for user, regardless of event status */
if (!ret)
ret = user_event_mm_clear_bit(mm, reg.disable_addr,
reg.disable_bit);
reg.disable_bit, flags);

return ret;
}

@@ -6535,9 +6535,6 @@ void __init workqueue_init_early(void)

BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE));

wq_update_pod_attrs_buf = alloc_workqueue_attrs();
BUG_ON(!wq_update_pod_attrs_buf);

pt->nr_pods = 1;
cpumask_copy(pt->pod_cpus[0], cpu_possible_mask);
pt->pod_node[0] = NUMA_NO_NODE;

@@ -6605,13 +6602,13 @@ static void __init wq_cpu_intensive_thresh_init(void)
unsigned long thresh;
unsigned long bogo;

pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");
BUG_ON(IS_ERR(pwq_release_worker));

/* if the user set it to a specific value, keep it */
if (wq_cpu_intensive_thresh_us != ULONG_MAX)
return;

pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");
BUG_ON(IS_ERR(pwq_release_worker));

/*
* The default of 10ms is derived from the fact that most modern (as of
* 2023) processors can do a lot in 10ms and that it's just below what