From 813f336269e629da5d9c86a8098d6bee3d84680e Mon Sep 17 00:00:00 2001 From: Shung-Hsi Yu Date: Wed, 15 Apr 2026 20:03:28 +0800 Subject: [PATCH 01/32] selftests/bpf: Fix timer_start_deadlock failure due to hrtimer change Since commit f2e388a019e4 ("hrtimer: Reduce trace noise in hrtimer_start()"), hrtimer_cancel tracepoint is no longer called when a hrtimer is re-armed. So instead of a hrtimer_cancel followed by hrtimer_start tracepoint events, there is now only a single hrtimer_start tracepoint event with the new was_armed field set to 1, to indicate that the hrtimer was previously armed. Update timer_start_deadlock accordingly so it traces hrtimer_start tracepoint instead, with was_armed used as guard. Signed-off-by: Shung-Hsi Yu Tested-by: Mykyta Yatsenko Acked-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260415120329.129192-1-shung-hsi.yu@suse.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/timer_start_deadlock.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/timer_start_deadlock.c b/tools/testing/selftests/bpf/progs/timer_start_deadlock.c index 019518ee18cd..afabd15bdac4 100644 --- a/tools/testing/selftests/bpf/progs/timer_start_deadlock.c +++ b/tools/testing/selftests/bpf/progs/timer_start_deadlock.c @@ -27,13 +27,13 @@ static int timer_cb(void *map, int *key, struct elem *value) return 0; } -SEC("tp_btf/hrtimer_cancel") -int BPF_PROG(tp_hrtimer_cancel, struct hrtimer *hrtimer) +SEC("tp_btf/hrtimer_start") +int BPF_PROG(tp_hrtimer_start, struct hrtimer *hrtimer, enum hrtimer_mode mode, bool was_armed) { struct bpf_timer *timer; int key = 0; - if (!in_timer_start) + if (!in_timer_start || !was_armed) return 0; tp_called = 1; @@ -60,7 +60,7 @@ int start_timer(void *ctx) /* * call hrtimer_start() twice, so that 2nd call does - * remove_hrtimer() and trace_hrtimer_cancel() tracepoint. + * trace_hrtimer_start(was_armed=1) tracepoint. 
*/ in_timer_start = 1; bpf_timer_start(timer, 1000000000, 0); From ecdd4fd8a54ca4679ab8676674a2388ea37eee1a Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 13 Apr 2026 16:30:52 -0700 Subject: [PATCH 02/32] bpf: fix arg tracking for imprecise/multi-offset BPF_ST/STX BPF_STX through ARG_IMPRECISE dst should be recognized as a local spill and join at_stack with the written value. For example, consider the following situation: // r1 = ARG_IMPRECISE{mask=BIT(0)|BIT(1)} *(u64 *)(r1 + 0) = r8 Here the analysis should produce an equivalent of at_stack[*] = join(old, r8) BPF_ST through multi-offset or imprecise dst should join at_stack with none instead of overwriting the slots. For example, consider the following situation: // r1 = ARG_IMPRECISE{mask=BIT(0)|BIT(1)} *(u64 *)(r1 + 0) = 0 Here the analysis should produce an equivalent of at_stack[*r1] = join(old, none). Move the definition of the clear_overlapping_stack_slots() in order to have __arg_track_join() visible. Remove the OFF_IMPRECISE constant to avoid having two ways to express imprecise offset. Only 'offset-imprecise {frame=N, cnt=0}' remains. 
Fixes: bf0c571f7feb ("bpf: introduce forward arg-tracking dataflow analysis") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260413-stacklive-fixes-v2-1-398e126e5cf3@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/liveness.c | 112 +++++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 51 deletions(-) diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 1fb4c511db5a..332e6e003f27 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -574,7 +574,7 @@ static int print_instances(struct bpf_verifier_env *env) * * precise {frame=N, off=V} -- known absolute frame index and byte offset * | - * offset-imprecise {frame=N, off=OFF_IMPRECISE} + * offset-imprecise {frame=N, cnt=0} * | -- known frame identity, unknown offset * fully-imprecise {frame=ARG_IMPRECISE, mask=bitmask} * -- unknown frame identity; .mask is a @@ -607,8 +607,6 @@ enum arg_track_state { ARG_IMPRECISE = -3, /* lost identity; .mask is arg bitmask */ }; -#define OFF_IMPRECISE S16_MIN /* arg identity known but offset unknown */ - /* Track callee stack slots fp-8 through fp-512 (64 slots of 8 bytes each) */ #define MAX_ARG_SPILL_SLOTS 64 @@ -622,28 +620,6 @@ static bool arg_is_fp(const struct arg_track *at) return at->frame >= 0 || at->frame == ARG_IMPRECISE; } -/* - * Clear all tracked callee stack slots overlapping the byte range - * [off, off+sz-1] where off is a negative FP-relative offset. 
- */ -static void clear_overlapping_stack_slots(struct arg_track *at_stack, s16 off, u32 sz) -{ - struct arg_track none = { .frame = ARG_NONE }; - - if (off == OFF_IMPRECISE) { - for (int i = 0; i < MAX_ARG_SPILL_SLOTS; i++) - at_stack[i] = none; - return; - } - for (int i = 0; i < MAX_ARG_SPILL_SLOTS; i++) { - int slot_start = -((i + 1) * 8); - int slot_end = slot_start + 8; - - if (slot_start < off + (int)sz && slot_end > off) - at_stack[i] = none; - } -} - static void verbose_arg_track(struct bpf_verifier_env *env, struct arg_track *at) { int i; @@ -863,16 +839,13 @@ static void arg_track_alu64(struct arg_track *dst, const struct arg_track *src) *dst = arg_join_imprecise(*dst, *src); } -static s16 arg_add(s16 off, s64 delta) +static bool arg_add(s16 off, s64 delta, s16 *out) { - s64 res; + s16 d = delta; - if (off == OFF_IMPRECISE) - return OFF_IMPRECISE; - res = (s64)off + delta; - if (res < S16_MIN + 1 || res > S16_MAX) - return OFF_IMPRECISE; - return res; + if (d != delta) + return true; + return check_add_overflow(off, d, out); } static void arg_padd(struct arg_track *at, s64 delta) @@ -882,9 +855,9 @@ static void arg_padd(struct arg_track *at, s64 delta) if (at->off_cnt == 0) return; for (i = 0; i < at->off_cnt; i++) { - s16 new_off = arg_add(at->off[i], delta); + s16 new_off; - if (new_off == OFF_IMPRECISE) { + if (arg_add(at->off[i], delta, &new_off)) { at->off_cnt = 0; return; } @@ -899,8 +872,6 @@ static void arg_padd(struct arg_track *at, s64 delta) */ static int fp_off_to_slot(s16 off) { - if (off == OFF_IMPRECISE) - return -1; if (off >= 0 || off < -(int)(MAX_ARG_SPILL_SLOTS * 8)) return -1; if (off % 8) @@ -930,9 +901,11 @@ static struct arg_track fill_from_stack(struct bpf_insn *insn, return imp; for (i = 0; i < cnt; i++) { - s16 fp_off = arg_add(at_out[reg].off[i], insn->off); - int slot = fp_off_to_slot(fp_off); + s16 fp_off, slot; + if (arg_add(at_out[reg].off[i], insn->off, &fp_off)) + return imp; + slot = fp_off_to_slot(fp_off); if (slot < 0) 
return imp; result = __arg_track_join(result, at_stack_out[slot]); @@ -968,9 +941,12 @@ static void spill_to_stack(struct bpf_insn *insn, struct arg_track *at_out, return; } for (i = 0; i < cnt; i++) { - s16 fp_off = arg_add(at_out[reg].off[i], insn->off); - int slot = fp_off_to_slot(fp_off); + s16 fp_off; + int slot; + if (arg_add(at_out[reg].off[i], insn->off, &fp_off)) + continue; + slot = fp_off_to_slot(fp_off); if (slot < 0) continue; if (cnt == 1) @@ -980,6 +956,32 @@ static void spill_to_stack(struct bpf_insn *insn, struct arg_track *at_out, } } +/* + * Clear all tracked callee stack slots overlapping the byte range + * [off, off+sz-1] where off is a negative FP-relative offset. + */ +static void clear_overlapping_stack_slots(struct arg_track *at_stack, s16 off, u32 sz, int cnt) +{ + struct arg_track none = { .frame = ARG_NONE }; + + if (cnt == 0) { + for (int i = 0; i < MAX_ARG_SPILL_SLOTS; i++) + at_stack[i] = __arg_track_join(at_stack[i], none); + return; + } + for (int i = 0; i < MAX_ARG_SPILL_SLOTS; i++) { + int slot_start = -((i + 1) * 8); + int slot_end = slot_start + 8; + + if (slot_start < off + (int)sz && slot_end > off) { + if (cnt == 1) + at_stack[i] = none; + else + at_stack[i] = __arg_track_join(at_stack[i], none); + } + } +} + /* * Clear stack slots overlapping all possible FP offsets in @reg. 
*/ @@ -990,18 +992,22 @@ static void clear_stack_for_all_offs(struct bpf_insn *insn, int cnt, i; if (reg == BPF_REG_FP) { - clear_overlapping_stack_slots(at_stack_out, insn->off, sz); + clear_overlapping_stack_slots(at_stack_out, insn->off, sz, 1); return; } cnt = at_out[reg].off_cnt; if (cnt == 0) { - clear_overlapping_stack_slots(at_stack_out, OFF_IMPRECISE, sz); + clear_overlapping_stack_slots(at_stack_out, 0, sz, cnt); return; } for (i = 0; i < cnt; i++) { - s16 fp_off = arg_add(at_out[reg].off[i], insn->off); + s16 fp_off; - clear_overlapping_stack_slots(at_stack_out, fp_off, sz); + if (arg_add(at_out[reg].off[i], insn->off, &fp_off)) { + clear_overlapping_stack_slots(at_stack_out, 0, sz, 0); + break; + } + clear_overlapping_stack_slots(at_stack_out, fp_off, sz, cnt); } } @@ -1042,6 +1048,12 @@ static void arg_track_log(struct bpf_verifier_env *env, struct bpf_insn *insn, i verbose(env, "\n"); } +static bool can_be_local_fp(int depth, int regno, struct arg_track *at) +{ + return regno == BPF_REG_FP || at->frame == depth || + (at->frame == ARG_IMPRECISE && (at->mask & BIT(depth))); +} + /* * Pure dataflow transfer function for arg_track state. * Updates at_out[] based on how the instruction modifies registers. 
@@ -1111,8 +1123,7 @@ static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn, at_out[r] = none; } else if (class == BPF_LDX) { u32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code)); - bool src_is_local_fp = insn->src_reg == BPF_REG_FP || src->frame == depth || - (src->frame == ARG_IMPRECISE && (src->mask & BIT(depth))); + bool src_is_local_fp = can_be_local_fp(depth, insn->src_reg, src); /* * Reload from callee stack: if src is current-frame FP-derived @@ -1147,7 +1158,7 @@ static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn, bool dst_is_local_fp; /* Track spills to current-frame FP-derived callee stack */ - dst_is_local_fp = insn->dst_reg == BPF_REG_FP || dst->frame == depth; + dst_is_local_fp = can_be_local_fp(depth, insn->dst_reg, dst); if (dst_is_local_fp && BPF_MODE(insn->code) == BPF_MEM) spill_to_stack(insn, at_out, insn->dst_reg, at_stack_out, src, sz); @@ -1166,7 +1177,7 @@ static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn, } } else if (class == BPF_ST && BPF_MODE(insn->code) == BPF_MEM) { u32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code)); - bool dst_is_local_fp = insn->dst_reg == BPF_REG_FP || dst->frame == depth; + bool dst_is_local_fp = can_be_local_fp(depth, insn->dst_reg, dst); /* BPF_ST to FP-derived dst: clear overlapping stack slots */ if (dst_is_local_fp) @@ -1316,8 +1327,7 @@ static int record_load_store_access(struct bpf_verifier_env *env, resolved.off_cnt = ptr->off_cnt; resolved.frame = ptr->frame; for (oi = 0; oi < ptr->off_cnt; oi++) { - resolved.off[oi] = arg_add(ptr->off[oi], insn->off); - if (resolved.off[oi] == OFF_IMPRECISE) { + if (arg_add(ptr->off[oi], insn->off, &resolved.off[oi])) { resolved.off_cnt = 0; break; } From d97cc8fc997c77234580c77b21466164ff71307a Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 13 Apr 2026 16:30:53 -0700 Subject: [PATCH 03/32] selftests/bpf: arg tracking for imprecise/multi-offset BPF_ST/STX Add test cases for 
clear_stack_for_all_offs and dst_is_local_fp handling of multi-offset and ARG_IMPRECISE stack pointers: - st_imm_join_with_multi_off: BPF_ST through multi-offset dst should join at_stack with none instead of overwriting both candidate slots. - st_imm_join_with_imprecise_off: BPF_ST through offset-imprecise dst should join at_stack with none instead of clearing all slots. - st_imm_join_with_single_off: a canary checking that BPF_ST with a known offset overwrites slot instead of joining. - imprecise_dst_spill_join: BPF_STX through ARG_IMPRECISE dst should be recognized as a local spill and join at_stack with the written value. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260413-stacklive-fixes-v2-2-398e126e5cf3@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_live_stack.c | 193 ++++++++++++++++++ 1 file changed, 193 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_live_stack.c b/tools/testing/selftests/bpf/progs/verifier_live_stack.c index b7a9fa10e84d..401152b2b64f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_live_stack.c +++ b/tools/testing/selftests/bpf/progs/verifier_live_stack.c @@ -2647,3 +2647,196 @@ __naked void spill_join_with_imprecise_off(void) "exit;" ::: __clobber_all); } + +/* + * Same as spill_join_with_multi_off but the write is BPF_ST (store + * immediate) instead of BPF_STX. BPF_ST goes through + * clear_stack_for_all_offs() rather than spill_to_stack(), and that + * path also needs to join instead of overwriting. 
+ * + * fp-8 = &fp-24 + * fp-16 = &fp-32 + * r1 = fp-8 or fp-16 (two offsets from branch) + * *(u64 *)(r1 + 0) = 0 -- BPF_ST with immediate + * r0 = *(u64 *)(r10 - 16) -- fill from fp-16 + * r0 = *(u64 *)(r0 + 0) -- deref: should produce use + */ +SEC("socket") +__log_level(2) +__failure +__msg("15: (7a) *(u64 *)(r1 +0) = 0 fp-8: fp0-24 -> fp0-24|fp0+0 fp-16: fp0-32 -> fp0-32|fp0+0") +__msg("17: (79) r0 = *(u64 *)(r0 +0) ; use: fp0-32") +__naked void st_imm_join_with_multi_off(void) +{ + asm volatile ( + "*(u64 *)(r10 - 24) = 0;" + "*(u64 *)(r10 - 32) = 0;" + "r1 = r10;" + "r1 += -24;" + "*(u64 *)(r10 - 8) = r1;" + "r1 = r10;" + "r1 += -32;" + "*(u64 *)(r10 - 16) = r1;" + /* create r1 with two candidate offsets: fp-8 or fp-16 */ + "call %[bpf_get_prandom_u32];" + "if r0 == 0 goto 1f;" + "r1 = r10;" + "r1 += -8;" + "goto 2f;" +"1:" + "r1 = r10;" + "r1 += -16;" +"2:" + /* BPF_ST: store immediate through multi-offset r1 */ + "*(u64 *)(r1 + 0) = 0;" + /* read back fp-16 and deref */ + "r0 = *(u64 *)(r10 - 16);" + "r0 = *(u64 *)(r0 + 0);" + "r0 = 0;" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* + * Check that BPF_ST with a known offset fully overwrites stack slot + * from the arg tracking point of view. + */ +SEC("socket") +__log_level(2) +__success +__msg("5: (7a) *(u64 *)(r1 +0) = 0 fp-8: fp0-16 -> _{{$}}") +__naked void st_imm_join_with_single_off(void) +{ + asm volatile ( + "r2 = r10;" + "r2 += -16;" + "*(u64 *)(r10 - 8) = r2;" + "r1 = r10;" + "r1 += -8;" + "*(u64 *)(r1 + 0) = 0;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +/* + * Same as spill_join_with_imprecise_off but the write is BPF_ST. + * Use "r2 = -8; r1 += r2" to make arg tracking lose offset + * precision while the main verifier keeps r1 as fixed-offset. 
+ * + * fp-8 = &fp-24 + * fp-16 = &fp-32 + * r1 = fp-8 (imprecise to arg tracking) + * *(u64 *)(r1 + 0) = 0 -- BPF_ST with immediate + * r0 = *(u64 *)(r10 - 16) -- fill from fp-16 + * r0 = *(u64 *)(r0 + 0) -- deref: should produce use + */ +SEC("socket") +__log_level(2) +__success +__msg("13: (79) r0 = *(u64 *)(r0 +0) ; use: fp0-32") +__naked void st_imm_join_with_imprecise_off(void) +{ + asm volatile ( + "*(u64 *)(r10 - 24) = 0;" + "*(u64 *)(r10 - 32) = 0;" + "r1 = r10;" + "r1 += -24;" + "*(u64 *)(r10 - 8) = r1;" + "r1 = r10;" + "r1 += -32;" + "*(u64 *)(r10 - 16) = r1;" + /* r1 = fp-8 but arg tracking sees off_cnt == 0 */ + "r1 = r10;" + "r2 = -8;" + "r1 += r2;" + /* store immediate through imprecise r1 */ + "*(u64 *)(r1 + 0) = 0;" + /* read back fp-16 */ + "r0 = *(u64 *)(r10 - 16);" + /* deref: should produce use */ + "r0 = *(u64 *)(r0 + 0);" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +/* + * Test that spilling through an ARG_IMPRECISE pointer joins with + * existing at_stack values. Subprog receives r1 = fp0-24 and + * r2 = map_value, creates an ARG_IMPRECISE pointer by joining caller + * and callee FP on two branches. + * + * Setup: callee spills &fp1-16 to fp1-8 (precise, tracked). + * Then writes map_value through ARG_IMPRECISE r1 — on path A + * this hits fp1-8, on path B it hits caller stack. + * Since spill_to_stack is skipped for ARG_IMPRECISE dst, + * fp1-8 tracking isn't joined with none. 
+ * + * Expected after the imprecise write: + * - arg tracking should show fp1-8 = fp1-16|fp1+0 (joined with none) + * - read from fp1-8 and deref should produce use for fp1-16 + * - write through it should NOT produce def for fp1-16 + */ +SEC("socket") +__log_level(2) +__success +__msg("26: (79) r0 = *(u64 *)(r10 -8) // r1=IMP3 r6=fp0-24 r7=fp1-16 fp-8=fp1-16|fp1+0") +__naked void imprecise_dst_spill_join(void) +{ + asm volatile ( + "*(u64 *)(r10 - 24) = 0;" + /* map lookup for a valid non-FP pointer */ + "*(u32 *)(r10 - 32) = 0;" + "r1 = %[map] ll;" + "r2 = r10;" + "r2 += -32;" + "call %[bpf_map_lookup_elem];" + "if r0 == 0 goto 1f;" + /* r1 = &caller_fp-24, r2 = map_value */ + "r1 = r10;" + "r1 += -24;" + "r2 = r0;" + "call imprecise_dst_spill_join_sub;" +"1:" + "r0 = 0;" + "exit;" + :: __imm_addr(map), + __imm(bpf_map_lookup_elem) + : __clobber_all); +} + +static __used __naked void imprecise_dst_spill_join_sub(void) +{ + asm volatile ( + /* r6 = &caller_fp-24 (frame=0), r8 = map_value */ + "r6 = r1;" + "r8 = r2;" + /* spill &fp1-16 to fp1-8: at_stack[0] = fp1-16 */ + "*(u64 *)(r10 - 16) = 0;" + "r7 = r10;" + "r7 += -16;" + "*(u64 *)(r10 - 8) = r7;" + /* branch to create ARG_IMPRECISE pointer */ + "call %[bpf_get_prandom_u32];" + /* path B: r1 = caller fp-24 (frame=0) */ + "r1 = r6;" + "if r0 == 0 goto 1f;" + /* path A: r1 = callee fp-8 (frame=1) */ + "r1 = r10;" + "r1 += -8;" +"1:" + /* r1 = ARG_IMPRECISE{mask=BIT(0)|BIT(1)}. + * Write map_value (non-FP) through r1. On path A this overwrites fp1-8. + * Should join at_stack[0] with none: fp1-16|fp1+0. 
+ */ + "*(u64 *)(r1 + 0) = r8;" + /* read fp1-8: should be fp1-16|fp1+0 (joined) */ + "r0 = *(u64 *)(r10 - 8);" + "*(u64 *)(r0 + 0) = 42;" + "r0 = 0;" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all); +} From 48d83d94930eb4db4c93d2de44838b9455cff626 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Apr 2026 14:14:02 +0200 Subject: [PATCH 04/32] bpf, arm64: Reject out-of-range B.cond targets aarch64_insn_gen_cond_branch_imm() calls label_imm_common() to compute a 19-bit signed byte offset for a conditional branch, but unlike its siblings aarch64_insn_gen_branch_imm() and aarch64_insn_gen_comp_branch_imm(), it does not check whether label_imm_common() returned its out-of-range sentinel (range) before feeding the value to aarch64_insn_encode_immediate(). aarch64_insn_encode_immediate() unconditionally masks the value with the 19-bit field mask, so an offset that was rejected by label_imm_common() gets silently truncated. With the sentinel value SZ_1M, the resulting field ends up with bit 18 (the sign bit of the 19-bit signed displacement) set, and the CPU decodes it as a ~1 MiB *backward* branch, producing an incorrectly targeted B.cond instruction. For code-gen locations like the emit_bpf_tail_call() this function is the only barrier between an overflowing displacement and a silently miscompiled branch. Fix it by returning AARCH64_BREAK_FAULT when the offset is out of range, so callers see a loud failure instead of a silently misencoded branch. validate_code() scans the generated image for any AARCH64_BREAK_FAULT and then lets the JIT fail. 
Fixes: 345e0d35ecdd ("arm64: introduce aarch64_insn_gen_cond_branch_imm()") Fixes: c94ae4f7c5ec ("arm64: insn: remove BUG_ON from codegen") Signed-off-by: Daniel Borkmann Reviewed-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260415121403.639619-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- arch/arm64/lib/insn.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c index cc5b40917d0d..37ce75f7f1f0 100644 --- a/arch/arm64/lib/insn.c +++ b/arch/arm64/lib/insn.c @@ -338,6 +338,8 @@ u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr, long offset; offset = label_imm_common(pc, addr, SZ_1M); + if (offset >= SZ_1M) + return AARCH64_BREAK_FAULT; insn = aarch64_insn_get_bcond_value(); From 1dd8be4ec722ce54e4cace59f3a4ba658111b3ec Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Apr 2026 14:14:03 +0200 Subject: [PATCH 05/32] bpf, arm64: Fix off-by-one in check_imm signed range check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit check_imm(bits, imm) is used in the arm64 BPF JIT to verify that a branch displacement (in arm64 instruction units) fits into the signed N-bit immediate field of a B, B.cond or CBZ/CBNZ encoding before it is handed to the encoder. The macro currently tests for (imm > 0 && imm >> bits) || (imm < 0 && ~imm >> bits) which admits values in [-2^N, 2^N) — effectively a signed (N+1)-bit range. A signed N-bit field only holds [-2^(N-1), 2^(N-1)), so the check admits one extra bit of range on each side. In particular, for check_imm19(), values in [2^18, 2^19) slip past the check but do not fit into the 19-bit signed imm19 field of B.cond. aarch64_insn_encode_immediate() then masks the raw value into the 19-bit field, setting bit 18 (the sign bit) and flipping a forward branch into a backward one. Same class of issue exists for check_imm26() and the B/BL encoding. 
Shift by (bits - 1) instead of bits so the actual signed N-bit range is enforced. Fixes: e54bcde3d69d ("arm64: eBPF JIT compiler") Signed-off-by: Daniel Borkmann Reviewed-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260415121403.639619-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index adf84962d579..4aad9483f8a5 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -35,8 +35,8 @@ #define ARENA_VM_START (MAX_BPF_JIT_REG + 5) #define check_imm(bits, imm) do { \ - if ((((imm) > 0) && ((imm) >> (bits))) || \ - (((imm) < 0) && (~(imm) >> (bits)))) { \ + if ((((imm) > 0) && ((imm) >> ((bits) - 1))) || \ + (((imm) < 0) && (~(imm) >> ((bits) - 1)))) { \ pr_info("[%2d] imm=%d(0x%x) out of range\n", \ i, imm, imm); \ return -EINVAL; \ From 4fddde2a732de60bb97e3307d4eb69ac5f1d2b74 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 13 Apr 2026 12:42:45 -0700 Subject: [PATCH 06/32] bpf: Fix use-after-free in arena_vm_close on fork arena_vm_open() only bumps vml->mmap_count but never registers the child VMA in arena->vma_list. The vml->vma always points at the parent VMA, so after parent munmap the pointer dangles. If the child then calls bpf_arena_free_pages(), zap_pages() reads the stale vml->vma triggering use-after-free. Fix this by preventing the arena VMA from being inherited across fork with VM_DONTCOPY, and preventing VMA splits via the may_split callback. Also reject mremap with a .mremap callback returning -EINVAL. 
A same-size mremap(MREMAP_FIXED) on the full arena VMA reaches copy_vma() through the following path: check_prep_vma() - returns 0 early: new_len == old_len skips VM_DONTEXPAND check prep_move_vma() - vm_start == old_addr and vm_end == old_addr + old_len so may_split is never called move_vma() copy_vma_and_data() copy_vma() vm_area_dup() - copies vm_private_data (vml pointer) vm_ops->open() - bumps vml->mmap_count vm_ops->mremap() - returns -EINVAL, rollback unmaps new VMA The refcount ensures the rollback's arena_vm_close does not free the vml shared with the original VMA. Reported-by: Weiming Shi Reported-by: Xiang Mei Fixes: 317460317a02 ("bpf: Introduce bpf_arena.") Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260413194245.21449-1-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index f355cf1c1a16..9c68c9b0b24a 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -341,6 +341,16 @@ static void arena_vm_open(struct vm_area_struct *vma) refcount_inc(&vml->mmap_count); } +static int arena_vm_may_split(struct vm_area_struct *vma, unsigned long addr) +{ + return -EINVAL; +} + +static int arena_vm_mremap(struct vm_area_struct *vma) +{ + return -EINVAL; +} + static void arena_vm_close(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; @@ -417,6 +427,8 @@ out_unlock_sigsegv: static const struct vm_operations_struct arena_vm_ops = { .open = arena_vm_open, + .may_split = arena_vm_may_split, + .mremap = arena_vm_mremap, .close = arena_vm_close, .fault = arena_vm_fault, }; @@ -486,10 +498,11 @@ static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) arena->user_vm_end = vma->vm_end; /* * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and - * clears VM_MAYEXEC. 
Set VM_DONTEXPAND as well to avoid - * potential change of user_vm_start. + * clears VM_MAYEXEC. Set VM_DONTEXPAND to avoid potential change + * of user_vm_start. Set VM_DONTCOPY to prevent arena VMA from + * being copied into the child process on fork. */ - vm_flags_set(vma, VM_DONTEXPAND); + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTCOPY); vma->vm_ops = &arena_vm_ops; return 0; } From 42f18ae53011826cfd3c84d041817e7f07bc645b Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 13 Apr 2026 12:11:08 -0700 Subject: [PATCH 07/32] bpf, arm64: Remove redundant bpf_flush_icache() after pack allocator finalize bpf_flush_icache() calls flush_icache_range() to clean the data cache and invalidate the instruction cache for the JITed code region. However, since commit 1dad391daef1 ("bpf, arm64: use bpf_prog_pack for memory management"), this flush is redundant. bpf_jit_binary_pack_finalize() copies the JITed instructions to the ROX region via bpf_arch_text_copy() -> aarch64_insn_copy() -> __text_poke(), and __text_poke() already calls flush_icache_range() on the written range. The subsequent bpf_flush_icache() repeats the same cache maintenance on an overlapping range, including an unnecessary second synchronous IPI to all CPUs via kick_all_cpus_sync(). Remove the redundant bpf_flush_icache() call and its now-unused definition. 
Fixes: 1dad391daef1 ("bpf, arm64: use bpf_prog_pack for memory management") Acked-by: Song Liu Signed-off-by: Puranjay Mohan Acked-by: Breno Leitao Link: https://lore.kernel.org/r/20260413191111.3426023-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 4aad9483f8a5..524b67c0867e 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -1961,11 +1960,6 @@ static int validate_ctx(struct jit_ctx *ctx) return 0; } -static inline void bpf_flush_icache(void *start, void *end) -{ - flush_icache_range((unsigned long)start, (unsigned long)end); -} - static void priv_stack_init_guard(void __percpu *priv_stack_ptr, int alloc_size) { int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3; @@ -2204,12 +2198,6 @@ skip_init_ctx: prog = orig_prog; goto out_off; } - /* - * The instructions have now been copied to the ROX region from - * where they will execute. Now the data cache has to be cleaned to - * the PoU and the I-cache has to be invalidated for the VAs. - */ - bpf_flush_icache(ro_header, ctx.ro_image + ctx.idx); } else { jit_data->ctx = ctx; jit_data->ro_image = ro_image_ptr; From 46ee1342b887c9387a933397d846ff6c9584322c Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 13 Apr 2026 12:11:09 -0700 Subject: [PATCH 08/32] bpf, riscv: Remove redundant bpf_flush_icache() after pack allocator finalize bpf_flush_icache() calls flush_icache_range() to clean the data cache and invalidate the instruction cache for the JITed code region. However, since commit 48a8f78c50bd ("bpf, riscv: use prog pack allocator in the BPF JIT"), this flush is redundant. 
bpf_jit_binary_pack_finalize() copies the JITed instructions to the ROX region via bpf_arch_text_copy() -> patch_text_nosync(), and patch_text_nosync() already calls flush_icache_range() on the written range. The subsequent bpf_flush_icache() repeats the same cache maintenance on an overlapping range. Remove the redundant bpf_flush_icache() call and its now-unused definition. Fixes: 48a8f78c50bd ("bpf, riscv: use prog pack allocator in the BPF JIT") Acked-by: Song Liu Signed-off-by: Puranjay Mohan Reviewed-by: Pu Lehui Tested-by: Paul Chaignon Link: https://lore.kernel.org/r/20260413191111.3426023-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/riscv/net/bpf_jit.h | 6 ------ arch/riscv/net/bpf_jit_core.c | 7 ------- 2 files changed, 13 deletions(-) diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index 632ced07bca4..da0271790244 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -11,7 +11,6 @@ #include #include -#include /* verify runtime detection extension status */ #define rv_ext_enabled(ext) \ @@ -105,11 +104,6 @@ static inline void bpf_fill_ill_insns(void *area, unsigned int size) memset(area, 0, size); } -static inline void bpf_flush_icache(void *start, void *end) -{ - flush_icache_range((unsigned long)start, (unsigned long)end); -} - /* Emit a 4-byte riscv instruction. */ static inline void emit(const u32 insn, struct rv_jit_context *ctx) { diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index b3581e926436..f7fd4afc3ca3 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -183,13 +183,6 @@ skip_init_ctx: prog = orig_prog; goto out_offset; } - /* - * The instructions have now been copied to the ROX region from - * where they will execute. - * Write any modified data cache blocks out to memory and - * invalidate the corresponding blocks in the instruction cache. 
- */ - bpf_flush_icache(jit_data->ro_header, ctx->ro_insns + ctx->ninsns); for (i = 0; i < prog->len; i++) ctx->offset[i] = ninsns_rvoff(ctx->offset[i]); bpf_prog_fill_jited_linfo(prog, ctx->offset); From 36bf7beb9d23bfe7feba6f376a0c13ed7b670cf8 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 13 Apr 2026 12:02:57 -0700 Subject: [PATCH 09/32] selftests/bpf: Prevent allocating data larger than a page Fix a bug in the task local data library that may allocate more than a page for tld_data_u. This may happen when users set a too large TLD_DYN_DATA_SIZE, so check it when creating dynamic TLD fields and fix the corresponding selftest. Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260413190259.358442-2-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/task_local_data.h | 3 ++- .../bpf/prog_tests/test_task_local_data.c | 18 +++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_data.h b/tools/testing/selftests/bpf/prog_tests/task_local_data.h index 1e5c67c78ffb..489f07045c9f 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_local_data.h +++ b/tools/testing/selftests/bpf/prog_tests/task_local_data.h @@ -241,7 +241,8 @@ retry: * TLD_DYN_DATA_SIZE is allocated for tld_create_key() */ if (dyn_data) { - if (off + TLD_ROUND_UP(size, 8) > tld_meta_p->size) + if (off + TLD_ROUND_UP(size, 8) > tld_meta_p->size || + tld_meta_p->size > TLD_PAGE_SIZE - sizeof(struct tld_data_u)) return (tld_key_t){-E2BIG}; } else { if (off + TLD_ROUND_UP(size, 8) > TLD_PAGE_SIZE - sizeof(struct tld_data_u)) diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c index e219ff506b56..8b99b4880d24 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c +++ b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c @@ -3,8 +3,14 @@ #include #include +/* + * 
Only a page is pinned to kernel, so the maximum amount of dynamic data + * allowed is page_size - sizeof(struct tld_data_u) - static TLD fields. + */ +#define TLD_DYN_DATA_SIZE_MAX (getpagesize() - sizeof(struct tld_data_u) - 8) + #define TLD_FREE_DATA_ON_THREAD_EXIT -#define TLD_DYN_DATA_SIZE (getpagesize() - 8) +#define TLD_DYN_DATA_SIZE TLD_DYN_DATA_SIZE_MAX #include "task_local_data.h" struct test_tld_struct { @@ -147,11 +153,13 @@ static void test_task_local_data_basic(void) /* * Shouldn't be able to store data exceed a page. Create a TLD just big - * enough to exceed a page. TLDs already created are int value0, int - * value1, and struct test_tld_struct value2. + * enough to exceed a page. Data already contains struct tld_data_u, + * value0 and value1 of int type, and value 2 of struct test_tld_struct. */ - key = tld_create_key("value_not_exist", - TLD_PAGE_SIZE - 2 * sizeof(int) - sizeof(struct test_tld_struct) + 1); + key = tld_create_key("value_not_exist", TLD_PAGE_SIZE + 1 - + sizeof(struct tld_data_u) - + TLD_ROUND_UP(sizeof(int), 8) * 2 - + TLD_ROUND_UP(sizeof(struct test_tld_struct), 8)); ASSERT_EQ(tld_key_err_or_zero(key), -E2BIG, "tld_create_key"); key = tld_create_key("value2", sizeof(struct test_tld_struct)); From 615e55a2418405b628921e0596ac50317fd04474 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 13 Apr 2026 12:02:58 -0700 Subject: [PATCH 10/32] selftests/bpf: Fix tld_get_data() returning garbage data BPF side tld_get_data() currently may return garbage when tld_data_u is not aligned to page_size. This can happen when small amount of memory is allocated for tld_data_u. The misalignment is supposed to be allowed and the BPF side will use tld_data_u->start to reference the tld_data_u in a page. However, since "start" is within tld_data_u, there is no way to know the correct "start" in the first place. As a result, BPF programs will see garbage data. 
The selftest did not catch this since it tries to allocate the maximum amount of data possible (i.e., a page) such that tld_data_u->start is always correct. Fix it by moving tld_data_u->start to tld_data_map->start. The original field is now renamed as unused instead of removing it because BPF side tld_get_data() views off = 0 returned from tld_fetch_key() as uninitialized. Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260413190259.358442-3-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/task_local_data.h | 10 ++++++++-- .../testing/selftests/bpf/progs/task_local_data.bpf.h | 5 +++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_data.h b/tools/testing/selftests/bpf/prog_tests/task_local_data.h index 489f07045c9f..8ae4fb2027f7 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_local_data.h +++ b/tools/testing/selftests/bpf/prog_tests/task_local_data.h @@ -99,14 +99,20 @@ struct tld_meta_u { struct tld_metadata metadata[]; }; +/* + * The unused field ensures map_val.start > 0. On the BPF side, __tld_fetch_key() + * calculates off by summing map_val.start and tld_key_t.off and treats off == 0 + * as key not cached. + */ struct tld_data_u { - __u64 start; /* offset of tld_data_u->data in a page */ + __u64 unused; char data[] __attribute__((aligned(8))); }; struct tld_map_value { void *data; struct tld_meta_u *meta; + __u16 start; /* offset of tld_data_u->data in a page */ }; struct tld_meta_u * _Atomic tld_meta_p __attribute__((weak)); @@ -182,7 +188,7 @@ static int __tld_init_data_p(int map_fd) * is a page in BTF. 
*/ map_val.data = (void *)(TLD_PAGE_MASK & (intptr_t)data); - data->start = (~TLD_PAGE_MASK & (intptr_t)data) + sizeof(struct tld_data_u); + map_val.start = (~TLD_PAGE_MASK & (intptr_t)data) + sizeof(struct tld_data_u); map_val.meta = tld_meta_p; err = bpf_map_update_elem(map_fd, &tid_fd, &map_val, 0); diff --git a/tools/testing/selftests/bpf/progs/task_local_data.bpf.h b/tools/testing/selftests/bpf/progs/task_local_data.bpf.h index 1f396711f487..0df8a12fd61e 100644 --- a/tools/testing/selftests/bpf/progs/task_local_data.bpf.h +++ b/tools/testing/selftests/bpf/progs/task_local_data.bpf.h @@ -86,13 +86,14 @@ struct tld_meta_u { }; struct tld_data_u { - __u64 start; /* offset of tld_data_u->data in a page */ + __u64 unused; char data[__PAGE_SIZE - sizeof(__u64)] __attribute__((aligned(8))); }; struct tld_map_value { struct tld_data_u __uptr *data; struct tld_meta_u __uptr *meta; + __u16 start; /* offset of tld_data_u->data in a page */ }; typedef struct tld_uptr_dummy { @@ -176,7 +177,7 @@ static int __tld_fetch_key(struct tld_object *tld_obj, const char *name, int i_s if (!tld_obj->data_map || !tld_obj->data_map->data || !tld_obj->data_map->meta) return 0; - start = tld_obj->data_map->data->start; + start = tld_obj->data_map->start; cnt = tld_obj->data_map->meta->cnt; metadata = tld_obj->data_map->meta->metadata; From b4b0233730d5b2cdb170f6f5f183bfb1047b6dfa Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 13 Apr 2026 12:02:59 -0700 Subject: [PATCH 11/32] selftests/bpf: Test small task local data allocation Make sure task local data is working correctly for different allocation sizes. Existing task local data selftests allocate the maximum amount of data possible but miss the garbage data issue when only small amount of data is allocated. Therefore, test small data allocations as well. 
Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260413190259.358442-4-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/test_task_local_data.c | 78 ++++++++++++++++++- 1 file changed, 74 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c index 8b99b4880d24..6a5806b36113 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c +++ b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c @@ -30,12 +30,12 @@ TLD_DEFINE_KEY(value0_key, "value0", sizeof(int)); * sequentially. Users of task local data library should not touch * library internal. */ -static void reset_tld(void) +static void reset_tld(__u16 dyn_data_size) { if (tld_meta_p) { /* Remove TLDs created by tld_create_key() */ tld_meta_p->cnt = 1; - tld_meta_p->size = TLD_DYN_DATA_SIZE; + tld_meta_p->size = dyn_data_size + 8; memset(&tld_meta_p->metadata[1], 0, (TLD_MAX_DATA_CNT - 1) * sizeof(struct tld_metadata)); } @@ -133,7 +133,7 @@ static void test_task_local_data_basic(void) tld_key_t key; int i, err; - reset_tld(); + reset_tld(TLD_DYN_DATA_SIZE_MAX); ASSERT_OK(pthread_mutex_init(&global_mutex, NULL), "pthread_mutex_init"); @@ -247,7 +247,7 @@ static void test_task_local_data_race(void) tld_keys[0] = value0_key; for (j = 0; j < 100; j++) { - reset_tld(); + reset_tld(TLD_DYN_DATA_SIZE_MAX); for (i = 0; i < TEST_RACE_THREAD_NUM; i++) { /* @@ -296,10 +296,80 @@ out: test_task_local_data__destroy(skel); } +static void test_task_local_data_dyn_size(__u16 dyn_data_size) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct test_task_local_data *skel; + int max_keys, i, err, fd, *data; + char name[TLD_NAME_LEN]; + tld_key_t key; + + reset_tld(dyn_data_size); + + skel = test_task_local_data__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + tld_keys = calloc(TLD_MAX_DATA_CNT, sizeof(tld_key_t)); + if 
(!ASSERT_OK_PTR(tld_keys, "calloc tld_keys")) + goto out; + + fd = bpf_map__fd(skel->maps.tld_data_map); + + /* Create as many int-sized TLDs as the dynamic data size allows */ + max_keys = dyn_data_size / TLD_ROUND_UP(sizeof(int), 8); + for (i = 0; i < max_keys; i++) { + snprintf(name, TLD_NAME_LEN, "value_%d", i); + tld_keys[i] = tld_create_key(name, sizeof(int)); + if (!ASSERT_FALSE(tld_key_is_err(tld_keys[i]), "tld_create_key")) + goto out; + + data = tld_get_data(fd, tld_keys[i]); + if (!ASSERT_OK_PTR(data, "tld_get_data")) + goto out; + *data = i; + } + + /* The next key should fail with E2BIG */ + key = tld_create_key("overflow", sizeof(int)); + ASSERT_EQ(tld_key_err_or_zero(key), -E2BIG, "tld_create_key overflow"); + + /* Verify data for value_i do not overlap */ + for (i = 0; i < max_keys; i++) { + data = tld_get_data(fd, tld_keys[i]); + if (!ASSERT_OK_PTR(data, "tld_get_data")) + goto out; + + ASSERT_EQ(*data, i, "tld_get_data value_i"); + } + + /* Verify BPF side can still read the static key */ + data = tld_get_data(fd, value0_key); + if (!ASSERT_OK_PTR(data, "tld_get_data value0")) + goto out; + *data = 0xdeadbeef; + + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.task_main), &opts); + ASSERT_OK(err, "run task_main"); + ASSERT_EQ(skel->bss->test_value0, 0xdeadbeef, "tld_get_data value0"); + +out: + if (tld_keys) { + free(tld_keys); + tld_keys = NULL; + } + tld_free(); + test_task_local_data__destroy(skel); +} + void test_task_local_data(void) { if (test__start_subtest("task_local_data_basic")) test_task_local_data_basic(); if (test__start_subtest("task_local_data_race")) test_task_local_data_race(); + if (test__start_subtest("task_local_data_dyn_size_small")) + test_task_local_data_dyn_size(64); + if (test__start_subtest("task_local_data_dyn_size_zero")) + test_task_local_data_dyn_size(0); } From 0251e40c48299243c12f7cf4a6046f080af206cb Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 15 Apr 2026 13:03:55 -0700 Subject: [PATCH 
12/32] bpf: copy BPF token from main program to subprograms bpf_jit_subprogs() copies various fields from the main program's aux to each subprogram's aux, but omits the BPF token. This causes bpf_prog_kallsyms_add() to fail for subprograms loaded via BPF token, as bpf_token_capable() falls back to capable() in init_user_ns when token is NULL. Copy prog->aux->token to func[i]->aux->token so that subprograms inherit the same capability delegation as the main program. Fixes: d79a35497547 ("bpf: Consistently use BPF token throughout BPF verifier logic") Signed-off-by: Tao Chen Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260415-subprog-token-fix-v4-1-9bd000e8b068@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/fixups.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index 67c9b28767e1..dd00a680e4ea 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -1110,6 +1110,7 @@ int bpf_jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb; func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data; func[i]->aux->might_sleep = env->subprog_info[i].might_sleep; + func[i]->aux->token = prog->aux->token; if (!i) func[i]->aux->exception_boundary = env->seen_exception; From 969fb456ffb43d87894a295dbe6a0a722691552a Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 15 Apr 2026 13:03:56 -0700 Subject: [PATCH 13/32] selftests/bpf: verify kallsyms entries for token-loaded subprograms Add a test that loads an XDP program with a global subprogram using a BPF token from a user namespace, then verifies that both the main program and the subprogram appear in /proc/kallsyms. This exercises the bpf_prog_kallsyms_add() path for subprograms and would have caught the missing aux->token copy in bpf_jit_subprogs(). load_kallsyms_local() filters out kallsyms with zero addresses. 
For a process with limited capabilities to read kallsym addresses the following sysctl variables have to be set to zero: - /proc/sys/kernel/perf_event_paranoid - /proc/sys/kernel/kptr_restrict Set these variables using sysctl_set() utility function extracted from unpriv_bpf_disabled.c to a separate c/header. Since the test modifies global system state, mark it as serial. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260415-subprog-token-fix-v4-2-9bd000e8b068@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 1 + .../testing/selftests/bpf/prog_tests/token.c | 86 ++++++++++++++++++- .../bpf/prog_tests/unpriv_bpf_disabled.c | 21 +---- .../selftests/bpf/progs/token_kallsyms.c | 19 ++++ tools/testing/selftests/bpf/sysctl_helpers.c | 37 ++++++++ tools/testing/selftests/bpf/sysctl_helpers.h | 8 ++ 6 files changed, 149 insertions(+), 23 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/token_kallsyms.c create mode 100644 tools/testing/selftests/bpf/sysctl_helpers.c create mode 100644 tools/testing/selftests/bpf/sysctl_helpers.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 78e60040811e..6ef6872adbc3 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -751,6 +751,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ btf_helpers.c \ cap_helpers.c \ unpriv_helpers.c \ + sysctl_helpers.c \ netlink_helpers.c \ jit_disasm_helpers.c \ io_helpers.c \ diff --git a/tools/testing/selftests/bpf/prog_tests/token.c b/tools/testing/selftests/bpf/prog_tests/token.c index b81dde283052..f2f5d36ae00a 100644 --- a/tools/testing/selftests/bpf/prog_tests/token.c +++ b/tools/testing/selftests/bpf/prog_tests/token.c @@ -1,9 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
*/ #define _GNU_SOURCE -#include #include -#include "cap_helpers.h" #include #include #include @@ -15,9 +13,17 @@ #include #include #include + +#include "bpf_util.h" +#include "cap_helpers.h" +#include "sysctl_helpers.h" +#include "test_progs.h" +#include "trace_helpers.h" + #include "priv_map.skel.h" #include "priv_prog.skel.h" #include "dummy_st_ops_success.skel.h" +#include "token_kallsyms.skel.h" #include "token_lsm.skel.h" #include "priv_freplace_prog.skel.h" @@ -1045,6 +1051,58 @@ err_out: return -EINVAL; } +static bool kallsyms_has_bpf_func(struct ksyms *ksyms, const char *func_name) +{ + char name[256]; + int i; + + for (i = 0; i < ksyms->sym_cnt; i++) { + if (sscanf(ksyms->syms[i].name, "bpf_prog_%*[^_]_%255s", name) == 1 && + strcmp(name, func_name) == 0) + return true; + } + return false; +} + +static int userns_obj_priv_prog_kallsyms(int mnt_fd, struct token_lsm *lsm_skel) +{ + const char *func_names[] = { "xdp_main", "token_ksym_subprog" }; + LIBBPF_OPTS(bpf_object_open_opts, opts); + struct token_kallsyms *skel; + struct ksyms *ksyms = NULL; + char buf[256]; + int i, err; + + snprintf(buf, sizeof(buf), "/proc/self/fd/%d", mnt_fd); + opts.bpf_token_path = buf; + skel = token_kallsyms__open_opts(&opts); + if (!ASSERT_OK_PTR(skel, "token_kallsyms__open_opts")) + return -EINVAL; + + err = token_kallsyms__load(skel); + if (!ASSERT_OK(err, "token_kallsyms__load")) + goto cleanup; + + ksyms = load_kallsyms_local(); + if (!ASSERT_OK_PTR(ksyms, "load_kallsyms_local")) { + err = -EINVAL; + goto cleanup; + } + + for (i = 0; i < ARRAY_SIZE(func_names); i++) { + if (!ASSERT_TRUE(kallsyms_has_bpf_func(ksyms, func_names[i]), + func_names[i])) { + err = -EINVAL; + break; + } + } + +cleanup: + free_kallsyms_local(ksyms); + token_kallsyms__destroy(skel); + return err; +} + #define bit(n) (1ULL << (n)) static int userns_bpf_token_info(int mnt_fd, struct token_lsm *lsm_skel) @@ -1082,7 +1140,7 @@ cleanup: return err; } -void test_token(void) +void serial_test_token(void) 
{ if (test__start_subtest("map_token")) { struct bpffs_opts opts = { @@ -1194,4 +1252,26 @@ void test_token(void) subtest_userns(&opts, userns_bpf_token_info); } + if (test__start_subtest("obj_priv_prog_kallsyms")) { + char perf_paranoid_orig[32] = {}; + char kptr_restrict_orig[32] = {}; + struct bpffs_opts opts = { + .cmds = bit(BPF_BTF_LOAD) | bit(BPF_PROG_LOAD), + .progs = bit(BPF_PROG_TYPE_XDP), + .attachs = ~0ULL, + }; + + if (sysctl_set_or_fail("/proc/sys/kernel/perf_event_paranoid", perf_paranoid_orig, "0")) + goto cleanup; + if (sysctl_set_or_fail("/proc/sys/kernel/kptr_restrict", kptr_restrict_orig, "0")) + goto cleanup; + + subtest_userns(&opts, userns_obj_priv_prog_kallsyms); + +cleanup: + if (perf_paranoid_orig[0]) + sysctl_set_or_fail("/proc/sys/kernel/perf_event_paranoid", NULL, perf_paranoid_orig); + if (kptr_restrict_orig[0]) + sysctl_set_or_fail("/proc/sys/kernel/kptr_restrict", NULL, kptr_restrict_orig); + } } diff --git a/tools/testing/selftests/bpf/prog_tests/unpriv_bpf_disabled.c b/tools/testing/selftests/bpf/prog_tests/unpriv_bpf_disabled.c index 472f4f9fa95f..64404602b9ab 100644 --- a/tools/testing/selftests/bpf/prog_tests/unpriv_bpf_disabled.c +++ b/tools/testing/selftests/bpf/prog_tests/unpriv_bpf_disabled.c @@ -8,6 +8,7 @@ #include "cap_helpers.h" #include "bpf_util.h" +#include "sysctl_helpers.h" /* Using CAP_LAST_CAP is risky here, since it can get pulled in from * an old /usr/include/linux/capability.h and be < CAP_BPF; as a result @@ -36,26 +37,6 @@ static void process_perfbuf(void *ctx, int cpu, void *data, __u32 len) got_perfbuf_val = *(__u32 *)data; } -static int sysctl_set(const char *sysctl_path, char *old_val, const char *new_val) -{ - int ret = 0; - FILE *fp; - - fp = fopen(sysctl_path, "r+"); - if (!fp) - return -errno; - if (old_val && fscanf(fp, "%s", old_val) <= 0) { - ret = -ENOENT; - } else if (!old_val || strcmp(old_val, new_val) != 0) { - fseek(fp, 0, SEEK_SET); - if (fprintf(fp, "%s", new_val) < 0) - ret = -errno; - } - 
fclose(fp); - - return ret; -} - static void test_unpriv_bpf_disabled_positive(struct test_unpriv_bpf_disabled *skel, __u32 prog_id, int prog_fd, int perf_fd, char **map_paths, int *map_fds) diff --git a/tools/testing/selftests/bpf/progs/token_kallsyms.c b/tools/testing/selftests/bpf/progs/token_kallsyms.c new file mode 100644 index 000000000000..c9f9344f3eb2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/token_kallsyms.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include + +char _license[] SEC("license") = "GPL"; + +__weak +int token_ksym_subprog(void) +{ + return 0; +} + +SEC("xdp") +int xdp_main(struct xdp_md *xdp) +{ + return token_ksym_subprog(); +} diff --git a/tools/testing/selftests/bpf/sysctl_helpers.c b/tools/testing/selftests/bpf/sysctl_helpers.c new file mode 100644 index 000000000000..e2bd824f12d5 --- /dev/null +++ b/tools/testing/selftests/bpf/sysctl_helpers.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include "sysctl_helpers.h" +#include "test_progs.h" + +int sysctl_set(const char *sysctl_path, char *old_val, const char *new_val) +{ + int ret = 0; + FILE *fp; + + fp = fopen(sysctl_path, "r+"); + if (!fp) + return -errno; + if (old_val && fscanf(fp, "%s", old_val) <= 0) { + ret = -ENOENT; + } else if (!old_val || strcmp(old_val, new_val) != 0) { + fseek(fp, 0, SEEK_SET); + if (fprintf(fp, "%s", new_val) < 0) + ret = -errno; + } + fclose(fp); + + return ret; +} + +int sysctl_set_or_fail(const char *sysctl_path, char *old_val, const char *new_val) +{ + int err; + + err = sysctl_set(sysctl_path, old_val, new_val); + if (err) + PRINT_FAIL("failed to set %s to %s: %s\n", sysctl_path, new_val, strerror(-err)); + return err; +} diff --git a/tools/testing/selftests/bpf/sysctl_helpers.h b/tools/testing/selftests/bpf/sysctl_helpers.h new file mode 100644 index 000000000000..35e37bfe1b3b --- /dev/null +++ 
b/tools/testing/selftests/bpf/sysctl_helpers.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __SYSCTL_HELPERS_H +#define __SYSCTL_HELPERS_H + +int sysctl_set(const char *sysctl_path, char *old_val, const char *new_val); +int sysctl_set_or_fail(const char *sysctl_path, char *old_val, const char *new_val); + +#endif From a25566084e391348385a72dd507e0cc0c268dd5d Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 14 Apr 2026 16:13:15 +0200 Subject: [PATCH 14/32] bpf, sockmap: Annotate af_unix sock:: Sk_state data-races sock_map_sk_state_allowed() and sock_map_redirect_allowed() read af_unix socket sk_state locklessly. Use READ_ONCE(). Note that for sock_map_redirect_allowed() change affects not only af_unix, but all non-TCP sockets (UDP, af_vsock). Suggested-by: Kuniyuki Iwashima Suggested-by: Martin KaFai Lau Signed-off-by: Michal Luczaj Signed-off-by: Martin KaFai Lau Reviewed-by: Jiayuan Chen Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260414-unix-proto-update-null-ptr-deref-v4-1-2af6fe97918e@rbox.co --- net/core/sock_map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index b0e96337a269..02a68be3002a 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -530,7 +530,7 @@ static bool sock_map_redirect_allowed(const struct sock *sk) if (sk_is_tcp(sk)) return sk->sk_state != TCP_LISTEN; else - return sk->sk_state == TCP_ESTABLISHED; + return READ_ONCE(sk->sk_state) == TCP_ESTABLISHED; } static bool sock_map_sk_is_suitable(const struct sock *sk) @@ -543,7 +543,7 @@ static bool sock_map_sk_state_allowed(const struct sock *sk) if (sk_is_tcp(sk)) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); if (sk_is_stream_unix(sk)) - return (1 << sk->sk_state) & TCPF_ESTABLISHED; + return (1 << READ_ONCE(sk->sk_state)) & TCPF_ESTABLISHED; if (sk_is_vsock(sk) && (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) return (1 << sk->sk_state) & 
TCPF_ESTABLISHED; From 4d328dd695383224aa750ddee6b4ad40c0f8d205 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 14 Apr 2026 16:13:16 +0200 Subject: [PATCH 15/32] bpf, sockmap: Fix af_unix iter deadlock bpf_iter_unix_seq_show() may deadlock when lock_sock_fast() takes the fast path and the iter prog attempts to update a sockmap. Which ends up spinning at sock_map_update_elem()'s bh_lock_sock(): WARNING: possible recursive locking detected test_progs/1393 is trying to acquire lock: ffff88811ec25f58 (slock-AF_UNIX){+...}-{3:3}, at: sock_map_update_elem+0xdb/0x1f0 but task is already holding lock: ffff88811ec25f58 (slock-AF_UNIX){+...}-{3:3}, at: __lock_sock_fast+0x37/0xe0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(slock-AF_UNIX); lock(slock-AF_UNIX); *** DEADLOCK *** May be due to missing lock nesting notation 4 locks held by test_progs/1393: #0: ffff88814b59c790 (&p->lock){+.+.}-{4:4}, at: bpf_seq_read+0x59/0x10d0 #1: ffff88811ec25fd8 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: bpf_seq_read+0x42c/0x10d0 #2: ffff88811ec25f58 (slock-AF_UNIX){+...}-{3:3}, at: __lock_sock_fast+0x37/0xe0 #3: ffffffff85a6a7c0 (rcu_read_lock){....}-{1:3}, at: bpf_iter_run_prog+0x51d/0xb00 Call Trace: dump_stack_lvl+0x5d/0x80 print_deadlock_bug.cold+0xc0/0xce __lock_acquire+0x130f/0x2590 lock_acquire+0x14e/0x2b0 _raw_spin_lock+0x30/0x40 sock_map_update_elem+0xdb/0x1f0 bpf_prog_2d0075e5d9b721cd_dump_unix+0x55/0x4f4 bpf_iter_run_prog+0x5b9/0xb00 bpf_iter_unix_seq_show+0x1f7/0x2e0 bpf_seq_read+0x42c/0x10d0 vfs_read+0x171/0xb20 ksys_read+0xff/0x200 do_syscall_64+0x6b/0x3a0 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fixes: 2c860a43dd77 ("bpf: af_unix: Implement BPF iterator for UNIX domain socket.") Suggested-by: Kuniyuki Iwashima Suggested-by: Martin KaFai Lau Signed-off-by: Michal Luczaj Signed-off-by: Martin KaFai Lau Reviewed-by: Jiayuan Chen Reviewed-by: Kuniyuki Iwashima Link: 
https://patch.msgid.link/20260414-unix-proto-update-null-ptr-deref-v4-2-2af6fe97918e@rbox.co --- net/unix/af_unix.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4c4a8d23ddd2..3a041a7469ba 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3735,15 +3735,14 @@ static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) struct bpf_prog *prog; struct sock *sk = v; uid_t uid; - bool slow; int ret; if (v == SEQ_START_TOKEN) return 0; - slow = lock_sock_fast(sk); + lock_sock(sk); - if (unlikely(sk_unhashed(sk))) { + if (unlikely(sock_flag(sk, SOCK_DEAD))) { ret = SEQ_SKIP; goto unlock; } @@ -3753,7 +3752,7 @@ static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) prog = bpf_iter_get_info(&meta, false); ret = unix_prog_seq_show(prog, &meta, v, uid); unlock: - unlock_sock_fast(sk, slow); + release_sock(sk); return ret; } From 997b8483d44c60805c71a9882376a16eb176cb24 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 14 Apr 2026 16:13:17 +0200 Subject: [PATCH 16/32] selftests/bpf: Extend bpf_iter_unix to attempt deadlocking Updating a sockmap from a unix iterator prog may lead to a deadlock. Piggyback on the original selftest. 
Signed-off-by: Michal Luczaj Signed-off-by: Martin KaFai Lau Reviewed-by: Jiayuan Chen Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260414-unix-proto-update-null-ptr-deref-v4-3-2af6fe97918e@rbox.co --- tools/testing/selftests/bpf/progs/bpf_iter_unix.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_unix.c b/tools/testing/selftests/bpf/progs/bpf_iter_unix.c index fea275df9e22..a2652c8c3616 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_unix.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_unix.c @@ -7,6 +7,13 @@ char _license[] SEC("license") = "GPL"; +SEC(".maps") struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} sockmap; + static long sock_i_ino(const struct sock *sk) { const struct socket *sk_socket = sk->sk_socket; @@ -76,5 +83,8 @@ int dump_unix(struct bpf_iter__unix *ctx) BPF_SEQ_PRINTF(seq, "\n"); + /* Test for deadlock. */ + bpf_map_update_elem(&sockmap, &(int){0}, sk, 0); + return 0; } From dca38b7734d2ea00af4818ff3ae836fab33d5d5a Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 14 Apr 2026 16:13:18 +0200 Subject: [PATCH 17/32] bpf, sockmap: Fix af_unix null-ptr-deref in proto update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit unix_stream_connect() sets sk_state (`WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED)`) _before_ it assigns a peer (`unix_peer(sk) = newsk`). sk_state == TCP_ESTABLISHED makes sock_map_sk_state_allowed() believe that socket is properly set up, which would include having a defined peer. IOW, there's a window when unix_stream_bpf_update_proto() can be called on socket which still has unix_peer(sk) == NULL. CPU0 bpf CPU1 connect -------- ------------ WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED) sock_map_sk_state_allowed(sk) ... 
sk_pair = unix_peer(sk) sock_hold(sk_pair) sock_hold(newsk) smp_mb__after_atomic() unix_peer(sk) = newsk BUG: kernel NULL pointer dereference, address: 0000000000000080 RIP: 0010:unix_stream_bpf_update_proto+0xa0/0x1b0 Call Trace: sock_map_link+0x564/0x8b0 sock_map_update_common+0x6e/0x340 sock_map_update_elem_sys+0x17d/0x240 __sys_bpf+0x26db/0x3250 __x64_sys_bpf+0x21/0x30 do_syscall_64+0x6b/0x3a0 entry_SYSCALL_64_after_hwframe+0x76/0x7e Initial idea was to move peer assignment _before_ the sk_state update[1], but that involved an additional memory barrier, and changing the hot path was rejected. Then a NULL check during proto update in unix_stream_bpf_update_proto() was considered[2], but the follow-up discussion[3] focused on the root cause, i.e. sockmap update taking a wrong lock. Or, more specifically, missing unix_state_lock()[4]. In the end it was concluded that teaching sockmap about the af_unix locking would be unnecessarily complex[5]. Complexity aside, since BPF_PROG_TYPE_SCHED_CLS and BPF_PROG_TYPE_SCHED_ACT are allowed to update sockmaps, sock_map_update_elem() taking the unix lock, as it is currently implemented in unix_state_lock(): spin_lock(&unix_sk(s)->lock), would be problematic. unix_state_lock() taken in a process context, followed by a softirq-context TC BPF program attempting to take the same spinlock -- deadlock[6]. This way we circled back to the peer check idea[2]. 
[1]: https://lore.kernel.org/netdev/ba5c50aa-1df4-40c2-ab33-a72022c5a32e@rbox.co/ [2]: https://lore.kernel.org/netdev/20240610174906.32921-1-kuniyu@amazon.com/ [3]: https://lore.kernel.org/netdev/7603c0e6-cd5b-452b-b710-73b64bd9de26@linux.dev/ [4]: https://lore.kernel.org/netdev/CAAVpQUA+8GL_j63CaKb8hbxoL21izD58yr1NvhOhU=j+35+3og@mail.gmail.com/ [5]: https://lore.kernel.org/bpf/CAAVpQUAHijOMext28Gi10dSLuMzGYh+jK61Ujn+fZ-wvcODR2A@mail.gmail.com/ [6]: https://lore.kernel.org/bpf/dd043c69-4d03-46fe-8325-8f97101435cf@linux.dev/ Summary of scenarios where af_unix/stream connect() may race a sockmap update: 1. connect() vs. bpf(BPF_MAP_UPDATE_ELEM), i.e. sock_map_update_elem_sys() Implemented NULL check is sufficient. Once assigned, socket peer won't be released until socket fd is released. And that's not an issue because sock_map_update_elem_sys() bumps fd refcnf. 2. connect() vs BPF program doing update Update restricted per verifier.c:may_update_sockmap() to BPF_PROG_TYPE_TRACING/BPF_TRACE_ITER BPF_PROG_TYPE_SOCK_OPS (bpf_sock_map_update() only) BPF_PROG_TYPE_SOCKET_FILTER BPF_PROG_TYPE_SCHED_CLS BPF_PROG_TYPE_SCHED_ACT BPF_PROG_TYPE_XDP BPF_PROG_TYPE_SK_REUSEPORT BPF_PROG_TYPE_FLOW_DISSECTOR BPF_PROG_TYPE_SK_LOOKUP Plus one more race to consider: CPU0 bpf CPU1 connect -------- ------------ WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED) sock_map_sk_state_allowed(sk) sock_hold(newsk) smp_mb__after_atomic() unix_peer(sk) = newsk sk_pair = unix_peer(sk) if (unlikely(!sk_pair)) return -EINVAL; CPU1 close ---------- skpair = unix_peer(sk); unix_peer(sk) = NULL; sock_put(skpair) // use after free? sock_hold(sk_pair) 2.1 BPF program invoking helper function bpf_sock_map_update() -> BPF_CALL_4(bpf_sock_map_update(), ...) Helper limited to BPF_PROG_TYPE_SOCK_OPS. Nevertheless, a unix sock might be accessible via bpf_map_lookup_elem(). Which implies sk already having psock, which in turn implies sk already having sk_pair. 
Since sk_psock_destroy() is queued as RCU work, sk_pair won't go away while BPF executes the update. 2.2 BPF program invoking helper function bpf_map_update_elem() -> sock_map_update_elem() 2.2.1 Unix sock accessible to BPF prog only via sockmap lookup in BPF_PROG_TYPE_SOCKET_FILTER, BPF_PROG_TYPE_SCHED_CLS, BPF_PROG_TYPE_SCHED_ACT, BPF_PROG_TYPE_XDP, BPF_PROG_TYPE_SK_REUSEPORT, BPF_PROG_TYPE_FLOW_DISSECTOR, BPF_PROG_TYPE_SK_LOOKUP. Pretty much the same as case 2.1. 2.2.2 Unix sock accessible to BPF program directly: BPF_PROG_TYPE_TRACING, narrowed down to BPF_TRACE_ITER. Sockmap iterator (sock_map_seq_ops) is safe: unix sock residing in a sockmap means that the sock already went through the proto update step. Unix sock iterator (bpf_iter_unix_seq_ops), on the other hand, gives access to socks that may still be unconnected. Which means iterator prog can race sockmap/proto update against connect(). BUG: KASAN: null-ptr-deref in unix_stream_bpf_update_proto+0x253/0x4d0 Write of size 4 at addr 0000000000000080 by task test_progs/3140 Call Trace: dump_stack_lvl+0x5d/0x80 kasan_report+0xe4/0x1c0 kasan_check_range+0x125/0x200 unix_stream_bpf_update_proto+0x253/0x4d0 sock_map_link+0x71c/0xec0 sock_map_update_common+0xbc/0x600 sock_map_update_elem+0x19a/0x1f0 bpf_prog_bbbf56096cdd4f01_selective_dump_unix+0x20c/0x217 bpf_iter_run_prog+0x21e/0xae0 bpf_iter_unix_seq_show+0x1e0/0x2a0 bpf_seq_read+0x42c/0x10d0 vfs_read+0x171/0xb20 ksys_read+0xff/0x200 do_syscall_64+0xf7/0x5e0 entry_SYSCALL_64_after_hwframe+0x76/0x7e While the introduced NULL check prevents null-ptr-deref in the BPF program path as well, it is insufficient to guard against a poorly timed close() leading to a use-after-free. This will be addressed in a subsequent patch. 
Fixes: c63829182c37 ("af_unix: Implement ->psock_update_sk_prot()") Closes: https://lore.kernel.org/netdev/ba5c50aa-1df4-40c2-ab33-a72022c5a32e@rbox.co/ Reported-by: Michal Luczaj Reported-by: 钱一铭 Suggested-by: Kuniyuki Iwashima Suggested-by: Martin KaFai Lau Signed-off-by: Michal Luczaj Signed-off-by: Martin KaFai Lau Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260414-unix-proto-update-null-ptr-deref-v4-4-2af6fe97918e@rbox.co --- net/unix/unix_bpf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c index d14cd5454a8d..f86ff19e9764 100644 --- a/net/unix/unix_bpf.c +++ b/net/unix/unix_bpf.c @@ -185,6 +185,9 @@ int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool r */ if (!psock->sk_pair) { sk_pair = unix_peer(sk); + if (unlikely(!sk_pair)) + return -EINVAL; + sock_hold(sk_pair); psock->sk_pair = sk_pair; } From 64c2f93fc3254d3bf5de4445fb732ee5c451edb6 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 14 Apr 2026 16:13:19 +0200 Subject: [PATCH 18/32] bpf, sockmap: Take state lock for af_unix iter When a BPF iterator program updates a sockmap, there is a race condition in unix_stream_bpf_update_proto() where the `peer` pointer can become stale[1] during a state transition TCP_ESTABLISHED -> TCP_CLOSE. CPU0 bpf CPU1 close -------- ---------- // unix_stream_bpf_update_proto() sk_pair = unix_peer(sk) if (unlikely(!sk_pair)) return -EINVAL; // unix_release_sock() skpair = unix_peer(sk); unix_peer(sk) = NULL; sock_put(skpair) sock_hold(sk_pair) // UaF More practically, this fix guarantees that the iterator program is consistently provided with a unix socket that remains stable during iterator execution. 
[1]: BUG: KASAN: slab-use-after-free in unix_stream_bpf_update_proto+0x155/0x490 Write of size 4 at addr ffff8881178c9a00 by task test_progs/2231 Call Trace: dump_stack_lvl+0x5d/0x80 print_report+0x170/0x4f3 kasan_report+0xe4/0x1c0 kasan_check_range+0x125/0x200 unix_stream_bpf_update_proto+0x155/0x490 sock_map_link+0x71c/0xec0 sock_map_update_common+0xbc/0x600 sock_map_update_elem+0x19a/0x1f0 bpf_prog_bbbf56096cdd4f01_selective_dump_unix+0x20c/0x217 bpf_iter_run_prog+0x21e/0xae0 bpf_iter_unix_seq_show+0x1e0/0x2a0 bpf_seq_read+0x42c/0x10d0 vfs_read+0x171/0xb20 ksys_read+0xff/0x200 do_syscall_64+0xf7/0x5e0 entry_SYSCALL_64_after_hwframe+0x76/0x7e Allocated by task 2236: kasan_save_stack+0x30/0x50 kasan_save_track+0x14/0x30 __kasan_slab_alloc+0x63/0x80 kmem_cache_alloc_noprof+0x1d5/0x680 sk_prot_alloc+0x59/0x210 sk_alloc+0x34/0x470 unix_create1+0x86/0x8a0 unix_stream_connect+0x318/0x15b0 __sys_connect+0xfd/0x130 __x64_sys_connect+0x72/0xd0 do_syscall_64+0xf7/0x5e0 entry_SYSCALL_64_after_hwframe+0x76/0x7e Freed by task 2236: kasan_save_stack+0x30/0x50 kasan_save_track+0x14/0x30 kasan_save_free_info+0x3b/0x70 __kasan_slab_free+0x47/0x70 kmem_cache_free+0x11c/0x590 __sk_destruct+0x432/0x6e0 unix_release_sock+0x9b3/0xf60 unix_release+0x8a/0xf0 __sock_release+0xb0/0x270 sock_close+0x18/0x20 __fput+0x36e/0xac0 fput_close_sync+0xe5/0x1a0 __x64_sys_close+0x7d/0xd0 do_syscall_64+0xf7/0x5e0 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fixes: 2c860a43dd77 ("bpf: af_unix: Implement BPF iterator for UNIX domain socket.") Suggested-by: Kuniyuki Iwashima Signed-off-by: Michal Luczaj Signed-off-by: Martin KaFai Lau Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260414-unix-proto-update-null-ptr-deref-v4-5-2af6fe97918e@rbox.co --- net/unix/af_unix.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3a041a7469ba..f668ff107722 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3741,6 +3741,7 @@ static int 
bpf_iter_unix_seq_show(struct seq_file *seq, void *v) return 0; lock_sock(sk); + unix_state_lock(sk); if (unlikely(sock_flag(sk, SOCK_DEAD))) { ret = SEQ_SKIP; @@ -3752,6 +3753,7 @@ static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) prog = bpf_iter_get_info(&meta, false); ret = unix_prog_seq_show(prog, &meta, v, uid); unlock: + unix_state_unlock(sk); release_sock(sk); return ret; } From d3e945223e0158c85dbde23de4f89493a2a817f6 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Thu, 16 Apr 2026 06:43:37 +0000 Subject: [PATCH 19/32] bpf: Move constants blinding out of arch-specific JITs During the JIT stage, constants blinding rewrites instructions but only rewrites the private instruction copy of the JITed subprog, leaving the global env->prog->insnsi and env->insn_aux_data untouched. This causes a mismatch between subprog instructions and the global state, making it difficult to use the global data in the JIT. To avoid this mismatch, and given that all arch-specific JITs already support constants blinding, move it to the generic verifier code, and switch to rewrite the global env->prog->insnsi with the global states adjusted, as other rewrites in the verifier do. This removes the constants blinding calls in each JIT, which are largely duplicated code across architectures. Since constants blinding is only required for JIT, and there are two JIT entry functions, jit_subprogs() for BPF programs with multiple subprogs and bpf_prog_select_runtime() for programs with no subprogs, move the constants blinding invocation into these two functions. In the verifier path, bpf_patch_insn_data() is used to keep global verifier auxiliary data in sync with patched instructions. A key question is whether this global auxiliary data should be restored on the failure path. 
Besides instructions, bpf_patch_insn_data() adjusts: - prog->aux->poke_tab - env->insn_array_maps - env->subprog_info - env->insn_aux_data For prog->aux->poke_tab, it is only used by JIT or only meaningful after JIT succeeds, so it does not need to be restored on the failure path. For env->insn_array_maps, when JIT fails, programs using insn arrays are rejected by bpf_insn_array_ready() due to missing JIT addresses. Hence, env->insn_array_maps is only meaningful for JIT and does not need to be restored. For subprog_info, if jit_subprogs fails and CONFIG_BPF_JIT_ALWAYS_ON is not enabled, kernel falls back to interpreter. In this case, env->subprog_info is used to determine subprogram stack depth. So it must be restored on failure. For env->insn_aux_data, it is freed by clear_insn_aux_data() at the end of bpf_check(). Before freeing, clear_insn_aux_data() loops over env->insn_aux_data to release jump targets recorded in it. The loop uses env->prog->len as the array length, but this length no longer matches the actual size of the adjusted env->insn_aux_data array after constants blinding. To address it, a simple approach is to keep insn_aux_data as adjusted after failure, since it will be freed shortly, and record its actual size for the loop in clear_insn_aux_data(). But since clear_insn_aux_data() uses the same index to loop over both env->prog->insnsi and env->insn_aux_data, this approach results in incorrect index for the insnsi array. So an alternative approach is adopted: clone the original env->insn_aux_data before blinding and restore it after failure, similar to env->prog. For classic BPF programs, constants blinding works as before since it is still invoked from bpf_prog_select_runtime(). 
Reviewed-by: Anton Protopopov # v8 Reviewed-by: Hari Bathini # powerpc jit Reviewed-by: Pu Lehui # riscv jit Acked-by: Hengqi Chen # loongarch jit Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20260416064341.151802-2-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- arch/arc/net/bpf_jit_core.c | 39 +++------ arch/arm/net/bpf_jit_32.c | 41 ++------- arch/arm64/net/bpf_jit_comp.c | 72 +++++---------- arch/loongarch/net/bpf_jit.c | 59 ++++--------- arch/mips/net/bpf_jit_comp.c | 20 +---- arch/parisc/net/bpf_jit_core.c | 73 ++++++---------- arch/powerpc/net/bpf_jit_comp.c | 72 ++++++--------- arch/riscv/net/bpf_jit_core.c | 61 +++++-------- arch/s390/net/bpf_jit_comp.c | 59 +++++-------- arch/sparc/net/bpf_jit_comp_64.c | 61 +++++-------- arch/x86/net/bpf_jit_comp.c | 43 ++------- arch/x86/net/bpf_jit_comp32.c | 33 +------ include/linux/filter.h | 33 ++++++- kernel/bpf/core.c | 69 +++++++++++++-- kernel/bpf/fixups.c | 146 ++++++++++++++++++++++++++----- 15 files changed, 403 insertions(+), 478 deletions(-) diff --git a/arch/arc/net/bpf_jit_core.c b/arch/arc/net/bpf_jit_core.c index 1421eeced0f5..973ceae48675 100644 --- a/arch/arc/net/bpf_jit_core.c +++ b/arch/arc/net/bpf_jit_core.c @@ -79,7 +79,6 @@ struct arc_jit_data { * The JIT pertinent context that is used by different functions. * * prog: The current eBPF program being handled. - * orig_prog: The original eBPF program before any possible change. * jit: The JIT buffer and its length. * bpf_header: The JITed program header. "jit.buf" points inside it. * emit: If set, opcodes are written to memory; else, a dry-run. @@ -94,12 +93,10 @@ struct arc_jit_data { * need_extra_pass: A forecast if an "extra_pass" will occur. * is_extra_pass: Indicates if the current pass is an extra pass. * user_bpf_prog: True, if VM opcodes come from a real program. - * blinded: True if "constant blinding" step returned a new "prog". * success: Indicates if the whole JIT went OK. 
*/ struct jit_context { struct bpf_prog *prog; - struct bpf_prog *orig_prog; struct jit_buffer jit; struct bpf_binary_header *bpf_header; bool emit; @@ -114,7 +111,6 @@ struct jit_context { bool need_extra_pass; bool is_extra_pass; bool user_bpf_prog; - bool blinded; bool success; }; @@ -161,13 +157,7 @@ static int jit_ctx_init(struct jit_context *ctx, struct bpf_prog *prog) { memset(ctx, 0, sizeof(*ctx)); - ctx->orig_prog = prog; - - /* If constant blinding was requested but failed, scram. */ - ctx->prog = bpf_jit_blind_constants(prog); - if (IS_ERR(ctx->prog)) - return PTR_ERR(ctx->prog); - ctx->blinded = (ctx->prog != ctx->orig_prog); + ctx->prog = prog; /* If the verifier doesn't zero-extend, then we have to do it. */ ctx->do_zext = !ctx->prog->aux->verifier_zext; @@ -214,14 +204,6 @@ static inline void maybe_free(struct jit_context *ctx, void **mem) */ static void jit_ctx_cleanup(struct jit_context *ctx) { - if (ctx->blinded) { - /* if all went well, release the orig_prog. */ - if (ctx->success) - bpf_jit_prog_release_other(ctx->prog, ctx->orig_prog); - else - bpf_jit_prog_release_other(ctx->orig_prog, ctx->prog); - } - maybe_free(ctx, (void **)&ctx->bpf2insn); maybe_free(ctx, (void **)&ctx->jit_data); @@ -229,12 +211,19 @@ static void jit_ctx_cleanup(struct jit_context *ctx) ctx->bpf2insn_valid = false; /* Freeing "bpf_header" is enough. "jit.buf" is a sub-array of it. 
*/ - if (!ctx->success && ctx->bpf_header) { - bpf_jit_binary_free(ctx->bpf_header); - ctx->bpf_header = NULL; - ctx->jit.buf = NULL; - ctx->jit.index = 0; - ctx->jit.len = 0; + if (!ctx->success) { + if (ctx->bpf_header) { + bpf_jit_binary_free(ctx->bpf_header); + ctx->bpf_header = NULL; + ctx->jit.buf = NULL; + ctx->jit.index = 0; + ctx->jit.len = 0; + } + if (ctx->is_extra_pass) { + ctx->prog->bpf_func = NULL; + ctx->prog->jited = 0; + ctx->prog->jited_len = 0; + } } ctx->emit = false; diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index deeb8f292454..e6b1bb2de627 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -2144,9 +2144,7 @@ bool bpf_jit_needs_zext(void) struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { - struct bpf_prog *tmp, *orig_prog = prog; struct bpf_binary_header *header; - bool tmp_blinded = false; struct jit_ctx ctx; unsigned int tmp_idx; unsigned int image_size; @@ -2156,20 +2154,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) * the interpreter. */ if (!prog->jit_requested) - return orig_prog; - - /* If constant blinding was enabled and we failed during blinding - * then we must fall back to the interpreter. Otherwise, we save - * the new JITed code. 
- */ - tmp = bpf_jit_blind_constants(prog); - - if (IS_ERR(tmp)) - return orig_prog; - if (tmp != prog) { - tmp_blinded = true; - prog = tmp; - } + return prog; memset(&ctx, 0, sizeof(ctx)); ctx.prog = prog; @@ -2179,10 +2164,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) * we must fall back to the interpreter */ ctx.offsets = kcalloc(prog->len, sizeof(int), GFP_KERNEL); - if (ctx.offsets == NULL) { - prog = orig_prog; - goto out; - } + if (ctx.offsets == NULL) + return prog; /* 1) fake pass to find in the length of the JITed code, * to compute ctx->offsets and other context variables @@ -2194,10 +2177,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) * being successful in the second pass, so just fall back * to the interpreter. */ - if (build_body(&ctx)) { - prog = orig_prog; + if (build_body(&ctx)) goto out_off; - } tmp_idx = ctx.idx; build_prologue(&ctx); @@ -2213,10 +2194,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) ctx.idx += ctx.imm_count; if (ctx.imm_count) { ctx.imms = kcalloc(ctx.imm_count, sizeof(u32), GFP_KERNEL); - if (ctx.imms == NULL) { - prog = orig_prog; + if (ctx.imms == NULL) goto out_off; - } } #else /* there's nothing about the epilogue on ARMv7 */ @@ -2238,10 +2217,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) /* Not able to allocate memory for the structure then * we must fall back to the interpretation */ - if (header == NULL) { - prog = orig_prog; + if (header == NULL) goto out_imms; - } /* 2.) Actual pass to generate final JIT code */ ctx.target = (u32 *) image_ptr; @@ -2278,16 +2255,12 @@ out_imms: #endif out_off: kfree(ctx.offsets); -out: - if (tmp_blinded) - bpf_jit_prog_release_other(prog, prog == orig_prog ? 
- tmp : orig_prog); + return prog; out_free: image_ptr = NULL; bpf_jit_binary_free(header); - prog = orig_prog; goto out_imms; } diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 524b67c0867e..d310d1c35192 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2003,14 +2003,12 @@ struct arm64_jit_data { struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { int image_size, prog_size, extable_size, extable_align, extable_offset; - struct bpf_prog *tmp, *orig_prog = prog; struct bpf_binary_header *header; struct bpf_binary_header *ro_header = NULL; struct arm64_jit_data *jit_data; void __percpu *priv_stack_ptr = NULL; bool was_classic = bpf_prog_was_classic(prog); int priv_stack_alloc_sz; - bool tmp_blinded = false; bool extra_pass = false; struct jit_ctx ctx; u8 *image_ptr; @@ -2019,26 +2017,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) int exentry_idx; if (!prog->jit_requested) - return orig_prog; - - tmp = bpf_jit_blind_constants(prog); - /* If blinding was requested and we failed during blinding, - * we must fall back to the interpreter. 
- */ - if (IS_ERR(tmp)) - return orig_prog; - if (tmp != prog) { - tmp_blinded = true; - prog = tmp; - } + return prog; jit_data = prog->aux->jit_data; if (!jit_data) { jit_data = kzalloc_obj(*jit_data); - if (!jit_data) { - prog = orig_prog; - goto out; - } + if (!jit_data) + return prog; prog->aux->jit_data = jit_data; } priv_stack_ptr = prog->aux->priv_stack_ptr; @@ -2050,10 +2035,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 16) + 2 * PRIV_STACK_GUARD_SZ; priv_stack_ptr = __alloc_percpu_gfp(priv_stack_alloc_sz, 16, GFP_KERNEL); - if (!priv_stack_ptr) { - prog = orig_prog; + if (!priv_stack_ptr) goto out_priv_stack; - } priv_stack_init_guard(priv_stack_ptr, priv_stack_alloc_sz); prog->aux->priv_stack_ptr = priv_stack_ptr; @@ -2073,10 +2056,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) ctx.prog = prog; ctx.offset = kvzalloc_objs(int, prog->len + 1); - if (ctx.offset == NULL) { - prog = orig_prog; + if (ctx.offset == NULL) goto out_off; - } ctx.user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena); ctx.arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena); @@ -2089,15 +2070,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) * BPF line info needs ctx->offset[i] to be the offset of * instruction[i] in jited image, so build prologue first. 
*/ - if (build_prologue(&ctx, was_classic)) { - prog = orig_prog; + if (build_prologue(&ctx, was_classic)) goto out_off; - } - if (build_body(&ctx, extra_pass)) { - prog = orig_prog; + if (build_body(&ctx, extra_pass)) goto out_off; - } ctx.epilogue_offset = ctx.idx; build_epilogue(&ctx, was_classic); @@ -2115,10 +2092,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) ro_header = bpf_jit_binary_pack_alloc(image_size, &ro_image_ptr, sizeof(u64), &header, &image_ptr, jit_fill_hole); - if (!ro_header) { - prog = orig_prog; + if (!ro_header) goto out_off; - } /* Pass 2: Determine jited position and result for each instruction */ @@ -2146,10 +2121,8 @@ skip_init_ctx: /* Dont write body instructions to memory for now */ ctx.write = false; - if (build_body(&ctx, extra_pass)) { - prog = orig_prog; + if (build_body(&ctx, extra_pass)) goto out_free_hdr; - } ctx.epilogue_offset = ctx.idx; ctx.exentry_idx = exentry_idx; @@ -2158,19 +2131,15 @@ skip_init_ctx: /* Pass 3: Adjust jump offset and write final image */ if (build_body(&ctx, extra_pass) || - WARN_ON_ONCE(ctx.idx != ctx.epilogue_offset)) { - prog = orig_prog; + WARN_ON_ONCE(ctx.idx != ctx.epilogue_offset)) goto out_free_hdr; - } build_epilogue(&ctx, was_classic); build_plt(&ctx); /* Extra pass to validate JITed code. 
*/ - if (validate_ctx(&ctx)) { - prog = orig_prog; + if (validate_ctx(&ctx)) goto out_free_hdr; - } /* update the real prog size */ prog_size = sizeof(u32) * ctx.idx; @@ -2187,16 +2156,13 @@ skip_init_ctx: if (extra_pass && ctx.idx > jit_data->ctx.idx) { pr_err_once("multi-func JIT bug %d > %d\n", ctx.idx, jit_data->ctx.idx); - prog->bpf_func = NULL; - prog->jited = 0; - prog->jited_len = 0; goto out_free_hdr; } if (WARN_ON(bpf_jit_binary_pack_finalize(ro_header, header))) { - /* ro_header has been freed */ + /* ro_header and header have been freed */ ro_header = NULL; - prog = orig_prog; - goto out_off; + header = NULL; + goto out_free_hdr; } } else { jit_data->ctx = ctx; @@ -2233,13 +2199,15 @@ out_priv_stack: kfree(jit_data); prog->aux->jit_data = NULL; } -out: - if (tmp_blinded) - bpf_jit_prog_release_other(prog, prog == orig_prog ? - tmp : orig_prog); + return prog; out_free_hdr: + if (extra_pass) { + prog->bpf_func = NULL; + prog->jited = 0; + prog->jited_len = 0; + } if (header) { bpf_arch_text_copy(&ro_header->size, &header->size, sizeof(header->size)); diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c index 9cb796e16379..fcc8c0c29fb0 100644 --- a/arch/loongarch/net/bpf_jit.c +++ b/arch/loongarch/net/bpf_jit.c @@ -1922,43 +1922,26 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { - bool tmp_blinded = false, extra_pass = false; + bool extra_pass = false; u8 *image_ptr, *ro_image_ptr; int image_size, prog_size, extable_size; struct jit_ctx ctx; struct jit_data *jit_data; struct bpf_binary_header *header; struct bpf_binary_header *ro_header; - struct bpf_prog *tmp, *orig_prog = prog; /* * If BPF JIT was not enabled then we must fall back to * the interpreter. */ if (!prog->jit_requested) - return orig_prog; - - tmp = bpf_jit_blind_constants(prog); - /* - * If blinding was requested and we failed during blinding, - * we must fall back to the interpreter.
Otherwise, we save - * the new JITed code. - */ - if (IS_ERR(tmp)) - return orig_prog; - - if (tmp != prog) { - tmp_blinded = true; - prog = tmp; - } + return prog; jit_data = prog->aux->jit_data; if (!jit_data) { jit_data = kzalloc_obj(*jit_data); - if (!jit_data) { - prog = orig_prog; - goto out; - } + if (!jit_data) + return prog; prog->aux->jit_data = jit_data; } if (jit_data->ctx.offset) { @@ -1978,17 +1961,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) ctx.user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena); ctx.offset = kvcalloc(prog->len + 1, sizeof(u32), GFP_KERNEL); - if (ctx.offset == NULL) { - prog = orig_prog; + if (ctx.offset == NULL) goto out_offset; - } /* 1. Initial fake pass to compute ctx->idx and set ctx->flags */ build_prologue(&ctx); - if (build_body(&ctx, extra_pass)) { - prog = orig_prog; + if (build_body(&ctx, extra_pass)) goto out_offset; - } ctx.epilogue_offset = ctx.idx; build_epilogue(&ctx); @@ -2004,10 +1983,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) /* Now we know the size of the structure to make */ ro_header = bpf_jit_binary_pack_alloc(image_size, &ro_image_ptr, sizeof(u32), &header, &image_ptr, jit_fill_hole); - if (!ro_header) { - prog = orig_prog; + if (!ro_header) goto out_offset; - } /* 2. Now, the actual pass to generate final JIT code */ /* @@ -2027,17 +2004,13 @@ skip_init_ctx: ctx.num_exentries = 0; build_prologue(&ctx); - if (build_body(&ctx, extra_pass)) { - prog = orig_prog; + if (build_body(&ctx, extra_pass)) goto out_free; - } build_epilogue(&ctx); /* 3. 
Extra pass to validate JITed code */ - if (validate_ctx(&ctx)) { - prog = orig_prog; + if (validate_ctx(&ctx)) goto out_free; - } /* And we're done */ if (bpf_jit_enable > 1) @@ -2050,9 +2023,9 @@ skip_init_ctx: goto out_free; } if (WARN_ON(bpf_jit_binary_pack_finalize(ro_header, header))) { - /* ro_header has been freed */ + /* ro_header and header have been freed */ ro_header = NULL; - prog = orig_prog; + header = NULL; goto out_free; } /* @@ -2084,13 +2057,15 @@ out_offset: prog->aux->jit_data = NULL; } -out: - if (tmp_blinded) - bpf_jit_prog_release_other(prog, prog == orig_prog ? tmp : orig_prog); - return prog; out_free: + if (extra_pass) { + prog->bpf_func = NULL; + prog->jited = 0; + prog->jited_len = 0; + } + if (header) { bpf_arch_text_copy(&ro_header->size, &header->size, sizeof(header->size)); bpf_jit_binary_pack_free(ro_header, header); diff --git a/arch/mips/net/bpf_jit_comp.c b/arch/mips/net/bpf_jit_comp.c index e355dfca4400..d2b6c955f18e 100644 --- a/arch/mips/net/bpf_jit_comp.c +++ b/arch/mips/net/bpf_jit_comp.c @@ -911,10 +911,8 @@ bool bpf_jit_needs_zext(void) struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { - struct bpf_prog *tmp, *orig_prog = prog; struct bpf_binary_header *header = NULL; struct jit_context ctx; - bool tmp_blinded = false; unsigned int tmp_idx; unsigned int image_size; u8 *image_ptr; @@ -925,19 +923,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) * the interpreter. */ if (!prog->jit_requested) - return orig_prog; - /* - * If constant blinding was enabled and we failed during blinding - * then we must fall back to the interpreter. Otherwise, we save - * the new JITed code. 
- */ - tmp = bpf_jit_blind_constants(prog); - if (IS_ERR(tmp)) - return orig_prog; - if (tmp != prog) { - tmp_blinded = true; - prog = tmp; - } + return prog; memset(&ctx, 0, sizeof(ctx)); ctx.program = prog; @@ -1025,14 +1011,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) prog->jited_len = image_size; out: - if (tmp_blinded) - bpf_jit_prog_release_other(prog, prog == orig_prog ? - tmp : orig_prog); kfree(ctx.descriptors); return prog; out_err: - prog = orig_prog; if (header) bpf_jit_binary_free(header); goto out; diff --git a/arch/parisc/net/bpf_jit_core.c b/arch/parisc/net/bpf_jit_core.c index a5eb6b51e27a..35dca372b5df 100644 --- a/arch/parisc/net/bpf_jit_core.c +++ b/arch/parisc/net/bpf_jit_core.c @@ -44,30 +44,19 @@ bool bpf_jit_needs_zext(void) struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { unsigned int prog_size = 0, extable_size = 0; - bool tmp_blinded = false, extra_pass = false; - struct bpf_prog *tmp, *orig_prog = prog; + bool extra_pass = false; int pass = 0, prev_ninsns = 0, prologue_len, i; struct hppa_jit_data *jit_data; struct hppa_jit_context *ctx; if (!prog->jit_requested) - return orig_prog; - - tmp = bpf_jit_blind_constants(prog); - if (IS_ERR(tmp)) - return orig_prog; - if (tmp != prog) { - tmp_blinded = true; - prog = tmp; - } + return prog; jit_data = prog->aux->jit_data; if (!jit_data) { jit_data = kzalloc_obj(*jit_data); - if (!jit_data) { - prog = orig_prog; - goto out; - } + if (!jit_data) + return prog; prog->aux->jit_data = jit_data; } @@ -81,10 +70,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) ctx->prog = prog; ctx->offset = kzalloc_objs(int, prog->len); - if (!ctx->offset) { - prog = orig_prog; - goto out_offset; - } + if (!ctx->offset) + goto out_err; for (i = 0; i < prog->len; i++) { prev_ninsns += 20; ctx->offset[i] = prev_ninsns; @@ -93,10 +80,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) for (i = 0; i < NR_JIT_ITERATIONS; i++) { pass++; ctx->ninsns = 0; - if 
(build_body(ctx, extra_pass, ctx->offset)) { - prog = orig_prog; - goto out_offset; - } + if (build_body(ctx, extra_pass, ctx->offset)) + goto out_err; ctx->body_len = ctx->ninsns; bpf_jit_build_prologue(ctx); ctx->prologue_len = ctx->ninsns - ctx->body_len; @@ -116,10 +101,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) &jit_data->image, sizeof(long), bpf_fill_ill_insns); - if (!jit_data->header) { - prog = orig_prog; - goto out_offset; - } + if (!jit_data->header) + goto out_err; ctx->insns = (u32 *)jit_data->image; /* @@ -134,8 +117,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) pr_err("bpf-jit: image did not converge in <%d passes!\n", i); if (jit_data->header) bpf_jit_binary_free(jit_data->header); - prog = orig_prog; - goto out_offset; + goto out_err; } if (extable_size) @@ -148,8 +130,7 @@ skip_init_ctx: bpf_jit_build_prologue(ctx); if (build_body(ctx, extra_pass, NULL)) { bpf_jit_binary_free(jit_data->header); - prog = orig_prog; - goto out_offset; + goto out_err; } bpf_jit_build_epilogue(ctx); @@ -160,20 +141,19 @@ skip_init_ctx: { extern int machine_restart(char *); machine_restart(""); } } + if (!prog->is_func || extra_pass) { + if (bpf_jit_binary_lock_ro(jit_data->header)) { + bpf_jit_binary_free(jit_data->header); + goto out_err; + } + bpf_flush_icache(jit_data->header, ctx->insns + ctx->ninsns); + } + prog->bpf_func = (void *)ctx->insns; prog->jited = 1; prog->jited_len = prog_size; - bpf_flush_icache(jit_data->header, ctx->insns + ctx->ninsns); - if (!prog->is_func || extra_pass) { - if (bpf_jit_binary_lock_ro(jit_data->header)) { - bpf_jit_binary_free(jit_data->header); - prog->bpf_func = NULL; - prog->jited = 0; - prog->jited_len = 0; - goto out_offset; - } prologue_len = ctx->epilogue_offset - ctx->body_len; for (i = 0; i < prog->len; i++) ctx->offset[i] += prologue_len; @@ -183,14 +163,19 @@ out_offset: kfree(jit_data); prog->aux->jit_data = NULL; } -out: + if (HPPA_JIT_REBOOT) { extern int machine_restart(char 
*); machine_restart(""); } - if (tmp_blinded) - bpf_jit_prog_release_other(prog, prog == orig_prog ? - tmp : orig_prog); return prog; + +out_err: + if (extra_pass) { + prog->bpf_func = NULL; + prog->jited = 0; + prog->jited_len = 0; + } + goto out_offset; } u64 hppa_div64(u64 div, u64 divisor) diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 50103b3794fb..2bae4699e78f 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -177,9 +177,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) void __percpu *priv_stack_ptr = NULL; struct bpf_binary_header *fhdr = NULL; struct bpf_binary_header *hdr = NULL; - struct bpf_prog *org_fp = fp; - struct bpf_prog *tmp_fp = NULL; - bool bpf_blinded = false; bool extra_pass = false; u8 *fimage = NULL; u32 *fcode_base = NULL; @@ -187,24 +184,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) u32 fixup_len; if (!fp->jit_requested) - return org_fp; - - tmp_fp = bpf_jit_blind_constants(org_fp); - if (IS_ERR(tmp_fp)) - return org_fp; - - if (tmp_fp != org_fp) { - bpf_blinded = true; - fp = tmp_fp; - } + return fp; jit_data = fp->aux->jit_data; if (!jit_data) { jit_data = kzalloc_obj(*jit_data); - if (!jit_data) { - fp = org_fp; - goto out; - } + if (!jit_data) + return fp; fp->aux->jit_data = jit_data; } @@ -219,10 +205,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) priv_stack_alloc_size = round_up(fp->aux->stack_depth, 16) + 2 * PRIV_STACK_GUARD_SZ; priv_stack_ptr = __alloc_percpu_gfp(priv_stack_alloc_size, 16, GFP_KERNEL); - if (!priv_stack_ptr) { - fp = org_fp; + if (!priv_stack_ptr) goto out_priv_stack; - } priv_stack_init_guard(priv_stack_ptr, priv_stack_alloc_size); fp->aux->priv_stack_ptr = priv_stack_ptr; @@ -249,10 +233,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) } addrs = kcalloc(flen + 1, sizeof(*addrs), GFP_KERNEL); - if (addrs == NULL) { - fp = org_fp; - goto out_addrs; - } + if (addrs == NULL) + goto 
out_err; memset(&cgctx, 0, sizeof(struct codegen_context)); bpf_jit_init_reg_mapping(&cgctx); @@ -279,11 +261,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) } /* Scouting faux-generate pass 0 */ - if (bpf_jit_build_body(fp, NULL, NULL, &cgctx, addrs, 0, false)) { + if (bpf_jit_build_body(fp, NULL, NULL, &cgctx, addrs, 0, false)) /* We hit something illegal or unsupported. */ - fp = org_fp; - goto out_addrs; - } + goto out_err; /* * If we have seen a tail call, we need a second pass. @@ -294,10 +274,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) */ if (cgctx.seen & SEEN_TAILCALL || !is_offset_in_branch_range((long)cgctx.idx * 4)) { cgctx.idx = 0; - if (bpf_jit_build_body(fp, NULL, NULL, &cgctx, addrs, 0, false)) { - fp = org_fp; - goto out_addrs; - } + if (bpf_jit_build_body(fp, NULL, NULL, &cgctx, addrs, 0, false)) + goto out_err; } bpf_jit_realloc_regs(&cgctx); @@ -318,10 +296,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) fhdr = bpf_jit_binary_pack_alloc(alloclen, &fimage, 4, &hdr, &image, bpf_jit_fill_ill_insns); - if (!fhdr) { - fp = org_fp; - goto out_addrs; - } + if (!fhdr) + goto out_err; if (extable_len) fp->aux->extable = (void *)fimage + FUNCTION_DESCR_SIZE + proglen + fixup_len; @@ -340,8 +316,7 @@ skip_init_ctx: extra_pass)) { bpf_arch_text_copy(&fhdr->size, &hdr->size, sizeof(hdr->size)); bpf_jit_binary_pack_free(fhdr, hdr); - fp = org_fp; - goto out_addrs; + goto out_err; } bpf_jit_build_epilogue(code_base, &cgctx); @@ -363,15 +338,16 @@ skip_init_ctx: ((u64 *)image)[1] = local_paca->kernel_toc; #endif + if (!fp->is_func || extra_pass) { + if (bpf_jit_binary_pack_finalize(fhdr, hdr)) + goto out_err; + } + fp->bpf_func = (void *)fimage; fp->jited = 1; fp->jited_len = cgctx.idx * 4 + FUNCTION_DESCR_SIZE; if (!fp->is_func || extra_pass) { - if (bpf_jit_binary_pack_finalize(fhdr, hdr)) { - fp = org_fp; - goto out_addrs; - } bpf_prog_fill_jited_linfo(fp, addrs); /* * On ABI V1, executable code starts after 
the function @@ -398,11 +374,15 @@ out_priv_stack: jit_data->hdr = hdr; } -out: - if (bpf_blinded) - bpf_jit_prog_release_other(fp, fp == org_fp ? tmp_fp : org_fp); - return fp; + +out_err: + if (extra_pass) { + fp->bpf_func = NULL; + fp->jited = 0; + fp->jited_len = 0; + } + goto out_addrs; } /* diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index f7fd4afc3ca3..36f0aea8096d 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -44,29 +44,19 @@ bool bpf_jit_needs_zext(void) struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { unsigned int prog_size = 0, extable_size = 0; - bool tmp_blinded = false, extra_pass = false; - struct bpf_prog *tmp, *orig_prog = prog; + bool extra_pass = false; int pass = 0, prev_ninsns = 0, i; struct rv_jit_data *jit_data; struct rv_jit_context *ctx; if (!prog->jit_requested) - return orig_prog; - - tmp = bpf_jit_blind_constants(prog); - if (IS_ERR(tmp)) - return orig_prog; - if (tmp != prog) { - tmp_blinded = true; - prog = tmp; - } + return prog; jit_data = prog->aux->jit_data; if (!jit_data) { jit_data = kzalloc_obj(*jit_data); if (!jit_data) { - prog = orig_prog; - goto out; + return prog; } prog->aux->jit_data = jit_data; } @@ -83,15 +73,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) ctx->user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena); ctx->prog = prog; ctx->offset = kzalloc_objs(int, prog->len); - if (!ctx->offset) { - prog = orig_prog; + if (!ctx->offset) goto out_offset; - } - if (build_body(ctx, extra_pass, NULL)) { - prog = orig_prog; + if (build_body(ctx, extra_pass, NULL)) goto out_offset; - } for (i = 0; i < prog->len; i++) { prev_ninsns += 32; @@ -105,10 +91,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) bpf_jit_build_prologue(ctx, bpf_is_subprog(prog)); ctx->prologue_len = ctx->ninsns; - if (build_body(ctx, extra_pass, ctx->offset)) { - prog = orig_prog; + if (build_body(ctx, extra_pass, ctx->offset)) goto 
out_offset; - } ctx->epilogue_offset = ctx->ninsns; bpf_jit_build_epilogue(ctx); @@ -126,10 +110,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) &jit_data->ro_image, sizeof(u32), &jit_data->header, &jit_data->image, bpf_fill_ill_insns); - if (!jit_data->ro_header) { - prog = orig_prog; + if (!jit_data->ro_header) goto out_offset; - } /* * Use the image(RW) for writing the JITed instructions. But also save @@ -150,7 +132,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) if (i == NR_JIT_ITERATIONS) { pr_err("bpf-jit: image did not converge in <%d passes!\n", i); - prog = orig_prog; goto out_free_hdr; } @@ -163,26 +144,27 @@ skip_init_ctx: ctx->nexentries = 0; bpf_jit_build_prologue(ctx, bpf_is_subprog(prog)); - if (build_body(ctx, extra_pass, NULL)) { - prog = orig_prog; + if (build_body(ctx, extra_pass, NULL)) goto out_free_hdr; - } bpf_jit_build_epilogue(ctx); if (bpf_jit_enable > 1) bpf_jit_dump(prog->len, prog_size, pass, ctx->insns); + if (!prog->is_func || extra_pass) { + if (WARN_ON(bpf_jit_binary_pack_finalize(jit_data->ro_header, jit_data->header))) { + /* ro_header has been freed */ + jit_data->ro_header = NULL; + jit_data->header = NULL; + goto out_free_hdr; + } + } + prog->bpf_func = (void *)ctx->ro_insns + cfi_get_offset(); prog->jited = 1; prog->jited_len = prog_size - cfi_get_offset(); if (!prog->is_func || extra_pass) { - if (WARN_ON(bpf_jit_binary_pack_finalize(jit_data->ro_header, jit_data->header))) { - /* ro_header has been freed */ - jit_data->ro_header = NULL; - prog = orig_prog; - goto out_offset; - } for (i = 0; i < prog->len; i++) ctx->offset[i] = ninsns_rvoff(ctx->offset[i]); bpf_prog_fill_jited_linfo(prog, ctx->offset); @@ -191,14 +173,15 @@ out_offset: kfree(jit_data); prog->aux->jit_data = NULL; } -out: - if (tmp_blinded) - bpf_jit_prog_release_other(prog, prog == orig_prog ? 
- tmp : orig_prog); return prog; out_free_hdr: + if (extra_pass) { + prog->bpf_func = NULL; + prog->jited = 0; + prog->jited_len = 0; + } if (jit_data->header) { bpf_arch_text_copy(&jit_data->ro_header->size, &jit_data->header->size, sizeof(jit_data->header->size)); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index d08d159b6319..2dfc279b1be2 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -2314,36 +2314,20 @@ static struct bpf_binary_header *bpf_jit_alloc(struct bpf_jit *jit, */ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) { - struct bpf_prog *tmp, *orig_fp = fp; struct bpf_binary_header *header; struct s390_jit_data *jit_data; - bool tmp_blinded = false; bool extra_pass = false; struct bpf_jit jit; int pass; if (!fp->jit_requested) - return orig_fp; - - tmp = bpf_jit_blind_constants(fp); - /* - * If blinding was requested and we failed during blinding, - * we must fall back to the interpreter. - */ - if (IS_ERR(tmp)) - return orig_fp; - if (tmp != fp) { - tmp_blinded = true; - fp = tmp; - } + return fp; jit_data = fp->aux->jit_data; if (!jit_data) { jit_data = kzalloc_obj(*jit_data); - if (!jit_data) { - fp = orig_fp; - goto out; - } + if (!jit_data) + return fp; fp->aux->jit_data = jit_data; } if (jit_data->ctx.addrs) { @@ -2356,34 +2340,27 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) memset(&jit, 0, sizeof(jit)); jit.addrs = kvcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL); - if (jit.addrs == NULL) { - fp = orig_fp; - goto free_addrs; - } + if (jit.addrs == NULL) + goto out_err; /* * Three initial passes: * - 1/2: Determine clobbered registers * - 3: Calculate program size and addrs array */ for (pass = 1; pass <= 3; pass++) { - if (bpf_jit_prog(&jit, fp, extra_pass)) { - fp = orig_fp; - goto free_addrs; - } + if (bpf_jit_prog(&jit, fp, extra_pass)) + goto out_err; } /* * Final pass: Allocate and generate program */ header = bpf_jit_alloc(&jit, fp); - if (!header) { - 
fp = orig_fp; - goto free_addrs; - } + if (!header) + goto out_err; skip_init_ctx: if (bpf_jit_prog(&jit, fp, extra_pass)) { bpf_jit_binary_free(header); - fp = orig_fp; - goto free_addrs; + goto out_err; } if (bpf_jit_enable > 1) { bpf_jit_dump(fp->len, jit.size, pass, jit.prg_buf); @@ -2392,8 +2369,7 @@ skip_init_ctx: if (!fp->is_func || extra_pass) { if (bpf_jit_binary_lock_ro(header)) { bpf_jit_binary_free(header); - fp = orig_fp; - goto free_addrs; + goto out_err; } } else { jit_data->header = header; @@ -2411,11 +2387,16 @@ free_addrs: kfree(jit_data); fp->aux->jit_data = NULL; } -out: - if (tmp_blinded) - bpf_jit_prog_release_other(fp, fp == orig_fp ? - tmp : orig_fp); + return fp; + +out_err: + if (extra_pass) { + fp->bpf_func = NULL; + fp->jited = 0; + fp->jited_len = 0; + } + goto free_addrs; } bool bpf_jit_supports_kfunc_call(void) diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index b23d1c645ae5..e83e29137566 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1479,37 +1479,22 @@ struct sparc64_jit_data { struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { - struct bpf_prog *tmp, *orig_prog = prog; struct sparc64_jit_data *jit_data; struct bpf_binary_header *header; u32 prev_image_size, image_size; - bool tmp_blinded = false; bool extra_pass = false; struct jit_ctx ctx; u8 *image_ptr; int pass, i; if (!prog->jit_requested) - return orig_prog; - - tmp = bpf_jit_blind_constants(prog); - /* If blinding was requested and we failed during blinding, - * we must fall back to the interpreter. 
- */ - if (IS_ERR(tmp)) - return orig_prog; - if (tmp != prog) { - tmp_blinded = true; - prog = tmp; - } + return prog; jit_data = prog->aux->jit_data; if (!jit_data) { jit_data = kzalloc_obj(*jit_data); - if (!jit_data) { - prog = orig_prog; - goto out; - } + if (!jit_data) + return prog; prog->aux->jit_data = jit_data; } if (jit_data->ctx.offset) { @@ -1527,10 +1512,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) ctx.prog = prog; ctx.offset = kmalloc_array(prog->len, sizeof(unsigned int), GFP_KERNEL); - if (ctx.offset == NULL) { - prog = orig_prog; - goto out_off; - } + if (ctx.offset == NULL) + goto out_err; /* Longest sequence emitted is for bswap32, 12 instructions. Pre-cook * the offset array so that we converge faster. @@ -1543,10 +1526,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) ctx.idx = 0; build_prologue(&ctx); - if (build_body(&ctx)) { - prog = orig_prog; - goto out_off; - } + if (build_body(&ctx)) + goto out_err; build_epilogue(&ctx); if (bpf_jit_enable > 1) @@ -1569,10 +1550,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) image_size = sizeof(u32) * ctx.idx; header = bpf_jit_binary_alloc(image_size, &image_ptr, sizeof(u32), jit_fill_hole); - if (header == NULL) { - prog = orig_prog; - goto out_off; - } + if (header == NULL) + goto out_err; ctx.image = (u32 *)image_ptr; skip_init_ctx: @@ -1582,8 +1561,7 @@ skip_init_ctx: if (build_body(&ctx)) { bpf_jit_binary_free(header); - prog = orig_prog; - goto out_off; + goto out_err; } build_epilogue(&ctx); @@ -1592,8 +1570,7 @@ skip_init_ctx: pr_err("bpf_jit: Failed to converge, prev_size=%u size=%d\n", prev_image_size, ctx.idx * 4); bpf_jit_binary_free(header); - prog = orig_prog; - goto out_off; + goto out_err; } if (bpf_jit_enable > 1) @@ -1604,8 +1581,7 @@ skip_init_ctx: if (!prog->is_func || extra_pass) { if (bpf_jit_binary_lock_ro(header)) { bpf_jit_binary_free(header); - prog = orig_prog; - goto out_off; + goto out_err; } } else { jit_data->ctx = ctx; 
@@ -1624,9 +1600,14 @@ out_off: kfree(jit_data); prog->aux->jit_data = NULL; } -out: - if (tmp_blinded) - bpf_jit_prog_release_other(prog, prog == orig_prog ? - tmp : orig_prog); + return prog; + +out_err: + if (extra_pass) { + prog->bpf_func = NULL; + prog->jited = 0; + prog->jited_len = 0; + } + goto out_off; } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e9b78040d703..77d00a8dec87 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3717,13 +3717,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *rw_header = NULL; struct bpf_binary_header *header = NULL; - struct bpf_prog *tmp, *orig_prog = prog; void __percpu *priv_stack_ptr = NULL; struct x64_jit_data *jit_data; int priv_stack_alloc_sz; int proglen, oldproglen = 0; struct jit_context ctx = {}; - bool tmp_blinded = false; bool extra_pass = false; bool padding = false; u8 *rw_image = NULL; @@ -3733,27 +3731,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) int i; if (!prog->jit_requested) - return orig_prog; - - tmp = bpf_jit_blind_constants(prog); - /* - * If blinding was requested and we failed during blinding, - * we must fall back to the interpreter. 
- */ - if (IS_ERR(tmp)) - return orig_prog; - if (tmp != prog) { - tmp_blinded = true; - prog = tmp; - } + return prog; jit_data = prog->aux->jit_data; if (!jit_data) { jit_data = kzalloc_obj(*jit_data); - if (!jit_data) { - prog = orig_prog; - goto out; - } + if (!jit_data) + return prog; prog->aux->jit_data = jit_data; } priv_stack_ptr = prog->aux->priv_stack_ptr; @@ -3765,10 +3749,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 8) + 2 * PRIV_STACK_GUARD_SZ; priv_stack_ptr = __alloc_percpu_gfp(priv_stack_alloc_sz, 8, GFP_KERNEL); - if (!priv_stack_ptr) { - prog = orig_prog; + if (!priv_stack_ptr) goto out_priv_stack; - } priv_stack_init_guard(priv_stack_ptr, priv_stack_alloc_sz); prog->aux->priv_stack_ptr = priv_stack_ptr; @@ -3786,10 +3768,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) goto skip_init_addrs; } addrs = kvmalloc_objs(*addrs, prog->len + 1); - if (!addrs) { - prog = orig_prog; + if (!addrs) goto out_addrs; - } /* * Before first pass, make a rough estimation of addrs[] @@ -3820,8 +3800,6 @@ out_image: sizeof(rw_header->size)); bpf_jit_binary_pack_free(header, rw_header); } - /* Fall back to interpreter mode */ - prog = orig_prog; if (extra_pass) { prog->bpf_func = NULL; prog->jited = 0; @@ -3852,10 +3830,8 @@ out_image: header = bpf_jit_binary_pack_alloc(roundup(proglen, align) + extable_size, &image, align, &rw_header, &rw_image, jit_fill_hole); - if (!header) { - prog = orig_prog; + if (!header) goto out_addrs; - } prog->aux->extable = (void *) image + roundup(proglen, align); } oldproglen = proglen; @@ -3908,8 +3884,6 @@ out_image: prog->bpf_func = (void *)image + cfi_get_offset(); prog->jited = 1; prog->jited_len = proglen - cfi_get_offset(); - } else { - prog = orig_prog; } if (!image || !prog->is_func || extra_pass) { @@ -3925,10 +3899,7 @@ out_priv_stack: kfree(jit_data); prog->aux->jit_data = NULL; } -out: - if (tmp_blinded) - 
bpf_jit_prog_release_other(prog, prog == orig_prog ? - tmp : orig_prog); + return prog; } diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index dda423025c3d..5f259577614a 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -2521,35 +2521,19 @@ bool bpf_jit_needs_zext(void) struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; - struct bpf_prog *tmp, *orig_prog = prog; int proglen, oldproglen = 0; struct jit_context ctx = {}; - bool tmp_blinded = false; u8 *image = NULL; int *addrs; int pass; int i; if (!prog->jit_requested) - return orig_prog; - - tmp = bpf_jit_blind_constants(prog); - /* - * If blinding was requested and we failed during blinding, - * we must fall back to the interpreter. - */ - if (IS_ERR(tmp)) - return orig_prog; - if (tmp != prog) { - tmp_blinded = true; - prog = tmp; - } + return prog; addrs = kmalloc_objs(*addrs, prog->len); - if (!addrs) { - prog = orig_prog; - goto out; - } + if (!addrs) + return prog; /* * Before first pass, make a rough estimation of addrs[] @@ -2574,7 +2558,6 @@ out_image: image = NULL; if (header) bpf_jit_binary_free(header); - prog = orig_prog; goto out_addrs; } if (image) { @@ -2588,10 +2571,8 @@ out_image: if (proglen == oldproglen) { header = bpf_jit_binary_alloc(proglen, &image, 1, jit_fill_hole); - if (!header) { - prog = orig_prog; + if (!header) goto out_addrs; - } } oldproglen = proglen; cond_resched(); @@ -2604,16 +2585,10 @@ out_image: prog->bpf_func = (void *)image; prog->jited = 1; prog->jited_len = proglen; - } else { - prog = orig_prog; } out_addrs: kfree(addrs); -out: - if (tmp_blinded) - bpf_jit_prog_release_other(prog, prog == orig_prog ? 
- tmp : orig_prog); return prog; } diff --git a/include/linux/filter.h b/include/linux/filter.h index f552170eacf4..9fa4d4090093 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1184,6 +1184,18 @@ static inline bool bpf_dump_raw_ok(const struct cred *cred) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); + +#ifdef CONFIG_BPF_SYSCALL +struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, + const struct bpf_insn *patch, u32 len); +#else +static inline struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, + const struct bpf_insn *patch, u32 len) +{ + return ERR_PTR(-ENOTSUPP); +} +#endif /* CONFIG_BPF_SYSCALL */ + int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt); static inline bool xdp_return_frame_no_direct(void) @@ -1310,9 +1322,14 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, const char *bpf_jit_get_prog_name(struct bpf_prog *prog); -struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp); +struct bpf_prog *bpf_jit_blind_constants(struct bpf_verifier_env *env, struct bpf_prog *prog); void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); +static inline bool bpf_prog_need_blind(const struct bpf_prog *prog) +{ + return prog->blinding_requested && !prog->blinded; +} + static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, u32 pass, void *image) { @@ -1451,6 +1468,20 @@ static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp) { } +static inline bool bpf_prog_need_blind(const struct bpf_prog *prog) +{ + return false; +} + +static inline +struct bpf_prog *bpf_jit_blind_constants(struct bpf_verifier_env *env, struct bpf_prog *prog) +{ + return prog; +} + +static inline void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) +{ +} #endif /* CONFIG_BPF_JIT */ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp); diff --git a/kernel/bpf/core.c 
b/kernel/bpf/core.c index 066b86e7233c..fc9fb3c07866 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1508,7 +1508,11 @@ static void adjust_insn_arrays(struct bpf_prog *prog, u32 off, u32 len) #endif } -struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) +/* + * Now this function is used only to blind the main prog and must be invoked only when + * bpf_prog_need_blind() returns true. + */ +struct bpf_prog *bpf_jit_blind_constants(struct bpf_verifier_env *env, struct bpf_prog *prog) { struct bpf_insn insn_buff[16], aux[2]; struct bpf_prog *clone, *tmp; @@ -1516,13 +1520,17 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) struct bpf_insn *insn; int i, rewritten; - if (!prog->blinding_requested || prog->blinded) - return prog; + if (WARN_ON_ONCE(env && env->prog != prog)) + return ERR_PTR(-EINVAL); clone = bpf_prog_clone_create(prog, GFP_USER); if (!clone) return ERR_PTR(-ENOMEM); + /* make sure bpf_patch_insn_data() patches the correct prog */ + if (env) + env->prog = clone; + insn_cnt = clone->len; insn = clone->insnsi; @@ -1550,21 +1558,35 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) if (!rewritten) continue; - tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); - if (IS_ERR(tmp)) { + if (env) + tmp = bpf_patch_insn_data(env, i, insn_buff, rewritten); + else + tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); + + if (IS_ERR_OR_NULL(tmp)) { + if (env) + /* restore the original prog */ + env->prog = prog; /* Patching may have repointed aux->prog during * realloc from the original one, so we need to * fix it up here on error. */ bpf_jit_prog_release_other(prog, clone); - return tmp; + return IS_ERR(tmp) ? 
tmp : ERR_PTR(-ENOMEM); } clone = tmp; insn_delta = rewritten - 1; - /* Instructions arrays must be updated using absolute xlated offsets */ - adjust_insn_arrays(clone, prog->aux->subprog_start + i, rewritten); + if (env) + env->prog = clone; + else + /* + * Instructions arrays must be updated using absolute xlated offsets. + * The arrays have already been adjusted by bpf_patch_insn_data() when + * env is not NULL. + */ + adjust_insn_arrays(clone, i, rewritten); /* Walk new program and skip insns we just inserted. */ insn = clone->insnsi + i + insn_delta; @@ -2533,6 +2555,35 @@ static bool bpf_prog_select_interpreter(struct bpf_prog *fp) return select_interpreter; } +static struct bpf_prog *bpf_prog_jit_compile(struct bpf_prog *prog) +{ +#ifdef CONFIG_BPF_JIT + struct bpf_prog *orig_prog; + + if (!bpf_prog_need_blind(prog)) + return bpf_int_jit_compile(prog); + + orig_prog = prog; + prog = bpf_jit_blind_constants(NULL, prog); + /* + * If blinding was requested and we failed during blinding, we must fall + * back to the interpreter. 
+ */ + if (IS_ERR(prog)) + return orig_prog; + + prog = bpf_int_jit_compile(prog); + if (prog->jited) { + bpf_jit_prog_release_other(prog, orig_prog); + return prog; + } + + bpf_jit_prog_release_other(orig_prog, prog); + prog = orig_prog; +#endif + return prog; +} + /** * bpf_prog_select_runtime - select exec runtime for BPF program * @fp: bpf_prog populated with BPF program @@ -2572,7 +2623,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) if (*err) return fp; - fp = bpf_int_jit_compile(fp); + fp = bpf_prog_jit_compile(fp); bpf_prog_jit_attempt_done(fp); if (!fp->jited && jit_needed) { *err = -ENOTSUPP; diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index dd00a680e4ea..721b830b5ef2 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -232,8 +232,8 @@ static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len) } } -static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, - const struct bpf_insn *patch, u32 len) +struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, + const struct bpf_insn *patch, u32 len) { struct bpf_prog *new_prog; struct bpf_insn_aux_data *new_data = NULL; @@ -973,7 +973,47 @@ patch_insn_buf: return 0; } -int bpf_jit_subprogs(struct bpf_verifier_env *env) +static u32 *bpf_dup_subprog_starts(struct bpf_verifier_env *env) +{ + u32 *starts = NULL; + + starts = kvmalloc_objs(u32, env->subprog_cnt, GFP_KERNEL_ACCOUNT); + if (starts) { + for (int i = 0; i < env->subprog_cnt; i++) + starts[i] = env->subprog_info[i].start; + } + return starts; +} + +static void bpf_restore_subprog_starts(struct bpf_verifier_env *env, u32 *orig_starts) +{ + for (int i = 0; i < env->subprog_cnt; i++) + env->subprog_info[i].start = orig_starts[i]; + /* restore the start of fake 'exit' subprog as well */ + env->subprog_info[env->subprog_cnt].start = env->prog->len; +} + +static struct bpf_insn_aux_data *bpf_dup_insn_aux_data(struct bpf_verifier_env *env) +{ + size_t size; 
+ void *new_aux; + + size = array_size(sizeof(struct bpf_insn_aux_data), env->prog->len); + new_aux = __vmalloc(size, GFP_KERNEL_ACCOUNT); + if (new_aux) + memcpy(new_aux, env->insn_aux_data, size); + return new_aux; +} + +static void bpf_restore_insn_aux_data(struct bpf_verifier_env *env, + struct bpf_insn_aux_data *orig_insn_aux) +{ + /* the expanded elements are zero-filled, so no special handling is required */ + vfree(env->insn_aux_data); + env->insn_aux_data = orig_insn_aux; +} + +static int jit_subprogs(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog, **func, *tmp; int i, j, subprog_start, subprog_end = 0, len, subprog; @@ -981,10 +1021,6 @@ int bpf_jit_subprogs(struct bpf_verifier_env *env) struct bpf_insn *insn; void *old_bpf_func; int err, num_exentries; - int old_len, subprog_start_adjustment = 0; - - if (env->subprog_cnt <= 1) - return 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn)) @@ -1053,10 +1089,11 @@ int bpf_jit_subprogs(struct bpf_verifier_env *env) goto out_free; func[i]->is_func = 1; func[i]->sleepable = prog->sleepable; + func[i]->blinded = prog->blinded; func[i]->aux->func_idx = i; /* Below members will be freed only at prog->aux */ func[i]->aux->btf = prog->aux->btf; - func[i]->aux->subprog_start = subprog_start + subprog_start_adjustment; + func[i]->aux->subprog_start = subprog_start; func[i]->aux->func_info = prog->aux->func_info; func[i]->aux->func_info_cnt = prog->aux->func_info_cnt; func[i]->aux->poke_tab = prog->aux->poke_tab; @@ -1113,15 +1150,7 @@ int bpf_jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->token = prog->aux->token; if (!i) func[i]->aux->exception_boundary = env->seen_exception; - - /* - * To properly pass the absolute subprog start to jit - * all instruction adjustments should be accumulated - */ - old_len = func[i]->len; func[i] = bpf_int_jit_compile(func[i]); - subprog_start_adjustment += func[i]->len - old_len; - if 
(!func[i]->jited) { err = -ENOTSUPP; goto out_free; @@ -1247,16 +1276,87 @@ out_free: } kfree(func); out_undo_insn: + bpf_prog_jit_attempt_done(prog); + return err; +} + +int bpf_jit_subprogs(struct bpf_verifier_env *env) +{ + int err, i; + bool blinded = false; + struct bpf_insn *insn; + struct bpf_prog *prog, *orig_prog; + struct bpf_insn_aux_data *orig_insn_aux; + u32 *orig_subprog_starts; + + if (env->subprog_cnt <= 1) + return 0; + + prog = orig_prog = env->prog; + if (bpf_prog_need_blind(prog)) { + orig_insn_aux = bpf_dup_insn_aux_data(env); + if (!orig_insn_aux) { + err = -ENOMEM; + goto out_cleanup; + } + orig_subprog_starts = bpf_dup_subprog_starts(env); + if (!orig_subprog_starts) { + vfree(orig_insn_aux); + err = -ENOMEM; + goto out_cleanup; + } + prog = bpf_jit_blind_constants(env, prog); + if (IS_ERR(prog)) { + err = -ENOMEM; + prog = orig_prog; + goto out_restore; + } + blinded = true; + } + + err = jit_subprogs(env); + if (err) + goto out_jit_err; + + if (blinded) { + bpf_jit_prog_release_other(prog, orig_prog); + kvfree(orig_subprog_starts); + vfree(orig_insn_aux); + } + + return 0; + +out_jit_err: + if (blinded) { + bpf_jit_prog_release_other(orig_prog, prog); + /* roll back to the clean original prog */ + prog = env->prog = orig_prog; + goto out_restore; + } else { + if (err != -EFAULT) { + /* + * We will fall back to interpreter mode when err is not -EFAULT, before + * that, insn->off and insn->imm should be restored to their original + * values since they were modified by jit_subprogs. 
+ */ + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (!bpf_pseudo_call(insn)) + continue; + insn->off = 0; + insn->imm = env->insn_aux_data[i].call_imm; + } + } + goto out_cleanup; + } + +out_restore: + bpf_restore_subprog_starts(env, orig_subprog_starts); + bpf_restore_insn_aux_data(env, orig_insn_aux); + kvfree(orig_subprog_starts); +out_cleanup: /* cleanup main prog to be interpreted */ prog->jit_requested = 0; prog->blinding_requested = 0; - for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (!bpf_pseudo_call(insn)) - continue; - insn->off = 0; - insn->imm = env->insn_aux_data[i].call_imm; - } - bpf_prog_jit_attempt_done(prog); return err; } From d9ef13f72711f2dad64cd4445472ded98fb6c954 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Thu, 16 Apr 2026 06:43:38 +0000 Subject: [PATCH 20/32] bpf: Pass bpf_verifier_env to JIT Pass bpf_verifier_env to bpf_int_jit_compile(). The follow-up patch will use env->insn_aux_data in the JIT stage to detect indirect jump targets. Since bpf_prog_select_runtime() can be called by cbpf and lib/test_bpf.c code without verifier, introduce helper __bpf_prog_select_runtime() to accept the env parameter. Remove the call to bpf_prog_select_runtime() in bpf_prog_load(), and switch to call __bpf_prog_select_runtime() in the verifier, with env variable passed. The original bpf_prog_select_runtime() is preserved for cbpf and lib/test_bpf.c, where env is NULL. Now all constants blinding calls are moved into the verifier, except the cbpf and lib/test_bpf.c cases. The instructions arrays are adjusted by bpf_patch_insn_data() function for normal cases, so there is no need to call adjust_insn_arrays() in bpf_jit_blind_constants(). Remove it. 
Reviewed-by: Anton Protopopov # v8 Reviewed-by: Emil Tsalapatis # v12 Acked-by: Hengqi Chen # v14 Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20260416064341.151802-3-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- arch/arc/net/bpf_jit_core.c | 2 +- arch/arm/net/bpf_jit_32.c | 2 +- arch/arm64/net/bpf_jit_comp.c | 2 +- arch/loongarch/net/bpf_jit.c | 2 +- arch/mips/net/bpf_jit_comp.c | 2 +- arch/parisc/net/bpf_jit_core.c | 2 +- arch/powerpc/net/bpf_jit_comp.c | 2 +- arch/riscv/net/bpf_jit_core.c | 2 +- arch/s390/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp_64.c | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- arch/x86/net/bpf_jit_comp32.c | 2 +- include/linux/filter.h | 17 ++++++- kernel/bpf/core.c | 86 ++++++++++++++++---------------- kernel/bpf/fixups.c | 10 ++-- kernel/bpf/syscall.c | 4 -- kernel/bpf/verifier.c | 14 +++--- 17 files changed, 84 insertions(+), 71 deletions(-) diff --git a/arch/arc/net/bpf_jit_core.c b/arch/arc/net/bpf_jit_core.c index 973ceae48675..639a2736f029 100644 --- a/arch/arc/net/bpf_jit_core.c +++ b/arch/arc/net/bpf_jit_core.c @@ -1400,7 +1400,7 @@ static struct bpf_prog *do_extra_pass(struct bpf_prog *prog) * (re)locations involved that their addresses are not known * during the first run. 
*/ -struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog) { vm_dump(prog); diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index e6b1bb2de627..1628b6fc70a4 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -2142,7 +2142,7 @@ bool bpf_jit_needs_zext(void) return true; } -struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog) { struct bpf_binary_header *header; struct jit_ctx ctx; diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index d310d1c35192..bd8757952507 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2000,7 +2000,7 @@ struct arm64_jit_data { struct jit_ctx ctx; }; -struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog) { int image_size, prog_size, extable_size, extable_align, extable_offset; struct bpf_binary_header *header; diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c index fcc8c0c29fb0..5149ce4cef7e 100644 --- a/arch/loongarch/net/bpf_jit.c +++ b/arch/loongarch/net/bpf_jit.c @@ -1920,7 +1920,7 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, return ret < 0 ? 
ret : ret * LOONGARCH_INSN_SIZE; } -struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog) { bool extra_pass = false; u8 *image_ptr, *ro_image_ptr; diff --git a/arch/mips/net/bpf_jit_comp.c b/arch/mips/net/bpf_jit_comp.c index d2b6c955f18e..6ee4abe6a1f7 100644 --- a/arch/mips/net/bpf_jit_comp.c +++ b/arch/mips/net/bpf_jit_comp.c @@ -909,7 +909,7 @@ bool bpf_jit_needs_zext(void) return true; } -struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; struct jit_context ctx; diff --git a/arch/parisc/net/bpf_jit_core.c b/arch/parisc/net/bpf_jit_core.c index 35dca372b5df..172770132440 100644 --- a/arch/parisc/net/bpf_jit_core.c +++ b/arch/parisc/net/bpf_jit_core.c @@ -41,7 +41,7 @@ bool bpf_jit_needs_zext(void) return true; } -struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog) { unsigned int prog_size = 0, extable_size = 0; bool extra_pass = false; diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 2bae4699e78f..53ab97ad6074 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -162,7 +162,7 @@ static void priv_stack_check_guard(void __percpu *priv_stack_ptr, int alloc_size } } -struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) +struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *fp) { u32 proglen; u32 alloclen; diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index 36f0aea8096d..4365d07aaf54 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -41,7 +41,7 @@ bool bpf_jit_needs_zext(void) return true; } -struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog 
*bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog) { unsigned int prog_size = 0, extable_size = 0; bool extra_pass = false; diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 2dfc279b1be2..94128fe6be23 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -2312,7 +2312,7 @@ static struct bpf_binary_header *bpf_jit_alloc(struct bpf_jit *jit, /* * Compile eBPF program "fp" */ -struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) +struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *fp) { struct bpf_binary_header *header; struct s390_jit_data *jit_data; diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index e83e29137566..2fa0e9375127 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1477,7 +1477,7 @@ struct sparc64_jit_data { struct jit_ctx ctx; }; -struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog) { struct sparc64_jit_data *jit_data; struct bpf_binary_header *header; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 77d00a8dec87..72d9a5faa230 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3713,7 +3713,7 @@ struct x64_jit_data { #define MAX_PASSES 20 #define PADDING_PASSES (MAX_PASSES - 5) -struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog) { struct bpf_binary_header *rw_header = NULL; struct bpf_binary_header *header = NULL; diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 5f259577614a..852baf2e4db4 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -2518,7 +2518,7 @@ bool bpf_jit_needs_zext(void) return true; } -struct bpf_prog *bpf_int_jit_compile(struct bpf_prog 
*prog) +struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; int proglen, oldproglen = 0; diff --git a/include/linux/filter.h b/include/linux/filter.h index 9fa4d4090093..1ec6d5ba64cc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1108,6 +1108,8 @@ sk_filter_reason(struct sock *sk, struct sk_buff *skb) return sk_filter_trim_cap(sk, skb, 1); } +struct bpf_prog *__bpf_prog_select_runtime(struct bpf_verifier_env *env, struct bpf_prog *fp, + int *err); struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); void bpf_prog_free(struct bpf_prog *fp); @@ -1153,7 +1155,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ (void *)__bpf_call_base) -struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); +struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); bool bpf_jit_inlines_helper_call(s32 imm); @@ -1188,12 +1190,25 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, #ifdef CONFIG_BPF_SYSCALL struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len); +struct bpf_insn_aux_data *bpf_dup_insn_aux_data(struct bpf_verifier_env *env); +void bpf_restore_insn_aux_data(struct bpf_verifier_env *env, + struct bpf_insn_aux_data *orig_insn_aux); #else static inline struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len) { return ERR_PTR(-ENOTSUPP); } + +static inline struct bpf_insn_aux_data *bpf_dup_insn_aux_data(struct bpf_verifier_env *env) +{ + return NULL; +} + +static inline void bpf_restore_insn_aux_data(struct bpf_verifier_env *env, + struct bpf_insn_aux_data *orig_insn_aux) +{ +} #endif /* CONFIG_BPF_SYSCALL */ int 
bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index fc9fb3c07866..79361aa11757 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1491,23 +1491,6 @@ void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) bpf_prog_clone_free(fp_other); } -static void adjust_insn_arrays(struct bpf_prog *prog, u32 off, u32 len) -{ -#ifdef CONFIG_BPF_SYSCALL - struct bpf_map *map; - int i; - - if (len <= 1) - return; - - for (i = 0; i < prog->aux->used_map_cnt; i++) { - map = prog->aux->used_maps[i]; - if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) - bpf_insn_array_adjust(map, off, len); - } -#endif -} - /* * Now this function is used only to blind the main prog and must be invoked only when * bpf_prog_need_blind() returns true. @@ -1580,13 +1563,6 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_verifier_env *env, struct bp if (env) env->prog = clone; - else - /* - * Instructions arrays must be updated using absolute xlated offsets. - * The arrays have already been adjusted by bpf_patch_insn_data() when - * env is not NULL. - */ - adjust_insn_arrays(clone, i, rewritten); /* Walk new program and skip insns we just inserted. */ insn = clone->insnsi + i + insn_delta; @@ -2555,47 +2531,55 @@ static bool bpf_prog_select_interpreter(struct bpf_prog *fp) return select_interpreter; } -static struct bpf_prog *bpf_prog_jit_compile(struct bpf_prog *prog) +static struct bpf_prog *bpf_prog_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog) { #ifdef CONFIG_BPF_JIT struct bpf_prog *orig_prog; + struct bpf_insn_aux_data *orig_insn_aux; if (!bpf_prog_need_blind(prog)) - return bpf_int_jit_compile(prog); + return bpf_int_jit_compile(env, prog); + + if (env) { + /* + * If env is not NULL, we are called from the end of bpf_check(), at this + * point, only insn_aux_data is used after failure, so it should be restored + * on failure. 
+ */ + orig_insn_aux = bpf_dup_insn_aux_data(env); + if (!orig_insn_aux) + return prog; + } orig_prog = prog; - prog = bpf_jit_blind_constants(NULL, prog); + prog = bpf_jit_blind_constants(env, prog); /* * If blinding was requested and we failed during blinding, we must fall * back to the interpreter. */ if (IS_ERR(prog)) - return orig_prog; + goto out_restore; - prog = bpf_int_jit_compile(prog); + prog = bpf_int_jit_compile(env, prog); if (prog->jited) { bpf_jit_prog_release_other(prog, orig_prog); + if (env) + vfree(orig_insn_aux); return prog; } bpf_jit_prog_release_other(orig_prog, prog); + +out_restore: prog = orig_prog; + if (env) + bpf_restore_insn_aux_data(env, orig_insn_aux); #endif return prog; } -/** - * bpf_prog_select_runtime - select exec runtime for BPF program - * @fp: bpf_prog populated with BPF program - * @err: pointer to error variable - * - * Try to JIT eBPF program, if JIT is not available, use interpreter. - * The BPF program will be executed via bpf_prog_run() function. - * - * Return: the &fp argument along with &err set to 0 for success or - * a negative errno code on failure - */ -struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) +struct bpf_prog *__bpf_prog_select_runtime(struct bpf_verifier_env *env, struct bpf_prog *fp, + int *err) { /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. @@ -2623,7 +2607,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) if (*err) return fp; - fp = bpf_prog_jit_compile(fp); + fp = bpf_prog_jit_compile(env, fp); bpf_prog_jit_attempt_done(fp); if (!fp->jited && jit_needed) { *err = -ENOTSUPP; @@ -2649,6 +2633,22 @@ finalize: return fp; } + +/** + * bpf_prog_select_runtime - select exec runtime for BPF program + * @fp: bpf_prog populated with BPF program + * @err: pointer to error variable + * + * Try to JIT eBPF program, if JIT is not available, use interpreter. 
+ * The BPF program will be executed via bpf_prog_run() function. + * + * Return: the &fp argument along with &err set to 0 for success or + * a negative errno code on failure + */ +struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) +{ + return __bpf_prog_select_runtime(NULL, fp, err); +} EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); static unsigned int __bpf_prog_ret1(const void *ctx, @@ -3136,7 +3136,7 @@ const struct bpf_func_proto bpf_tail_call_proto = { * It is encouraged to implement bpf_int_jit_compile() instead, so that * eBPF and implicitly also cBPF can get JITed! */ -struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog) { return prog; } diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index 721b830b5ef2..6c86980cc9e8 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -993,7 +993,7 @@ static void bpf_restore_subprog_starts(struct bpf_verifier_env *env, u32 *orig_s env->subprog_info[env->subprog_cnt].start = env->prog->len; } -static struct bpf_insn_aux_data *bpf_dup_insn_aux_data(struct bpf_verifier_env *env) +struct bpf_insn_aux_data *bpf_dup_insn_aux_data(struct bpf_verifier_env *env) { size_t size; void *new_aux; @@ -1005,8 +1005,8 @@ static struct bpf_insn_aux_data *bpf_dup_insn_aux_data(struct bpf_verifier_env * return new_aux; } -static void bpf_restore_insn_aux_data(struct bpf_verifier_env *env, - struct bpf_insn_aux_data *orig_insn_aux) +void bpf_restore_insn_aux_data(struct bpf_verifier_env *env, + struct bpf_insn_aux_data *orig_insn_aux) { /* the expanded elements are zero-filled, so no special handling is required */ vfree(env->insn_aux_data); @@ -1150,7 +1150,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->token = prog->aux->token; if (!i) func[i]->aux->exception_boundary = env->seen_exception; - func[i] = bpf_int_jit_compile(func[i]); + func[i] = bpf_int_jit_compile(env, 
func[i]); if (!func[i]->jited) { err = -ENOTSUPP; goto out_free; @@ -1194,7 +1194,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) } for (i = 0; i < env->subprog_cnt; i++) { old_bpf_func = func[i]->bpf_func; - tmp = bpf_int_jit_compile(func[i]); + tmp = bpf_int_jit_compile(env, func[i]); if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); err = -ENOTSUPP; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b73b25c63073..a3c0214ca934 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3083,10 +3083,6 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (err < 0) goto free_used_maps; - prog = bpf_prog_select_runtime(prog, &err); - if (err < 0) - goto free_used_maps; - err = bpf_prog_mark_insn_arrays_ready(prog); if (err < 0) goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9e4980128151..e804e0da3500 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20155,6 +20155,14 @@ skip_full_check: adjust_btf_func(env); + /* extension progs temporarily inherit the attach_type of their targets + for verification purposes, so set it back to zero before returning + */ + if (env->prog->type == BPF_PROG_TYPE_EXT) + env->prog->expected_attach_type = 0; + + env->prog = __bpf_prog_select_runtime(env, env->prog, &ret); + err_release_maps: if (ret) release_insn_arrays(env); @@ -20166,12 +20174,6 @@ err_release_maps: if (!env->prog->aux->used_btfs) release_btfs(env); - /* extension progs temporarily inherit the attach_type of their targets - for verification purposes, so set it back to zero before returning - */ - if (env->prog->type == BPF_PROG_TYPE_EXT) - env->prog->expected_attach_type = 0; - *prog = env->prog; module_put(env->attach_btf_mod); From 07ae6c130b46cf5e3e1a7dc5c1889fefe9adc2d3 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Thu, 16 Apr 2026 06:43:39 +0000 Subject: [PATCH 21/32] bpf: Add helper to 
detect indirect jump targets Introduce helper bpf_insn_is_indirect_target to check whether a BPF instruction is an indirect jump target. Since the verifier knows which instructions are indirect jump targets, add a new flag indirect_target to struct bpf_insn_aux_data to mark them. The verifier sets this flag when verifying an indirect jump target instruction, and the helper checks the flag to determine whether an instruction is an indirect jump target. Reviewed-by: Anton Protopopov #v8 Reviewed-by: Emil Tsalapatis #v12 Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20260416064341.151802-4-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/linux/bpf_verifier.h | 9 +++++---- kernel/bpf/core.c | 9 +++++++++ kernel/bpf/fixups.c | 12 ++++++++++++ kernel/bpf/verifier.c | 7 +++++++ 5 files changed, 35 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0136a108d083..b4b703c90ca9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1541,6 +1541,8 @@ bool bpf_has_frame_pointer(unsigned long ip); int bpf_jit_charge_modmem(u32 size); void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); +bool bpf_insn_is_indirect_target(const struct bpf_verifier_env *env, const struct bpf_prog *prog, + int insn_idx); #else static inline int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 53e8664cb566..b148f816f25b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -630,16 +630,17 @@ struct bpf_insn_aux_data { /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ - bool jmp_point; - bool prune_point; + u32 jmp_point:1; + u32 prune_point:1; /* ensure we check state equivalence and save state checkpoint and * this instruction, regardless of any heuristics */ - bool 
force_checkpoint; + u32 force_checkpoint:1; /* true if instruction is a call to a helper function that * accepts callback function as a parameter. */ - bool calls_callback; + u32 calls_callback:1; + u32 indirect_target:1; /* if it is an indirect jump target */ /* * CFG strongly connected component this instruction belongs to, * zero if it is a singleton SCC. diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 79361aa11757..8b018ff48875 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1573,6 +1573,15 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_verifier_env *env, struct bp clone->blinded = 1; return clone; } + +bool bpf_insn_is_indirect_target(const struct bpf_verifier_env *env, const struct bpf_prog *prog, + int insn_idx) +{ + if (!env) + return false; + insn_idx += prog->aux->subprog_start; + return env->insn_aux_data[insn_idx].indirect_target; +} #endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index 6c86980cc9e8..fba9e8c00878 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -183,6 +183,18 @@ static void adjust_insn_aux_data(struct bpf_verifier_env *env, data[i].seen = old_seen; data[i].zext_dst = insn_has_def32(insn + i); } + + /* + * The indirect_target flag of the original instruction was moved to the last of the + * new instructions by the above memmove and memset, but the indirect jump target is + * actually the first instruction, so move it back. This also matches with the behavior + * of bpf_insn_array_adjust(), which preserves xlated_off to point to the first new + * instruction. 
+ */ + if (data[off + cnt - 1].indirect_target) { + data[off].indirect_target = 1; + data[off + cnt - 1].indirect_target = 0; + } } static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e804e0da3500..1e36b9e91277 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3497,6 +3497,11 @@ static int insn_stack_access_flags(int frameno, int spi) return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno; } +static void mark_indirect_target(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].indirect_target = true; +} + #define LR_FRAMENO_BITS 3 #define LR_SPI_BITS 6 #define LR_ENTRY_BITS (LR_SPI_BITS + LR_FRAMENO_BITS + 1) @@ -17545,12 +17550,14 @@ static int check_indirect_jump(struct bpf_verifier_env *env, struct bpf_insn *in } for (i = 0; i < n - 1; i++) { + mark_indirect_target(env, env->gotox_tmp_buf->items[i]); other_branch = push_stack(env, env->gotox_tmp_buf->items[i], env->insn_idx, env->cur_state->speculative); if (IS_ERR(other_branch)) return PTR_ERR(other_branch); } env->insn_idx = env->gotox_tmp_buf->items[n-1]; + mark_indirect_target(env, env->insn_idx); return INSN_IDX_UPDATED; } From 9a0e89dcc9be8e0ba20aeb81c330a6352261667e Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Thu, 16 Apr 2026 06:43:40 +0000 Subject: [PATCH 22/32] bpf, x86: Emit ENDBR for indirect jump targets On CPUs that support CET/IBT, the indirect jump selftest triggers a kernel panic because the indirect jump targets lack ENDBR instructions. To fix it, emit an ENDBR instruction to each indirect jump target. Since the ENDBR instruction shifts the position of original jited instructions, fix the instruction address calculation wherever the addresses are used. For reference, below is a sample panic log. Missing ENDBR: bpf_prog_2e5f1c71c13ac3e0_big_jump_table+0x97/0xe1 ------------[ cut here ]------------ kernel BUG at arch/x86/kernel/cet.c:133! 
Oops: invalid opcode: 0000 [#1] SMP NOPTI ... ? 0xffffffffc00fb258 ? bpf_prog_2e5f1c71c13ac3e0_big_jump_table+0x97/0xe1 bpf_prog_test_run_syscall+0x110/0x2f0 ? fdget+0xba/0xe0 __sys_bpf+0xe4b/0x2590 ? __kmalloc_node_track_caller_noprof+0x1c7/0x680 ? bpf_prog_test_run_syscall+0x215/0x2f0 __x64_sys_bpf+0x21/0x30 do_syscall_64+0x85/0x620 ? bpf_prog_test_run_syscall+0x1e2/0x2f0 Fixes: 493d9e0d6083 ("bpf, x86: add support for indirect jumps") Reviewed-by: Anton Protopopov # v8 Reviewed-by: Emil Tsalapatis # v12 Acked-by: Leon Hwang Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20260416064341.151802-5-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 72d9a5faa230..ea9e707e8abf 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -58,8 +58,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) #define EMIT_ENDBR() EMIT(gen_endbr(), 4) #define EMIT_ENDBR_POISON() EMIT(gen_endbr_poison(), 4) #else -#define EMIT_ENDBR() -#define EMIT_ENDBR_POISON() +#define EMIT_ENDBR() do { } while (0) +#define EMIT_ENDBR_POISON() do { } while (0) #endif static bool is_imm8(int value) @@ -1649,8 +1649,8 @@ static int emit_spectre_bhb_barrier(u8 **pprog, u8 *ip, return 0; } -static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image, - int oldproglen, struct jit_context *ctx, bool jmp_padding) +static int do_jit(struct bpf_verifier_env *env, struct bpf_prog *bpf_prog, int *addrs, u8 *image, + u8 *rw_image, int oldproglen, struct jit_context *ctx, bool jmp_padding) { bool tail_call_reachable = bpf_prog->aux->tail_call_reachable; struct bpf_insn *insn = bpf_prog->insnsi; @@ -1663,7 +1663,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image void __percpu *priv_stack_ptr; int i, excnt = 0; int ilen, 
proglen = 0; - u8 *prog = temp; + u8 *ip, *prog = temp; u32 stack_depth; int err; @@ -1734,6 +1734,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image dst_reg = X86_REG_R9; } + if (bpf_insn_is_indirect_target(env, bpf_prog, i - 1)) + EMIT_ENDBR(); + + ip = image + addrs[i - 1] + (prog - temp); + switch (insn->code) { /* ALU */ case BPF_ALU | BPF_ADD | BPF_X: @@ -2440,8 +2445,6 @@ populate_extable: /* call */ case BPF_JMP | BPF_CALL: { - u8 *ip = image + addrs[i - 1]; - func = (u8 *) __bpf_call_base + imm32; if (src_reg == BPF_PSEUDO_CALL && tail_call_reachable) { LOAD_TAIL_CALL_CNT_PTR(stack_depth); @@ -2465,7 +2468,8 @@ populate_extable: if (imm32) emit_bpf_tail_call_direct(bpf_prog, &bpf_prog->aux->poke_tab[imm32 - 1], - &prog, image + addrs[i - 1], + &prog, + ip, callee_regs_used, stack_depth, ctx); @@ -2474,7 +2478,7 @@ populate_extable: &prog, callee_regs_used, stack_depth, - image + addrs[i - 1], + ip, ctx); break; @@ -2639,7 +2643,7 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ break; case BPF_JMP | BPF_JA | BPF_X: - emit_indirect_jump(&prog, insn->dst_reg, image + addrs[i - 1]); + emit_indirect_jump(&prog, insn->dst_reg, ip); break; case BPF_JMP | BPF_JA: case BPF_JMP32 | BPF_JA: @@ -2729,8 +2733,6 @@ emit_jmp: ctx->cleanup_addr = proglen; if (bpf_prog_was_classic(bpf_prog) && !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) { - u8 *ip = image + addrs[i - 1]; - if (emit_spectre_bhb_barrier(&prog, ip, bpf_prog)) return -EINVAL; } @@ -3791,7 +3793,7 @@ skip_init_addrs: for (pass = 0; pass < MAX_PASSES || image; pass++) { if (!padding && pass >= PADDING_PASSES) padding = true; - proglen = do_jit(prog, addrs, image, rw_image, oldproglen, &ctx, padding); + proglen = do_jit(env, prog, addrs, image, rw_image, oldproglen, &ctx, padding); if (proglen <= 0) { out_image: image = NULL; From f6606a44bc438ec5f1d450d0153878e80e79ff80 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Thu, 16 Apr 2026 06:43:41 +0000 Subject: [PATCH 
23/32] bpf, arm64: Emit BTI for indirect jump target On CPUs that support BTI, the indirect jump selftest triggers a kernel panic because there are no BTI instructions at the indirect jump targets. Fix it by emitting a BTI instruction for each indirect jump target. For reference, below is a sample panic log. Internal error: Oops - BTI: 0000000036000003 [#1] SMP ... Call trace: bpf_prog_2e5f1c71c13ac3e0_big_jump_table+0x54/0xf8 (P) bpf_prog_run_pin_on_cpu+0x140/0x468 bpf_prog_test_run_syscall+0x280/0x3b8 bpf_prog_test_run+0x22c/0x2c0 Fixes: f4a66cf1cb14 ("bpf: arm64: Add support for indirect jumps") Reviewed-by: Anton Protopopov # v8 Reviewed-by: Emil Tsalapatis # v12 Acked-by: Leon Hwang Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20260416064341.151802-6-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index bd8757952507..0816c40fc7af 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1197,8 +1197,8 @@ static int add_exception_handler(const struct bpf_insn *insn, * >0 - successfully JITed a 16-byte eBPF instruction. * <0 - failed to JIT. 
*/ -static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, - bool extra_pass) +static int build_insn(const struct bpf_verifier_env *env, const struct bpf_insn *insn, + struct jit_ctx *ctx, bool extra_pass) { const u8 code = insn->code; u8 dst = bpf2a64[insn->dst_reg]; @@ -1223,6 +1223,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, int ret; bool sign_extend; + if (bpf_insn_is_indirect_target(env, ctx->prog, i)) + emit_bti(A64_BTI_J, ctx); + switch (code) { /* dst = src */ case BPF_ALU | BPF_MOV | BPF_X: @@ -1898,7 +1901,7 @@ emit_cond_jmp: return 0; } -static int build_body(struct jit_ctx *ctx, bool extra_pass) +static int build_body(struct bpf_verifier_env *env, struct jit_ctx *ctx, bool extra_pass) { const struct bpf_prog *prog = ctx->prog; int i; @@ -1917,7 +1920,7 @@ static int build_body(struct jit_ctx *ctx, bool extra_pass) int ret; ctx->offset[i] = ctx->idx; - ret = build_insn(insn, ctx, extra_pass); + ret = build_insn(env, insn, ctx, extra_pass); if (ret > 0) { i++; ctx->offset[i] = ctx->idx; @@ -2073,7 +2076,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_pr if (build_prologue(&ctx, was_classic)) goto out_off; - if (build_body(&ctx, extra_pass)) + if (build_body(env, &ctx, extra_pass)) goto out_off; ctx.epilogue_offset = ctx.idx; @@ -2121,7 +2124,7 @@ skip_init_ctx: /* Dont write body instructions to memory for now */ ctx.write = false; - if (build_body(&ctx, extra_pass)) + if (build_body(env, &ctx, extra_pass)) goto out_free_hdr; ctx.epilogue_offset = ctx.idx; @@ -2130,7 +2133,7 @@ skip_init_ctx: ctx.write = true; /* Pass 3: Adjust jump offset and write final image */ - if (build_body(&ctx, extra_pass) || + if (build_body(env, &ctx, extra_pass) || WARN_ON_ONCE(ctx.idx != ctx.epilogue_offset)) goto out_free_hdr; From e5f635edd393aeaa7cad9e42831d397e6e2e1eed Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Apr 2026 14:27:19 +0200 Subject: [PATCH 24/32] bpf: Fix 
precedence bug in convert_bpf_ld_abs alignment check Fix an operator precedence issue in convert_bpf_ld_abs() where the expression offset + ip_align % size evaluates as offset + (ip_align % size) due to % having higher precedence than +. That latter evaluation does not make any sense. The intended check is (offset + ip_align) % size == 0 to verify that the packet load offset is properly aligned for direct access. With NET_IP_ALIGN == 2, the bug causes the inline fast-path for direct packet loads to almost never be taken on !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS platforms. This forces nearly all cBPF BPF_LD_ABS packet loads through the bpf_skb_load_helper slow path on the affected archs. Fixes: e0cea7ce988c ("bpf: implement ld_abs/ld_ind in native bpf") Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260416122719.661033-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index fcfcb72663ca..5fa9189eb772 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -503,7 +503,7 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) ((unaligned_ok && offset >= 0) || (!unaligned_ok && offset >= 0 && offset + ip_align >= 0 && - offset + ip_align % size == 0))) { + (offset + ip_align) % size == 0))) { bool ldx_off_ok = offset <= S16_MAX; *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); From 4d0a375887ab4d49e4da1ff10f9606cab8f7c3ad Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Thu, 16 Apr 2026 11:08:07 -0700 Subject: [PATCH 25/32] bpf: Fix NULL deref in map_kptr_match_type for scalar regs Commit ab6c637ad027 ("bpf: Fix a bpf_kptr_xchg() issue with local kptr") refactored map_kptr_match_type() to branch on btf_is_kernel() before checking base_type(). A scalar register stored into a kptr slot has no btf, so the btf_is_kernel(reg->btf) call dereferences NULL. 
Move the base_type() != PTR_TO_BTF_ID guard before any reg->btf access. Fixes: ab6c637ad027 ("bpf: Fix a bpf_kptr_xchg() issue with local kptr") Reported-by: Hiker Cl Closes: https://bugzilla.kernel.org/show_bug.cgi?id=221372 Signed-off-by: Mykyta Yatsenko Acked-by: Paul Chaignon Link: https://lore.kernel.org/r/20260416-kptr_crash-v1-1-5589356584b4@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1e36b9e91277..69d75515ed3f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4549,6 +4549,9 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, int perm_flags; const char *reg_name = ""; + if (base_type(reg->type) != PTR_TO_BTF_ID) + goto bad_type; + if (btf_is_kernel(reg->btf)) { perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU; @@ -4561,7 +4564,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, perm_flags |= MEM_PERCPU; } - if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) + if (type_flag(reg->type) & ~perm_flags) goto bad_type; /* We need to verify reg->type and reg->btf, before accessing reg->btf */ From fcd11ff8bd0e526bdd5f43f534ccf7c4e67245ad Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Thu, 16 Apr 2026 11:08:08 -0700 Subject: [PATCH 26/32] selftests/bpf: Reject scalar store into kptr slot Verify that the verifier rejects a direct scalar write to a kptr map value slot without crashing. 
Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260416-kptr_crash-v1-2-5589356584b4@meta.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/map_kptr_fail.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c index 6443b320c732..ee053b24e6ca 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c @@ -385,4 +385,19 @@ int kptr_xchg_possibly_null(struct __sk_buff *ctx) return 0; } +SEC("?tc") +__failure __msg("invalid kptr access, R") +int reject_scalar_store_to_kptr(struct __sk_buff *ctx) +{ + struct map_value *v; + int key = 0; + + v = bpf_map_lookup_elem(&array_map, &key); + if (!v) + return 0; + + *(volatile u64 *)&v->unref_ptr = 0xBADC0DE; + return 0; +} + char _license[] SEC("license") = "GPL"; From b960430ea8862ef37ce53c8bf74a8dc79d3f2404 Mon Sep 17 00:00:00 2001 From: Yihan Ding Date: Thu, 16 Apr 2026 20:01:41 +0800 Subject: [PATCH 27/32] bpf: allow UTF-8 literals in bpf_bprintf_prepare() bpf_bprintf_prepare() only needs ASCII parsing for conversion specifiers. Plain text can safely carry bytes >= 0x80, so allow UTF-8 literals outside '%' sequences while keeping ASCII control bytes rejected and format specifiers ASCII-only. This keeps existing parsing rules for format directives unchanged, while allowing helpers such as bpf_trace_printk() to emit UTF-8 literal text. Update test_snprintf_negative() in the same commit so selftests keep matching the new plain-text vs format-specifier split during bisection. 
Fixes: 48cac3f4a96d ("bpf: Implement formatted output helpers with bstr_printf") Signed-off-by: Yihan Ding Acked-by: Paul Chaignon Link: https://lore.kernel.org/r/20260416120142.1420646-2-dingyihan@uniontech.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 17 ++++++++++++++++- .../testing/selftests/bpf/prog_tests/snprintf.c | 3 ++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bb95e287b0dc..2bb60200c266 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -845,7 +845,13 @@ int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args, data->buf = buffers->buf; for (i = 0; i < fmt_size; i++) { - if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { + unsigned char c = fmt[i]; + + /* + * Permit bytes >= 0x80 in plain text so UTF-8 literals can pass + * through unchanged, while still rejecting ASCII control bytes. + */ + if (isascii(c) && !isprint(c) && !isspace(c)) { err = -EINVAL; goto out; } @@ -867,6 +873,15 @@ int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args, * always access fmt[i + 1], in the worst case it will be a 0 */ i++; + c = fmt[i]; + /* + * The format parser below only understands ASCII conversion + * specifiers and modifiers, so reject non-ASCII after '%'. 
+ */ + if (!isascii(c)) { + err = -EINVAL; + goto out; + } /* skip optional "[0 +-][num]" width formatting field */ while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || diff --git a/tools/testing/selftests/bpf/prog_tests/snprintf.c b/tools/testing/selftests/bpf/prog_tests/snprintf.c index 594441acb707..4e4a82d54f79 100644 --- a/tools/testing/selftests/bpf/prog_tests/snprintf.c +++ b/tools/testing/selftests/bpf/prog_tests/snprintf.c @@ -114,7 +114,8 @@ static void test_snprintf_negative(void) ASSERT_ERR(load_single_snprintf("%--------"), "invalid specifier 5"); ASSERT_ERR(load_single_snprintf("%lc"), "invalid specifier 6"); ASSERT_ERR(load_single_snprintf("%llc"), "invalid specifier 7"); - ASSERT_ERR(load_single_snprintf("\x80"), "non ascii character"); + ASSERT_OK(load_single_snprintf("\x80"), "non ascii plain text"); + ASSERT_ERR(load_single_snprintf("%\x80"), "non ascii in specifier"); ASSERT_ERR(load_single_snprintf("\x1"), "non printable character"); ASSERT_ERR(load_single_snprintf("%p%"), "invalid specifier 8"); ASSERT_ERR(load_single_snprintf("%s%"), "invalid specifier 9"); From 4198ff31edb193cb11955338ee923d9f842a4fce Mon Sep 17 00:00:00 2001 From: Yihan Ding Date: Thu, 16 Apr 2026 20:01:42 +0800 Subject: [PATCH 28/32] selftests/bpf: cover UTF-8 trace_printk output Extend trace_printk coverage to verify that UTF-8 literal text is emitted successfully and that '%' parsing still rejects non-ASCII bytes once format parsing starts. Use an explicitly invalid format string for the negative case so the ASCII-only parser expectation is visible from the test code itself. 
Signed-off-by: Yihan Ding Acked-by: Paul Chaignon Link: https://lore.kernel.org/r/20260416120142.1420646-3-dingyihan@uniontech.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/trace_printk.c | 28 +++++++++++++++---- .../selftests/bpf/progs/trace_printk.c | 10 +++++++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/trace_printk.c b/tools/testing/selftests/bpf/prog_tests/trace_printk.c index e56e88596d64..a5a8104c1ddd 100644 --- a/tools/testing/selftests/bpf/prog_tests/trace_printk.c +++ b/tools/testing/selftests/bpf/prog_tests/trace_printk.c @@ -6,18 +6,21 @@ #include "trace_printk.lskel.h" #define SEARCHMSG "testing,testing" +#define SEARCHMSG_UTF8 "中文,测试" static void trace_pipe_cb(const char *str, void *data) { if (strstr(str, SEARCHMSG) != NULL) - (*(int *)data)++; + ((int *)data)[0]++; + if (strstr(str, SEARCHMSG_UTF8)) + ((int *)data)[1]++; } void serial_test_trace_printk(void) { struct trace_printk_lskel__bss *bss; struct trace_printk_lskel *skel; - int err = 0, found = 0; + int err = 0, found[2] = {}; skel = trace_printk_lskel__open(); if (!ASSERT_OK_PTR(skel, "trace_printk__open")) @@ -46,11 +49,24 @@ void serial_test_trace_printk(void) if (!ASSERT_GT(bss->trace_printk_ret, 0, "bss->trace_printk_ret")) goto cleanup; - /* verify our search string is in the trace buffer */ - ASSERT_OK(read_trace_pipe_iter(trace_pipe_cb, &found, 1000), - "read_trace_pipe_iter"); + if (!ASSERT_GT(bss->trace_printk_utf8_ran, 0, "bss->trace_printk_utf8_ran")) + goto cleanup; - if (!ASSERT_EQ(found, bss->trace_printk_ran, "found")) + if (!ASSERT_GT(bss->trace_printk_utf8_ret, 0, "bss->trace_printk_utf8_ret")) + goto cleanup; + + if (!ASSERT_LT(bss->trace_printk_invalid_spec_ret, 0, + "bss->trace_printk_invalid_spec_ret")) + goto cleanup; + + /* verify our search strings are in the trace buffer */ + ASSERT_OK(read_trace_pipe_iter(trace_pipe_cb, found, 1000), + "read_trace_pipe_iter"); + + if 
(!ASSERT_EQ(found[0], bss->trace_printk_ran, "found")) + goto cleanup; + + if (!ASSERT_EQ(found[1], bss->trace_printk_utf8_ran, "found_utf8")) goto cleanup; cleanup: diff --git a/tools/testing/selftests/bpf/progs/trace_printk.c b/tools/testing/selftests/bpf/progs/trace_printk.c index 6695478c2b25..f4c538ec3ebd 100644 --- a/tools/testing/selftests/bpf/progs/trace_printk.c +++ b/tools/testing/selftests/bpf/progs/trace_printk.c @@ -10,13 +10,23 @@ char _license[] SEC("license") = "GPL"; int trace_printk_ret = 0; int trace_printk_ran = 0; +int trace_printk_invalid_spec_ret = 0; +int trace_printk_utf8_ret = 0; +int trace_printk_utf8_ran = 0; const char fmt[] = "Testing,testing %d\n"; +static const char utf8_fmt[] = "中文,测试 %d\n"; +/* Non-ASCII bytes after '%' must still be rejected. */ +static const char invalid_spec_fmt[] = "%\x80\n"; SEC("fentry/" SYS_PREFIX "sys_nanosleep") int sys_enter(void *ctx) { trace_printk_ret = bpf_trace_printk(fmt, sizeof(fmt), ++trace_printk_ran); + trace_printk_utf8_ret = bpf_trace_printk(utf8_fmt, sizeof(utf8_fmt), + ++trace_printk_utf8_ran); + trace_printk_invalid_spec_ret = bpf_trace_printk(invalid_spec_fmt, + sizeof(invalid_spec_fmt)); return 0; } From 380044c40b1636a72fd8f188b5806be6ae564279 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 16 Apr 2026 12:00:34 +0200 Subject: [PATCH 29/32] libbpf: Prevent double close and leak of btf objects Sashiko found possible double close of btf object fd [1], which happens when strdup in load_module_btfs fails at which point the obj->btf_module_cnt is already incremented. The error path close btf fd and so does later cleanup code in bpf_object_post_load_cleanup function. Also libbpf_ensure_mem failure leaves btf object not assigned and it's leaked. Replacing the err_out label with break to make the error path less confusing as suggested by Alan. Incrementing obj->btf_module_cnt only if there's no failure and releasing btf object in error path. 
Fixes: 91abb4a6d79d ("libbpf: Support attachment of BPF tracing programs to kernel modules") [1] https://sashiko.dev/#/patchset/20260324081846.2334094-1-jolsa%40kernel.org Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260416100034.1610852-1-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 8b0c3246097f..3a80a018fc7d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5852,11 +5852,12 @@ static int load_module_btfs(struct bpf_object *obj) info.name = ptr_to_u64(name); info.name_len = sizeof(name); + btf = NULL; err = bpf_btf_get_info_by_fd(fd, &info, &len); if (err) { err = -errno; pr_warn("failed to get BTF object #%d info: %s\n", id, errstr(err)); - goto err_out; + break; } /* ignore non-module BTFs */ @@ -5870,15 +5871,15 @@ static int load_module_btfs(struct bpf_object *obj) if (err) { pr_warn("failed to load module [%s]'s BTF object #%d: %s\n", name, id, errstr(err)); - goto err_out; + break; } err = libbpf_ensure_mem((void **)&obj->btf_modules, &obj->btf_module_cap, sizeof(*obj->btf_modules), obj->btf_module_cnt + 1); if (err) - goto err_out; + break; - mod_btf = &obj->btf_modules[obj->btf_module_cnt++]; + mod_btf = &obj->btf_modules[obj->btf_module_cnt]; mod_btf->btf = btf; mod_btf->id = id; @@ -5886,16 +5887,16 @@ static int load_module_btfs(struct bpf_object *obj) mod_btf->name = strdup(name); if (!mod_btf->name) { err = -ENOMEM; - goto err_out; + break; } - continue; - -err_out: - close(fd); - return err; + obj->btf_module_cnt++; } - return 0; + if (err) { + btf__free(btf); + close(fd); + } + return err; } static struct bpf_core_cand_list * From 2845989f2ebaf7848e4eccf9a779daf3156ea0a5 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 17 Apr 2026 08:21:33 -0700 Subject: [PATCH 30/32] bpf: Validate node_id in arena_alloc_pages() arena_alloc_pages() 
accepts a plain int node_id and forwards it through the entire allocation chain without any bounds checking. Validate node_id before passing it down the allocation chain in arena_alloc_pages(). Fixes: 317460317a02 ("bpf: Introduce bpf_arena.") Signed-off-by: Puranjay Mohan Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260417152135.1383754-1-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 9c68c9b0b24a..523c3a61063b 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -562,6 +562,10 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt u32 uaddr32; int ret, i; + if (node_id != NUMA_NO_NODE && + ((unsigned int)node_id >= nr_node_ids || !node_online(node_id))) + return 0; + if (page_cnt > page_cnt_max) return 0; From f75aeb2de89127052975b1bfade88ac87f164f4a Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Fri, 17 Apr 2026 10:49:00 -0700 Subject: [PATCH 31/32] bpf: Dissociate struct_ops program with map if map_update fails Currently, when bpf_struct_ops_map_update_elem() fails, the programs' st_ops_assoc will remain set. They may become dangling pointers if the map is freed later, but they will never be dereferenced since the struct_ops attachment did not succeed. However, if one of the programs is subsequently attached as part of another struct_ops map, its st_ops_assoc will be poisoned even though its old st_ops_assoc was stale from a failed attachment. Fix the spurious poisoned st_ops_assoc by dissociating struct_ops programs with a map if the attachment fails. Move bpf_prog_assoc_struct_ops() to after *plink++ to make sure bpf_prog_disassoc_struct_ops() will not miss a program when iterating st_map->links. Note that, dissociating a program from a map requires some attention as it must not reset a poisoned st_ops_assoc or a st_ops_assoc pointing to another map. 
The former is already guarded in bpf_prog_disassoc_struct_ops(). The latter also will not happen since st_ops_assoc of programs in st_map->links is set by bpf_prog_assoc_struct_ops(), which can only be poisoned or pointing to the current map. Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260417174900.2895486-1-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_struct_ops.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 05b366b821c3..521cb9d7e8c7 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -811,9 +811,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto reset_unlock; } - /* Poison pointer on error instead of return for backward compatibility */ - bpf_prog_assoc_struct_ops(prog, &st_map->map); - link = kzalloc_obj(*link, GFP_USER); if (!link) { bpf_prog_put(prog); @@ -824,6 +821,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, &bpf_struct_ops_link_lops, prog, prog->expected_attach_type); *plink++ = &link->link; + /* Poison pointer on error instead of return for backward compatibility */ + bpf_prog_assoc_struct_ops(prog, &st_map->map); + ksym = kzalloc_obj(*ksym, GFP_USER); if (!ksym) { err = -ENOMEM; @@ -906,6 +906,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, reset_unlock: bpf_struct_ops_map_free_ksyms(st_map); bpf_struct_ops_map_free_image(st_map); + bpf_struct_ops_map_dissoc_progs(st_map); bpf_struct_ops_map_put_progs(st_map); memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); From e1d486445af3c392628532229f7ce5f5cf7891b6 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 17 Apr 2026 07:33:52 -0700 Subject: [PATCH 32/32] bpf, arm32: Reject BPF-to-BPF calls and callbacks in the JIT The ARM32 BPF JIT does not support BPF-to-BPF function calls (BPF_PSEUDO_CALL) or callbacks 
(BPF_PSEUDO_FUNC), but it does not reject them either. When a program with subprograms is loaded (e.g. libxdp's XDP dispatcher uses __noinline__ subprograms, or any program using callbacks like bpf_loop or bpf_for_each_map_elem), the verifier invokes bpf_jit_subprogs() which calls bpf_int_jit_compile() for each subprogram. For BPF_PSEUDO_CALL, since ARM32 does not reject it, the JIT silently emits code using the wrong address computation: func = __bpf_call_base + imm where imm is a pc-relative subprogram offset, producing a bogus function pointer. For BPF_PSEUDO_FUNC, the ldimm64 handler ignores src_reg and loads the immediate as a normal 64-bit value without error. In both cases, build_body() reports success and a JIT image is allocated. ARM32 lacks the jit_data/extra_pass mechanism needed for the second JIT pass in bpf_jit_subprogs(). On the second pass, bpf_int_jit_compile() performs a full fresh compilation, allocating a new JIT binary and overwriting prog->bpf_func. The first allocation is never freed. bpf_jit_subprogs() then detects the function pointer changed and aborts with -ENOTSUPP, but the original JIT binary has already been leaked. Each program load/unload cycle leaks one JIT binary allocation, as reported by kmemleak: unreferenced object 0xbf0a1000 (size 4096): backtrace: bpf_jit_binary_alloc+0x64/0xfc bpf_int_jit_compile+0x14c/0x348 bpf_jit_subprogs+0x4fc/0xa60 Fix this by rejecting both BPF_PSEUDO_CALL in the BPF_CALL handler and BPF_PSEUDO_FUNC in the BPF_LD_IMM64 handler, falling through to the existing 'notyet' path. This causes build_body() to fail before any JIT binary is allocated, so bpf_int_jit_compile() returns the original program unjitted. bpf_jit_subprogs() then sees !prog->jited and cleanly falls back to the interpreter with no leak. 
Acked-by: Daniel Borkmann Fixes: 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs") Reported-by: Jonas Rebmann Closes: https://lore.kernel.org/bpf/b63e9174-7a3d-4e22-8294-16df07a4af89@pengutronix.de Tested-by: Jonas Rebmann Signed-off-by: Puranjay Mohan Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260417143353.838911-1-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm/net/bpf_jit_32.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 1628b6fc70a4..9ede81afbc50 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1852,6 +1852,9 @@ exit: { u64 val = (u32)imm | (u64)insn[1].imm << 32; + if (insn->src_reg == BPF_PSEUDO_FUNC) + goto notyet; + emit_a32_mov_i64(dst, val, ctx); return 1; @@ -2055,6 +2058,9 @@ go_jmp: const s8 *r5 = bpf2a32[BPF_REG_5]; const u32 func = (u32)__bpf_call_base + (u32)imm; + if (insn->src_reg == BPF_PSEUDO_CALL) + goto notyet; + emit_a32_mov_r64(true, r0, r1, ctx); emit_a32_mov_r64(true, r1, r2, ctx); emit_push_r64(r5, ctx);