Files
linux/tools/testing/selftests/sched_ext/dequeue.bpf.c
Andrea Righi 12b49dd15e selftests/sched_ext: Update scx_bpf_dsq_move_to_local() in kselftests
After commit 860683763e ("sched_ext: Add enq_flags to
scx_bpf_dsq_move_to_local()") some of the kselftests are failing to
build:

 exit.bpf.c:44:34: error: too few arguments provided to function-like macro invocation
    44 |         scx_bpf_dsq_move_to_local(DSQ_ID);

Update the kselftests by adding the new argument to
scx_bpf_dsq_move_to_local().

Fixes: 860683763e ("sched_ext: Add enq_flags to scx_bpf_dsq_move_to_local()")
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2026-03-13 22:43:52 -10:00

390 lines
11 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* A scheduler that validates ops.dequeue() is called correctly:
* - Tasks dispatched to terminal DSQs (local, global) bypass the BPF
* scheduler entirely: no ops.dequeue() should be called
* - Tasks dispatched to user DSQs from ops.enqueue() enter BPF custody:
* ops.dequeue() must be called when they leave custody
* - Every ops.enqueue() dispatch to non-terminal DSQs is followed by
* exactly one ops.dequeue() (validate 1:1 pairing and state machine)
*
* Copyright (c) 2026 NVIDIA Corporation.
*/
#include <scx/common.bpf.h>
#define SHARED_DSQ 0	/* ID of the user DSQ used by scenarios 2 and 5 */

/*
 * BPF internal queue.
 *
 * Tasks are stored here and consumed from ops.dispatch(), validating that
 * tasks on BPF internal structures still get ops.dequeue() when they
 * leave.
 */
struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 32768);
	__type(value, s32);	/* task PIDs pushed by ops.enqueue() (scenario 6) */
} global_queue SEC(".maps");
char _license[] SEC("license") = "GPL";

/* Exit info recorded in ops.exit() for the userspace test harness. */
UEI_DEFINE(uei);

/*
 * Counters to track the lifecycle of tasks:
 * - enqueue_cnt: Number of times ops.enqueue() was called
 * - dequeue_cnt: Number of times ops.dequeue() was called (any type)
 * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag)
 * - change_dequeue_cnt: Number of property change dequeues
 * - bpf_queue_full: Number of times the BPF internal queue was full
 */
u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt, bpf_queue_full;
/*
 * Test scenarios:
 * 0) Dispatch to local DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
 *    scheduler, no dequeue callbacks)
 * 1) Dispatch to global DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
 *    scheduler, no dequeue callbacks)
 * 2) Dispatch to shared user DSQ from ops.select_cpu() (enters BPF scheduler,
 *    dequeue callbacks expected)
 * 3) Dispatch to local DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
 *    scheduler, no dequeue callbacks)
 * 4) Dispatch to global DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
 *    scheduler, no dequeue callbacks)
 * 5) Dispatch to shared user DSQ from ops.enqueue() (enters BPF scheduler,
 *    dequeue callbacks expected)
 * 6) BPF internal queue from ops.enqueue(): store task PIDs in ops.enqueue(),
 *    consume in ops.dispatch() and dispatch to local DSQ (validates dequeue
 *    for tasks stored in internal BPF data structures)
 */
u32 test_scenario;	/* selected scenario; presumably set by the userspace harness before attach — confirm */
/*
 * Per-task state to track lifecycle and validate workflow semantics.
 * State transitions:
 * NONE -> ENQUEUED (on enqueue)
 * NONE -> DISPATCHED (on direct dispatch to terminal DSQ)
 * ENQUEUED -> DISPATCHED (on dispatch dequeue)
 * DISPATCHED -> NONE (on property change dequeue or re-enqueue)
 * ENQUEUED -> NONE (on property change dequeue before dispatch)
 */
enum task_state {
	TASK_NONE = 0,		/* outside BPF scheduler control */
	TASK_ENQUEUED,		/* in BPF custody (user DSQ or internal queue) */
	TASK_DISPATCHED,	/* handed off to a terminal DSQ */
};

struct task_ctx {
	enum task_state state;	/* Current state in the workflow */
	u64 enqueue_seq;	/* Sequence number for debugging */
};

/* Per-task storage for struct task_ctx; created in ops.init_task(). */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");
/*
 * Look up the per-task context for @p without creating it.
 * Returns NULL when no storage exists (e.g. lookup failed).
 */
static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
{
	struct task_ctx *ctx;

	ctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	return ctx;
}
/*
 * ops.select_cpu(): drives scenarios 0-2, which dispatch directly from the
 * CPU selection path. Always keeps the task on @prev_cpu.
 */
s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *ctx = try_lookup_task_ctx(p);

	if (!ctx)
		return prev_cpu;

	if (test_scenario == 0) {
		/*
		 * Direct dispatch to the local DSQ: terminal DSQ, the task
		 * bypasses the BPF scheduler entirely, so no enqueue
		 * tracking and no ops.dequeue() callbacks are expected.
		 */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
		ctx->state = TASK_DISPATCHED;
	} else if (test_scenario == 1) {
		/*
		 * Direct dispatch to the global DSQ: terminal DSQ, same
		 * expectations as scenario 0.
		 */
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
		ctx->state = TASK_DISPATCHED;
	} else if (test_scenario == 2) {
		/*
		 * Dispatch to the shared user DSQ: the task enters BPF
		 * scheduler management, so track the enqueue/dequeue
		 * lifecycle and validate state transitions.
		 */
		if (ctx->state == TASK_ENQUEUED)
			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
				      p->pid, p->comm, ctx->enqueue_seq);
		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, 0);
		__sync_fetch_and_add(&enqueue_cnt, 1);
		ctx->state = TASK_ENQUEUED;
		ctx->enqueue_seq++;
	}

	return prev_cpu;
}
/*
 * ops.enqueue(): drives scenarios 3-6. Scenarios 3/4 dispatch to terminal
 * DSQs (no dequeue expected), scenario 5 goes to the shared user DSQ,
 * scenario 6 stashes PIDs in the BPF internal queue for ops.dispatch().
 * Any other scenario falls back to the global DSQ.
 */
void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct task_ctx *ctx;
	s32 pid = p->pid;

	ctx = try_lookup_task_ctx(p);
	if (!ctx)
		return;

	if (test_scenario == 3) {
		/* Terminal local DSQ: bypasses the BPF scheduler entirely. */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
		ctx->state = TASK_DISPATCHED;
	} else if (test_scenario == 4) {
		/* Terminal global DSQ: bypasses the BPF scheduler entirely. */
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
		ctx->state = TASK_DISPATCHED;
	} else if (test_scenario == 5) {
		/*
		 * Shared user DSQ: the task enters BPF custody, so track
		 * the enqueue/dequeue lifecycle and validate transitions.
		 */
		if (ctx->state == TASK_ENQUEUED)
			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
				      p->pid, p->comm, ctx->enqueue_seq);
		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
		__sync_fetch_and_add(&enqueue_cnt, 1);
		ctx->state = TASK_ENQUEUED;
		ctx->enqueue_seq++;
	} else if (test_scenario == 6) {
		/*
		 * BPF internal queue: stash the PID, to be consumed from
		 * ops.dispatch(). If the queue is full, fall back to the
		 * global DSQ so the task still runs.
		 */
		if (ctx->state == TASK_ENQUEUED)
			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
				      p->pid, p->comm, ctx->enqueue_seq);
		if (bpf_map_push_elem(&global_queue, &pid, 0)) {
			scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
			__sync_fetch_and_add(&bpf_queue_full, 1);
			ctx->state = TASK_DISPATCHED;
		} else {
			__sync_fetch_and_add(&enqueue_cnt, 1);
			ctx->state = TASK_ENQUEUED;
			ctx->enqueue_seq++;
		}
	} else {
		/* For all other scenarios, dispatch to the global DSQ. */
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
		ctx->state = TASK_DISPATCHED;
	}

	/* Poke the task's CPU if it is idle so the work gets picked up. */
	scx_bpf_kick_cpu(scx_bpf_task_cpu(p), SCX_KICK_IDLE);
}
/*
 * ops.dequeue(): validates that dequeue callbacks only happen for tasks
 * that were actually in BPF custody, and that the per-task state machine
 * transitions are legal for both dispatch and property-change dequeues.
 */
void BPF_STRUCT_OPS(dequeue_dequeue, struct task_struct *p, u64 deq_flags)
{
	struct task_ctx *ctx;

	__sync_fetch_and_add(&dequeue_cnt, 1);

	ctx = try_lookup_task_ctx(p);
	if (!ctx)
		return;

	/*
	 * Scenarios 0/3 (local DSQ) and 1/4 (global DSQ) use terminal DSQs:
	 * tasks bypass the BPF scheduler entirely and ops.dequeue() must
	 * never fire. Reaching here for those scenarios is a kernel bug.
	 */
	if (test_scenario == 0 || test_scenario == 3) {
		scx_bpf_error("%d (%s): dequeue called for local DSQ scenario",
			      p->pid, p->comm);
		return;
	}
	if (test_scenario == 1 || test_scenario == 4) {
		scx_bpf_error("%d (%s): dequeue called for global DSQ scenario",
			      p->pid, p->comm);
		return;
	}

	if (!(deq_flags & SCX_DEQ_SCHED_CHANGE)) {
		/*
		 * Regular dispatch dequeue: the kernel is moving the task
		 * from BPF custody to a terminal DSQ. Normally we come from
		 * ENQUEUED; NONE is also legal if the task was already
		 * property-change dequeued while sitting on a DSQ
		 * (dispatched but not yet consumed) — leave it as NONE.
		 */
		__sync_fetch_and_add(&dispatch_dequeue_cnt, 1);
		if (ctx->state != TASK_ENQUEUED && ctx->state != TASK_NONE)
			scx_bpf_error("%d (%s): dispatch dequeue from state %d seq=%llu",
				      p->pid, p->comm, ctx->state, ctx->enqueue_seq);
		if (ctx->state == TASK_ENQUEUED)
			ctx->state = TASK_DISPATCHED;
		return;
	}

	/*
	 * Property change interrupting the workflow: valid from both
	 * ENQUEUED and DISPATCHED. Transition back to NONE so the task is
	 * considered outside scheduler control. Scenario 6 relies on this:
	 * ops.dispatch() checks the state after popping a PID and skips
	 * tasks in NONE (prevents "target CPU not allowed").
	 */
	__sync_fetch_and_add(&change_dequeue_cnt, 1);
	if (ctx->state != TASK_ENQUEUED && ctx->state != TASK_DISPATCHED)
		scx_bpf_error("%d (%s): invalid property change dequeue state=%d seq=%llu",
			      p->pid, p->comm, ctx->state, ctx->enqueue_seq);
	ctx->state = TASK_NONE;
}
/*
 * ops.dispatch(): for scenario 6, consume one stashed PID from the BPF
 * internal queue and dispatch it; for every other scenario, pull a task
 * from the shared user DSQ into this CPU's local DSQ.
 */
void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_ctx *ctx;
	struct task_struct *p;
	s32 pid;

	if (test_scenario != 6) {
		scx_bpf_dsq_move_to_local(SHARED_DSQ, 0);
		return;
	}

	if (bpf_map_pop_elem(&global_queue, &pid))
		return;

	p = bpf_task_from_pid(pid);
	if (!p)
		return;

	/*
	 * A task in TASK_NONE was property-change dequeued after being
	 * stashed (ops.dequeue() reset its state); it must not be
	 * dispatched here. Otherwise, prefer this CPU's local DSQ when
	 * allowed by the task's affinity and fall back to the global DSQ.
	 */
	ctx = try_lookup_task_ctx(p);
	if (ctx && ctx->state != TASK_NONE) {
		if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0);
		else
			scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
	}
	bpf_task_release(p);
}
/*
 * ops.init_task(): eagerly allocate the per-task context so later
 * try_lookup_task_ctx() calls in the hot paths cannot fail.
 */
s32 BPF_STRUCT_OPS(dequeue_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (!bpf_task_storage_get(&task_ctx_stor, p, 0,
				  BPF_LOCAL_STORAGE_GET_F_CREATE))
		return -ENOMEM;
	return 0;
}
/*
 * ops.init(): create the shared user DSQ consumed by scenarios 2 and 5.
 * Propagates any error from scx_bpf_create_dsq() directly.
 */
s32 BPF_STRUCT_OPS_SLEEPABLE(dequeue_init)
{
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}
/* ops.exit(): record exit info so the userspace harness can inspect it. */
void BPF_STRUCT_OPS(dequeue_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}
/*
 * Scheduler definition. SCX_OPS_ENQ_LAST makes ops.enqueue() also see the
 * last runnable task on a CPU — NOTE(review): presumably needed so every
 * task goes through the tracked enqueue path; confirm against the test.
 */
SEC(".struct_ops.link")
struct sched_ext_ops dequeue_ops = {
	.select_cpu	= (void *)dequeue_select_cpu,
	.enqueue	= (void *)dequeue_enqueue,
	.dequeue	= (void *)dequeue_dequeue,
	.dispatch	= (void *)dequeue_dispatch,
	.init_task	= (void *)dequeue_init_task,
	.init		= (void *)dequeue_init,
	.exit		= (void *)dequeue_exit,
	.flags		= SCX_OPS_ENQ_LAST,
	.name		= "dequeue_test",
};