Merge tag 'for-7.1/io_uring-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring updates from Jens Axboe:

 - Add a callback driven main loop for io_uring, and BPF struct_ops
   on top to allow implementing custom event loop logic

 - Decouple IOPOLL from being a ring-wide all-or-nothing setting,
   allowing IOPOLL use cases to also issue certain whitelisted
   non-polled opcodes

 - Timeout improvements. Migrate internal timeout storage from
   timespec64 to ktime_t for simpler arithmetic and avoid copying of
   timespec data

 - Zero-copy receive (zcrx) updates:

      - Add a device-less mode (ZCRX_REG_NODEV) for testing and
        experimentation where data flows through the copy fallback path

      - Fix two-step unregistration regression, DMA length calculations,
        xarray mark usage, and a potential 32-bit overflow in id
        shifting

      - Refactoring toward multi-area support: dedicated refill queue
        struct, consolidated DMA syncing, netmem array refilling format,
        and guard-based locking

 - Zero-copy transmit (zctx) cleanup:

      - Unify io_send_zc() and io_sendmsg_zc() into a single function

      - Add vectorized registered buffer send for IORING_OP_SEND_ZC

      - Add separate notification user_data via sqe->addr3 so
        notification and completion CQEs can be distinguished without
        extra reference counting

 - Switch struct io_ring_ctx internal bitfields to explicit flag bits
   with atomic-safe accessors, and annotate the known harmless races on
   those flags

 - Various optimizations caching ctx and other request fields in local
   variables to avoid repeated loads, and cleanups for tctx setup, ring
   fd registration, and read path early returns

* tag 'for-7.1/io_uring-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (58 commits)
  io_uring: unify getting ctx from passed in file descriptor
  io_uring/register: don't get a reference to the registered ring fd
  io_uring/tctx: clean up __io_uring_add_tctx_node() error handling
  io_uring/tctx: have io_uring_alloc_task_context() return tctx
  io_uring/timeout: use 'ctx' consistently
  io_uring/rw: clean up __io_read() obsolete comment and early returns
  io_uring/zcrx: use correct mmap off constants
  io_uring/zcrx: use dma_len for chunk size calculation
  io_uring/zcrx: don't clear not allocated niovs
  io_uring/zcrx: don't use mark0 for allocating xarray
  io_uring: cast id to u64 before shifting in io_allocate_rbuf_ring()
  io_uring/zcrx: reject REG_NODEV with large rx_buf_size
  io_uring/cancel: validate opcode for IORING_ASYNC_CANCEL_OP
  io_uring/rsrc: use io_cache_free() to free node
  io_uring/zcrx: rename zcrx [un]register functions
  io_uring/zcrx: check ctrl op payload struct sizes
  io_uring/zcrx: cache fallback availability in zcrx ctx
  io_uring/zcrx: warn on a repeated area append
  io_uring/zcrx: consolidate dma syncing
  io_uring/zcrx: netmem array as refilling format
  ...
This commit is contained in:
Linus Torvalds
2026-04-13 16:22:30 -07:00
37 changed files with 1197 additions and 630 deletions

View File

@@ -786,10 +786,6 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
struct nvme_ctrl *ctrl = ioucmd->file->private_data;
int ret;
/* IOPOLL not supported yet */
if (issue_flags & IO_URING_F_IOPOLL)
return -EOPNOTSUPP;
ret = nvme_uring_cmd_checks(issue_flags);
if (ret)
return ret;

View File

@@ -8,6 +8,9 @@
#include <linux/llist.h>
#include <uapi/linux/io_uring.h>
struct iou_loop_params;
struct io_uring_bpf_ops;
enum {
/*
* A hint to not wake right away but delay until there are enough of
@@ -41,6 +44,8 @@ enum io_uring_cmd_flags {
IO_URING_F_COMPAT = (1 << 12),
};
struct iou_loop_params;
struct io_wq_work_node {
struct io_wq_work_node *next;
};
@@ -268,24 +273,30 @@ struct io_alloc_cache {
unsigned int init_clear;
};
enum {
IO_RING_F_DRAIN_NEXT = BIT(0),
IO_RING_F_OP_RESTRICTED = BIT(1),
IO_RING_F_REG_RESTRICTED = BIT(2),
IO_RING_F_OFF_TIMEOUT_USED = BIT(3),
IO_RING_F_DRAIN_ACTIVE = BIT(4),
IO_RING_F_HAS_EVFD = BIT(5),
/* all CQEs should be posted only by the submitter task */
IO_RING_F_TASK_COMPLETE = BIT(6),
IO_RING_F_LOCKLESS_CQ = BIT(7),
IO_RING_F_SYSCALL_IOPOLL = BIT(8),
IO_RING_F_POLL_ACTIVATED = BIT(9),
IO_RING_F_DRAIN_DISABLED = BIT(10),
IO_RING_F_COMPAT = BIT(11),
IO_RING_F_IOWQ_LIMITS_SET = BIT(12),
};
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
/* ring setup flags */
unsigned int flags;
unsigned int drain_next: 1;
unsigned int op_restricted: 1;
unsigned int reg_restricted: 1;
unsigned int off_timeout_used: 1;
unsigned int drain_active: 1;
unsigned int has_evfd: 1;
/* all CQEs should be posted only by the submitter task */
unsigned int task_complete: 1;
unsigned int lockless_cq: 1;
unsigned int syscall_iopoll: 1;
unsigned int poll_activated: 1;
unsigned int drain_disabled: 1;
unsigned int compat: 1;
unsigned int iowq_limits_set : 1;
/* internal state IO_RING_F_* flags, mostly read-only */
unsigned int int_flags;
struct task_struct *submitter_task;
struct io_rings *rings;
@@ -355,6 +366,9 @@ struct io_ring_ctx {
struct io_alloc_cache rw_cache;
struct io_alloc_cache cmd_cache;
int (*loop_step)(struct io_ring_ctx *ctx,
struct iou_loop_params *);
/*
* Any cancelable uring_cmd is added to this list in
* ->uring_cmd() by io_uring_cmd_insert_cancelable()
@@ -477,6 +491,8 @@ struct io_ring_ctx {
DECLARE_HASHTABLE(napi_ht, 4);
#endif
struct io_uring_bpf_ops *bpf_ops;
/*
* Protection for resize vs mmap races - both the mmap and resize
* side will need to grab this lock, to prevent either side from
@@ -545,6 +561,7 @@ enum {
REQ_F_HAS_METADATA_BIT,
REQ_F_IMPORT_BUFFER_BIT,
REQ_F_SQE_COPIED_BIT,
REQ_F_IOPOLL_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -638,6 +655,8 @@ enum {
REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT),
/* ->sqe_copy() has been called, if necessary */
REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT),
/* request must be iopolled to completion (set in ->issue()) */
REQ_F_IOPOLL = IO_REQ_FLAG(REQ_F_IOPOLL_BIT),
};
struct io_tw_req {

View File

@@ -10,6 +10,8 @@
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/io_uring/zcrx.h>
/*
* this file is shared with liburing and that has to autodetect
* if linux/time_types.h is available or not, it can
@@ -341,6 +343,10 @@ enum io_uring_op {
/*
* sqe->timeout_flags
*
* IORING_TIMEOUT_IMMEDIATE_ARG: If set, sqe->addr stores the timeout
* value in nanoseconds instead of
* pointing to a timespec.
*/
#define IORING_TIMEOUT_ABS (1U << 0)
#define IORING_TIMEOUT_UPDATE (1U << 1)
@@ -349,6 +355,7 @@ enum io_uring_op {
#define IORING_LINK_TIMEOUT_UPDATE (1U << 4)
#define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5)
#define IORING_TIMEOUT_MULTISHOT (1U << 6)
#define IORING_TIMEOUT_IMMEDIATE_ARG (1U << 7)
#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
/*
@@ -1050,100 +1057,6 @@ struct io_timespec {
__u64 tv_nsec;
};
/* Zero copy receive refill queue entry */
struct io_uring_zcrx_rqe {
__u64 off;
__u32 len;
__u32 __pad;
};
struct io_uring_zcrx_cqe {
__u64 off;
__u64 __pad;
};
/* The bit from which area id is encoded into offsets */
#define IORING_ZCRX_AREA_SHIFT 48
#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1))
struct io_uring_zcrx_offsets {
__u32 head;
__u32 tail;
__u32 rqes;
__u32 __resv2;
__u64 __resv[2];
};
enum io_uring_zcrx_area_flags {
IORING_ZCRX_AREA_DMABUF = 1,
};
struct io_uring_zcrx_area_reg {
__u64 addr;
__u64 len;
__u64 rq_area_token;
__u32 flags;
__u32 dmabuf_fd;
__u64 __resv2[2];
};
enum zcrx_reg_flags {
ZCRX_REG_IMPORT = 1,
};
enum zcrx_features {
/*
* The user can ask for the desired rx page size by passing the
* value in struct io_uring_zcrx_ifq_reg::rx_buf_len.
*/
ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0,
};
/*
* Argument for IORING_REGISTER_ZCRX_IFQ
*/
struct io_uring_zcrx_ifq_reg {
__u32 if_idx;
__u32 if_rxq;
__u32 rq_entries;
__u32 flags;
__u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */
__u64 region_ptr; /* struct io_uring_region_desc * */
struct io_uring_zcrx_offsets offsets;
__u32 zcrx_id;
__u32 rx_buf_len;
__u64 __resv[3];
};
enum zcrx_ctrl_op {
ZCRX_CTRL_FLUSH_RQ,
ZCRX_CTRL_EXPORT,
__ZCRX_CTRL_LAST,
};
struct zcrx_ctrl_flush_rq {
__u64 __resv[6];
};
struct zcrx_ctrl_export {
__u32 zcrx_fd;
__u32 __resv1[11];
};
struct zcrx_ctrl {
__u32 zcrx_id;
__u32 op; /* see enum zcrx_ctrl_op */
__u64 __resv[2];
union {
struct zcrx_ctrl_export zc_export;
struct zcrx_ctrl_flush_rq zc_flush;
};
};
#ifdef __cplusplus
}
#endif

View File

@@ -0,0 +1,115 @@
/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
/*
* Header file for the io_uring zerocopy receive (zcrx) interface.
*
* Copyright (C) 2026 Pavel Begunkov
* Copyright (C) 2026 David Wei
* Copyright (C) Meta Platforms, Inc.
*/
#ifndef LINUX_IO_ZCRX_H
#define LINUX_IO_ZCRX_H
#include <linux/types.h>
/* Zero copy receive refill queue entry */
struct io_uring_zcrx_rqe {
__u64 off;
__u32 len;
__u32 __pad;
};
struct io_uring_zcrx_cqe {
__u64 off;
__u64 __pad;
};
/* The bit from which area id is encoded into offsets */
#define IORING_ZCRX_AREA_SHIFT 48
#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1))
struct io_uring_zcrx_offsets {
__u32 head;
__u32 tail;
__u32 rqes;
__u32 __resv2;
__u64 __resv[2];
};
enum io_uring_zcrx_area_flags {
IORING_ZCRX_AREA_DMABUF = 1,
};
struct io_uring_zcrx_area_reg {
__u64 addr;
__u64 len;
__u64 rq_area_token;
__u32 flags;
__u32 dmabuf_fd;
__u64 __resv2[2];
};
enum zcrx_reg_flags {
ZCRX_REG_IMPORT = 1,
/*
* Register a zcrx instance without a net device. All data will be
* copied. The refill queue entries might not be automatically
* consumed and need to be flushed, see ZCRX_CTRL_FLUSH_RQ.
*/
ZCRX_REG_NODEV = 2,
};
enum zcrx_features {
/*
* The user can ask for the desired rx page size by passing the
* value in struct io_uring_zcrx_ifq_reg::rx_buf_len.
*/
ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0,
};
/*
* Argument for IORING_REGISTER_ZCRX_IFQ
*/
struct io_uring_zcrx_ifq_reg {
__u32 if_idx;
__u32 if_rxq;
__u32 rq_entries;
__u32 flags;
__u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */
__u64 region_ptr; /* struct io_uring_region_desc * */
struct io_uring_zcrx_offsets offsets;
__u32 zcrx_id;
__u32 rx_buf_len;
__u64 __resv[3];
};
enum zcrx_ctrl_op {
ZCRX_CTRL_FLUSH_RQ,
ZCRX_CTRL_EXPORT,
__ZCRX_CTRL_LAST,
};
struct zcrx_ctrl_flush_rq {
__u64 __resv[6];
};
struct zcrx_ctrl_export {
__u32 zcrx_fd;
__u32 __resv1[11];
};
struct zcrx_ctrl {
__u32 zcrx_id;
__u32 op; /* see enum zcrx_ctrl_op */
__u64 __resv[2];
union {
struct zcrx_ctrl_export zc_export;
struct zcrx_ctrl_flush_rq zc_flush;
};
};
#endif /* LINUX_IO_ZCRX_H */

View File

@@ -14,3 +14,8 @@ config IO_URING_BPF
def_bool y
depends on BPF
depends on NET
config IO_URING_BPF_OPS
def_bool y
depends on IO_URING
depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF

View File

@@ -14,7 +14,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
advise.o openclose.o statx.o timeout.o \
cancel.o waitid.o register.o \
truncate.o memmap.o alloc_cache.o \
query.o
query.o loop.o
obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
obj-$(CONFIG_IO_WQ) += io-wq.o
@@ -25,3 +25,4 @@ obj-$(CONFIG_NET) += net.o cmd_net.o
obj-$(CONFIG_PROC_FS) += fdinfo.o
obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o
obj-$(CONFIG_IO_URING_BPF) += bpf_filter.o
obj-$(CONFIG_IO_URING_BPF_OPS) += bpf-ops.o

270
io_uring/bpf-ops.c Normal file
View File

@@ -0,0 +1,270 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/mutex.h>
#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include "io_uring.h"
#include "register.h"
#include "loop.h"
#include "memmap.h"
#include "bpf-ops.h"
static DEFINE_MUTEX(io_bpf_ctrl_mutex);
static const struct btf_type *loop_params_type;
__bpf_kfunc_start_defs();
/*
 * BPF kfunc: submit up to @nr SQEs from the ring's submission queue.
 * Thin wrapper around io_submit_sqes(); marked KF_SLEEPABLE in the kfunc
 * set below, so it may only be called from sleepable BPF context.
 */
__bpf_kfunc int bpf_io_uring_submit_sqes(struct io_ring_ctx *ctx, u32 nr)
{
return io_submit_sqes(ctx, nr);
}
/*
 * BPF kfunc: return a kernel pointer to one of the ring's mapped regions
 * (parameter memory, CQ ring, or SQ ring), selected by @region_id (see the
 * IOU_REGION_* enum in bpf-ops.h).
 *
 * @rdwr_buf_size is the size the BPF program intends to access through the
 * returned pointer; requests larger than the region are rejected so BPF
 * cannot read or write out of bounds. Returns NULL for an unknown region id
 * or an oversized access (the kfunc is flagged KF_RET_NULL accordingly).
 * Caller must hold ctx->uring_lock.
 */
__bpf_kfunc
__u8 *bpf_io_uring_get_region(struct io_ring_ctx *ctx, __u32 region_id,
const size_t rdwr_buf_size)
{
struct io_mapped_region *r;
lockdep_assert_held(&ctx->uring_lock);
switch (region_id) {
case IOU_REGION_MEM:
r = &ctx->param_region;
break;
case IOU_REGION_CQ:
r = &ctx->ring_region;
break;
case IOU_REGION_SQ:
r = &ctx->sq_region;
break;
default:
return NULL;
}
if (unlikely(rdwr_buf_size > io_region_size(r)))
return NULL;
return io_region_get_ptr(r);
}
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(io_uring_kfunc_set)
BTF_ID_FLAGS(func, bpf_io_uring_submit_sqes, KF_SLEEPABLE);
BTF_ID_FLAGS(func, bpf_io_uring_get_region, KF_RET_NULL);
BTF_KFUNCS_END(io_uring_kfunc_set)
static const struct btf_kfunc_id_set bpf_io_uring_kfunc_set = {
.owner = THIS_MODULE,
.set = &io_uring_kfunc_set,
};
/*
 * Stub implementation of ->loop_step, installed via .cfi_stubs below.
 * Does no work and immediately asks the event loop to stop.
 */
static int io_bpf_ops__loop_step(struct io_ring_ctx *ctx,
struct iou_loop_params *lp)
{
return IOU_LOOP_STOP;
}
static struct io_uring_bpf_ops io_bpf_ops_stubs = {
.loop_step = io_bpf_ops__loop_step,
};
/*
 * Verifier callback: validate a BPF program's access to its context.
 * Only aligned, in-bounds reads are permitted (the context is the standard
 * u64 args array, hence the sizeof(__u64) * MAX_BPF_FUNC_ARGS bound);
 * everything else is delegated to the generic btf_ctx_access() check.
 */
static bool bpf_io_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
if (type != BPF_READ)
return false;
if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
return false;
if (off % size != 0)
return false;
return btf_ctx_access(off, size, type, prog, info);
}
/*
 * Verifier callback: gate BPF access to kernel structs by BTF type.
 * Only struct iou_loop_params may be touched, and only the leading fields
 * up to and including cq_wait_idx; such loads are treated as plain scalars.
 * Any other struct or offset is denied with -EACCES.
 */
static int bpf_io_btf_struct_access(struct bpf_verifier_log *log,
const struct bpf_reg_state *reg, int off,
int size)
{
const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
if (t == loop_params_type) {
if (off + size <= offsetofend(struct iou_loop_params, cq_wait_idx))
return SCALAR_VALUE;
}
return -EACCES;
}
static const struct bpf_verifier_ops bpf_io_verifier_ops = {
.get_func_proto = bpf_base_func_proto,
.is_valid_access = bpf_io_is_valid_access,
.btf_struct_access = bpf_io_btf_struct_access,
};
/*
 * Resolve a struct type by name in the given BTF.
 * Returns the btf_type on success, or NULL if no struct by that name exists.
 */
static const struct btf_type *
io_lookup_struct_type(struct btf *btf, const char *name)
{
s32 type_id;
type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT);
if (type_id < 0)
return NULL;
return btf_type_by_id(btf, type_id);
}
/*
 * struct_ops ->init callback, invoked once at struct_ops registration time.
 * Caches the BTF type of iou_loop_params (used by the struct-access check
 * above) and registers the io_uring kfunc set for struct_ops programs.
 */
static int bpf_io_init(struct btf *btf)
{
int ret;
loop_params_type = io_lookup_struct_type(btf, "iou_loop_params");
if (!loop_params_type) {
pr_err("io_uring: Failed to locate iou_loop_params\n");
return -EINVAL;
}
ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
&bpf_io_uring_kfunc_set);
if (ret) {
pr_err("io_uring: Failed to register kfuncs (%d)\n", ret);
return ret;
}
return 0;
}
/*
 * struct_ops ->check_member callback: no per-member restrictions,
 * every member of io_uring_bpf_ops may be implemented by BPF.
 */
static int bpf_io_check_member(const struct btf_type *t,
const struct btf_member *member,
const struct bpf_prog *prog)
{
return 0;
}
/*
 * struct_ops ->init_member callback: copy non-program members from the
 * user-supplied image (@udata) into the kernel copy (@kdata). Only ring_fd
 * is handled here; returning 1 tells the struct_ops core the member was
 * consumed, 0 leaves it to default handling.
 */
static int bpf_io_init_member(const struct btf_type *t,
const struct btf_member *member,
void *kdata, const void *udata)
{
u32 moff = __btf_member_bit_offset(t, member) / 8;
const struct io_uring_bpf_ops *uops = udata;
struct io_uring_bpf_ops *ops = kdata;
switch (moff) {
case offsetof(struct io_uring_bpf_ops, ring_fd):
ops->ring_fd = uops->ring_fd;
return 1;
}
return 0;
}
/*
 * Attach a struct_ops instance to a ring. The ring must not use SQPOLL or
 * ring-wide IOPOLL, must be set up with DEFER_TASKRUN, and may have at most
 * one bpf_ops attached at a time. On success the ops takes over the ring's
 * event loop by installing its ->loop_step callback.
 * Caller holds ctx->uring_lock (and io_bpf_ctrl_mutex, see bpf_io_reg()).
 */
static int io_install_bpf(struct io_ring_ctx *ctx, struct io_uring_bpf_ops *ops)
{
if (ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_IOPOLL))
return -EOPNOTSUPP;
if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
return -EOPNOTSUPP;
if (ctx->bpf_ops)
return -EBUSY;
if (WARN_ON_ONCE(!ops->loop_step))
return -EINVAL;
/* link ops and ctx both ways so unreg can find the ring again */
ops->priv = ctx;
ctx->bpf_ops = ops;
ctx->loop_step = ops->loop_step;
return 0;
}
/*
 * struct_ops ->reg callback: resolve the target ring from ops->ring_fd
 * (normal fd table, not a registered ring index) and install the ops.
 * Locking order is io_bpf_ctrl_mutex then ctx->uring_lock, matching
 * bpf_io_unreg() and io_unregister_bpf_ops().
 */
static int bpf_io_reg(void *kdata, struct bpf_link *link)
{
struct io_uring_bpf_ops *ops = kdata;
struct io_ring_ctx *ctx;
struct file *file;
int ret = -EBUSY;
file = io_uring_ctx_get_file(ops->ring_fd, false);
if (IS_ERR(file))
return PTR_ERR(file);
ctx = file->private_data;
scoped_guard(mutex, &io_bpf_ctrl_mutex) {
guard(mutex)(&ctx->uring_lock);
ret = io_install_bpf(ctx, ops);
}
/* io_uring_ctx_get_file() took a reference for non-registered fds */
fput(file);
return ret;
}
/*
 * Detach the currently-installed bpf_ops from @ctx, undoing io_install_bpf().
 * The WARNs catch a missing or cross-linked ops, which would indicate a
 * bookkeeping bug. Caller holds the locks required to write ->bpf_ops
 * (io_bpf_ctrl_mutex and ctx->uring_lock, see io_unregister_bpf_ops()).
 */
static void io_eject_bpf(struct io_ring_ctx *ctx)
{
struct io_uring_bpf_ops *ops = ctx->bpf_ops;
if (WARN_ON_ONCE(!ops))
return;
if (WARN_ON_ONCE(ops->priv != ctx))
return;
ops->priv = NULL;
ctx->bpf_ops = NULL;
ctx->loop_step = NULL;
}
/*
 * struct_ops ->unreg callback: detach @kdata from whichever ring it is
 * installed on, if any. ops->priv is read under io_bpf_ctrl_mutex, which
 * serializes against both bpf_io_reg() and io_unregister_bpf_ops(); a NULL
 * priv means the ring side already ejected us.
 */
static void bpf_io_unreg(void *kdata, struct bpf_link *link)
{
struct io_uring_bpf_ops *ops = kdata;
struct io_ring_ctx *ctx;
guard(mutex)(&io_bpf_ctrl_mutex);
ctx = ops->priv;
if (ctx) {
guard(mutex)(&ctx->uring_lock);
if (WARN_ON_ONCE(ctx->bpf_ops != ops))
return;
io_eject_bpf(ctx);
}
}
/*
 * Ring-teardown path: detach any installed bpf_ops from @ctx.
 * Called from io_ring_ctx_free() before the rest of the ring is torn down.
 */
void io_unregister_bpf_ops(struct io_ring_ctx *ctx)
{
/*
 * ->bpf_ops is write protected by io_bpf_ctrl_mutex and uring_lock,
 * and read protected by either. Try to avoid taking the global lock
 * for rings that never had any bpf installed.
 */
scoped_guard(mutex, &ctx->uring_lock) {
if (!ctx->bpf_ops)
return;
}
/* recheck under both locks: unreg may have raced the peek above */
guard(mutex)(&io_bpf_ctrl_mutex);
guard(mutex)(&ctx->uring_lock);
if (ctx->bpf_ops)
io_eject_bpf(ctx);
}
static struct bpf_struct_ops bpf_ring_ops = {
.verifier_ops = &bpf_io_verifier_ops,
.reg = bpf_io_reg,
.unreg = bpf_io_unreg,
.check_member = bpf_io_check_member,
.init_member = bpf_io_init_member,
.init = bpf_io_init,
.cfi_stubs = &io_bpf_ops_stubs,
.name = "io_uring_bpf_ops",
.owner = THIS_MODULE,
};
/*
 * Boot-time registration of the io_uring_bpf_ops struct_ops type with the
 * BPF subsystem (wired up via __initcall below).
 */
static int __init io_uring_bpf_init(void)
{
int ret;
ret = register_bpf_struct_ops(&bpf_ring_ops, io_uring_bpf_ops);
if (ret) {
pr_err("io_uring: Failed to register struct_ops (%d)\n", ret);
return ret;
}
return 0;
}
__initcall(io_uring_bpf_init);

28
io_uring/bpf-ops.h Normal file
View File

@@ -0,0 +1,28 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef IOU_BPF_OPS_H
#define IOU_BPF_OPS_H
#include <linux/io_uring_types.h>
enum {
IOU_REGION_MEM,
IOU_REGION_CQ,
IOU_REGION_SQ,
};
/*
 * BPF struct_ops interface for driving a ring's event loop.
 * @loop_step: BPF-implemented loop callback installed as ctx->loop_step
 * @ring_fd:   fd of the target ring, supplied by userspace at attach time
 * @priv:      kernel-only back-pointer to the attached io_ring_ctx
 */
struct io_uring_bpf_ops {
int (*loop_step)(struct io_ring_ctx *ctx, struct iou_loop_params *lp);
__u32 ring_fd;
void *priv;
};
#ifdef CONFIG_IO_URING_BPF_OPS
void io_unregister_bpf_ops(struct io_ring_ctx *ctx);
#else
static inline void io_unregister_bpf_ops(struct io_ring_ctx *ctx)
{
}
#endif
#endif /* IOU_BPF_OPS_H */

View File

@@ -156,9 +156,16 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
cancel->fd = READ_ONCE(sqe->fd);
}
if (cancel->flags & IORING_ASYNC_CANCEL_OP) {
u32 op;
if (cancel->flags & IORING_ASYNC_CANCEL_ANY)
return -EINVAL;
cancel->opcode = READ_ONCE(sqe->len);
op = READ_ONCE(sqe->len);
if (op >= IORING_OP_LAST)
return -EINVAL;
cancel->opcode = op;
}
return 0;

View File

@@ -7,6 +7,21 @@
#include "uring_cmd.h"
#include "io_uring.h"
/*
 * Run a protocol-level ioctl (@op, e.g. SIOCINQ/SIOCOUTQ) on @sock and
 * return the queried value. Common helper for the SOCKET_URING_OP_SIOCINQ
 * and SOCKET_URING_OP_SIOCOUTQ cases in io_uring_cmd_sock().
 *
 * Returns -EOPNOTSUPP if the protocol has no ->ioctl handler, a negative
 * error propagated from the handler, or the resulting value on success.
 */
static int io_uring_cmd_get_sock_ioctl(struct socket *sock, int op)
{
struct sock *sk = sock->sk;
struct proto *prot = READ_ONCE(sk->sk_prot);
int ret, arg = 0;
if (!prot || !prot->ioctl)
return -EOPNOTSUPP;
ret = prot->ioctl(sk, op, &arg);
if (ret)
return ret;
return arg;
}
static inline int io_uring_cmd_getsockopt(struct socket *sock,
struct io_uring_cmd *cmd,
unsigned int issue_flags)
@@ -156,27 +171,12 @@ static int io_uring_cmd_getsockname(struct socket *sock,
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct socket *sock = cmd->file->private_data;
struct sock *sk = sock->sk;
struct proto *prot = READ_ONCE(sk->sk_prot);
int ret, arg = 0;
switch (cmd->cmd_op) {
case SOCKET_URING_OP_SIOCINQ:
if (!prot || !prot->ioctl)
return -EOPNOTSUPP;
ret = prot->ioctl(sk, SIOCINQ, &arg);
if (ret)
return ret;
return arg;
return io_uring_cmd_get_sock_ioctl(sock, SIOCINQ);
case SOCKET_URING_OP_SIOCOUTQ:
if (!prot || !prot->ioctl)
return -EOPNOTSUPP;
ret = prot->ioctl(sk, SIOCOUTQ, &arg);
if (ret)
return ret;
return arg;
return io_uring_cmd_get_sock_ioctl(sock, SIOCOUTQ);
case SOCKET_URING_OP_GETSOCKOPT:
return io_uring_cmd_getsockopt(sock, cmd, issue_flags);
case SOCKET_URING_OP_SETSOCKOPT:

View File

@@ -148,7 +148,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
spin_unlock(&ctx->completion_lock);
ev_fd->eventfd_async = eventfd_async;
ctx->has_evfd = true;
ctx->int_flags |= IO_RING_F_HAS_EVFD;
refcount_set(&ev_fd->refs, 1);
atomic_set(&ev_fd->ops, 0);
rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
@@ -162,7 +162,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx)
ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
lockdep_is_held(&ctx->uring_lock));
if (ev_fd) {
ctx->has_evfd = false;
ctx->int_flags &= ~IO_RING_F_HAS_EVFD;
rcu_assign_pointer(ctx->io_ev_fd, NULL);
io_eventfd_put(ev_fd);
return 0;

View File

@@ -87,6 +87,7 @@
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
#include "bpf-ops.h"
#include "timeout.h"
#include "poll.h"
@@ -95,6 +96,7 @@
#include "eventfd.h"
#include "wait.h"
#include "bpf_filter.h"
#include "loop.h"
#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
IOSQE_IO_HARDLINK | IOSQE_ASYNC)
@@ -356,7 +358,6 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
static void io_prep_async_work(struct io_kiocb *req)
{
const struct io_issue_def *def = &io_issue_defs[req->opcode];
struct io_ring_ctx *ctx = req->ctx;
if (!(req->flags & REQ_F_CREDS)) {
req->flags |= REQ_F_CREDS;
@@ -378,7 +379,7 @@ static void io_prep_async_work(struct io_kiocb *req)
if (should_hash && (req->file->f_flags & O_DIRECT) &&
(req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE))
should_hash = false;
if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL))
if (should_hash || (req->flags & REQ_F_IOPOLL))
io_wq_hash_work(&req->work, file_inode(req->file));
} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
if (def->unbound_nonreg_file)
@@ -477,17 +478,17 @@ static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
if (ctx->poll_activated)
if (ctx->int_flags & IO_RING_F_POLL_ACTIVATED)
io_poll_wq_wake(ctx);
if (ctx->off_timeout_used)
if (ctx->int_flags & IO_RING_F_OFF_TIMEOUT_USED)
io_flush_timeouts(ctx);
if (ctx->has_evfd)
if (ctx->int_flags & IO_RING_F_HAS_EVFD)
io_eventfd_signal(ctx, true);
}
static inline void __io_cq_lock(struct io_ring_ctx *ctx)
{
if (!ctx->lockless_cq)
if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ))
spin_lock(&ctx->completion_lock);
}
@@ -500,11 +501,11 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx)
static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
{
io_commit_cqring(ctx);
if (!ctx->task_complete) {
if (!ctx->lockless_cq)
if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) {
if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ))
spin_unlock(&ctx->completion_lock);
/* IOPOLL rings only need to wake up if it's also SQPOLL */
if (!ctx->syscall_iopoll)
if (!(ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL))
io_cqring_wake(ctx);
}
io_commit_cqring_flush(ctx);
@@ -589,6 +590,11 @@ void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
mutex_unlock(&ctx->uring_lock);
}
/*
 * Flush overflowed CQEs back into the CQ ring. Per the _locked naming,
 * the caller is expected to already hold the relevant ring lock
 * (presumably uring_lock -- NOTE(review): confirm against callers).
 */
void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx)
{
__io_cqring_overflow_flush(ctx, false);
}
/* must to be called somewhat shortly after putting a request */
static inline void io_put_task(struct io_kiocb *req)
{
@@ -830,7 +836,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags
void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
lockdep_assert_held(&ctx->uring_lock);
lockdep_assert(ctx->lockless_cq);
lockdep_assert(ctx->int_flags & IO_RING_F_LOCKLESS_CQ);
if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) {
struct io_cqe cqe = io_init_cqe(user_data, res, cflags);
@@ -860,7 +866,7 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
lockdep_assert(!io_wq_current_is_worker());
lockdep_assert_held(&ctx->uring_lock);
if (!ctx->lockless_cq) {
if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) {
spin_lock(&ctx->completion_lock);
posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags);
spin_unlock(&ctx->completion_lock);
@@ -885,7 +891,7 @@ bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe cqe[2])
lockdep_assert_held(&ctx->uring_lock);
cqe[0].user_data = req->cqe.user_data;
if (!ctx->lockless_cq) {
if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) {
spin_lock(&ctx->completion_lock);
posted = io_fill_cqe_aux32(ctx, cqe);
spin_unlock(&ctx->completion_lock);
@@ -913,7 +919,7 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
* Handle special CQ sync cases via task_work. DEFER_TASKRUN requires
* the submitter task context, IOPOLL protects with uring_lock.
*/
if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) {
if ((ctx->int_flags & IO_RING_F_LOCKLESS_CQ) || (req->flags & REQ_F_REISSUE)) {
defer_complete:
req->io_task_work.func = io_req_task_complete;
io_req_task_work_add(req);
@@ -1067,12 +1073,14 @@ void io_queue_next(struct io_kiocb *req)
static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
if (req->file_node) {
io_put_rsrc_node(req->ctx, req->file_node);
io_put_rsrc_node(ctx, req->file_node);
req->file_node = NULL;
}
if (req->flags & REQ_F_BUF_NODE)
io_put_rsrc_node(req->ctx, req->buf_node);
io_put_rsrc_node(ctx, req->buf_node);
}
static void io_free_batch_list(struct io_ring_ctx *ctx,
@@ -1135,7 +1143,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
*/
if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) &&
unlikely(!io_fill_cqe_req(ctx, req))) {
if (ctx->lockless_cq)
if (ctx->int_flags & IO_RING_F_LOCKLESS_CQ)
io_cqe_overflow(ctx, &req->cqe, &req->big_cqe);
else
io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe);
@@ -1148,7 +1156,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
INIT_WQ_LIST(&state->compl_reqs);
}
if (unlikely(ctx->drain_active))
if (unlikely(ctx->int_flags & IO_RING_F_DRAIN_ACTIVE))
io_queue_deferred(ctx);
ctx->submit_state.cq_flush = false;
@@ -1187,7 +1195,6 @@ __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
{
unsigned int nr_events = 0;
unsigned long check_cq;
min_events = min(min_events, ctx->cq_entries);
@@ -1230,8 +1237,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
* very same mutex.
*/
if (list_empty(&ctx->iopoll_list) || io_task_work_pending(ctx)) {
u32 tail = ctx->cached_cq_tail;
(void) io_run_local_work_locked(ctx, min_events);
if (task_work_pending(current) || list_empty(&ctx->iopoll_list)) {
@@ -1240,7 +1245,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
mutex_lock(&ctx->uring_lock);
}
/* some requests don't go through iopoll_list */
if (tail != ctx->cached_cq_tail || list_empty(&ctx->iopoll_list))
if (list_empty(&ctx->iopoll_list))
break;
}
ret = io_do_iopoll(ctx, !min_events);
@@ -1251,9 +1256,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
return -EINTR;
if (need_resched())
break;
nr_events += ret;
} while (nr_events < min_events);
} while (io_cqring_events(ctx) < min_events);
return 0;
}
@@ -1344,7 +1347,7 @@ static __cold void io_drain_req(struct io_kiocb *req)
list_add_tail(&de->list, &ctx->defer_list);
io_queue_deferred(ctx);
if (!drain && list_empty(&ctx->defer_list))
ctx->drain_active = false;
ctx->int_flags &= ~IO_RING_F_DRAIN_ACTIVE;
}
static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
@@ -1418,8 +1421,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
if (ret == IOU_ISSUE_SKIP_COMPLETE) {
ret = 0;
/* If the op doesn't have a file, we're not polling for it */
if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
if (req->flags & REQ_F_IOPOLL)
io_iopoll_req_issued(req, issue_flags);
}
return ret;
@@ -1435,7 +1437,7 @@ int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw)
io_tw_lock(req->ctx, tw);
WARN_ON_ONCE(!req->file);
if (WARN_ON_ONCE(req->ctx->flags & IORING_SETUP_IOPOLL))
if (WARN_ON_ONCE(req->flags & REQ_F_IOPOLL))
return -EFAULT;
ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]);
@@ -1533,7 +1535,7 @@ fail:
* wait for request slots on the block side.
*/
if (!needs_poll) {
if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
if (!(req->flags & REQ_F_IOPOLL))
break;
if (io_wq_worker_stopped())
break;
@@ -1655,7 +1657,7 @@ static void io_queue_sqe_fallback(struct io_kiocb *req)
} else {
/* can't fail with IO_URING_F_INLINE */
io_req_sqe_copy(req, IO_URING_F_INLINE);
if (unlikely(req->ctx->drain_active))
if (unlikely(req->ctx->int_flags & IO_RING_F_DRAIN_ACTIVE))
io_drain_req(req);
else
io_queue_iowq(req);
@@ -1671,7 +1673,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
struct io_kiocb *req,
unsigned int sqe_flags)
{
if (!ctx->op_restricted)
if (!(ctx->int_flags & IO_RING_F_OP_RESTRICTED))
return true;
if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
return false;
@@ -1691,7 +1693,7 @@ static void io_init_drain(struct io_ring_ctx *ctx)
{
struct io_kiocb *head = ctx->submit_state.link.head;
ctx->drain_active = true;
ctx->int_flags |= IO_RING_F_DRAIN_ACTIVE;
if (head) {
/*
* If we need to drain a request in the middle of a link, drain
@@ -1701,7 +1703,7 @@ static void io_init_drain(struct io_ring_ctx *ctx)
* link.
*/
head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
ctx->drain_next = true;
ctx->int_flags |= IO_RING_F_DRAIN_NEXT;
}
}
@@ -1767,23 +1769,23 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->buf_index = READ_ONCE(sqe->buf_group);
}
if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
ctx->drain_disabled = true;
ctx->int_flags |= IO_RING_F_DRAIN_DISABLED;
if (sqe_flags & IOSQE_IO_DRAIN) {
if (ctx->drain_disabled)
if (ctx->int_flags & IO_RING_F_DRAIN_DISABLED)
return io_init_fail_req(req, -EOPNOTSUPP);
io_init_drain(ctx);
}
}
if (unlikely(ctx->op_restricted || ctx->drain_active || ctx->drain_next)) {
if (unlikely(ctx->int_flags & (IO_RING_F_OP_RESTRICTED | IO_RING_F_DRAIN_ACTIVE | IO_RING_F_DRAIN_NEXT))) {
if (!io_check_restriction(ctx, req, sqe_flags))
return io_init_fail_req(req, -EACCES);
/* knock it to the slow queue path, will be drained there */
if (ctx->drain_active)
if (ctx->int_flags & IO_RING_F_DRAIN_ACTIVE)
req->flags |= REQ_F_FORCE_ASYNC;
/* if there is no link, we're at "next" request and need to drain */
if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
ctx->drain_next = false;
ctx->drain_active = true;
if (unlikely(ctx->int_flags & IO_RING_F_DRAIN_NEXT) && !ctx->submit_state.link.head) {
ctx->int_flags &= ~IO_RING_F_DRAIN_NEXT;
ctx->int_flags |= IO_RING_F_DRAIN_ACTIVE;
req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
}
}
@@ -2148,12 +2150,13 @@ static __cold void io_req_caches_free(struct io_ring_ctx *ctx)
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_unregister_bpf_ops(ctx);
io_sq_thread_finish(ctx);
mutex_lock(&ctx->uring_lock);
io_sqe_buffers_unregister(ctx);
io_sqe_files_unregister(ctx);
io_unregister_zcrx_ifqs(ctx);
io_unregister_zcrx(ctx);
io_cqring_overflow_kill(ctx);
io_eventfd_unregister(ctx);
io_free_alloc_caches(ctx);
@@ -2204,7 +2207,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb)
poll_wq_task_work);
mutex_lock(&ctx->uring_lock);
ctx->poll_activated = true;
ctx->int_flags |= IO_RING_F_POLL_ACTIVATED;
mutex_unlock(&ctx->uring_lock);
/*
@@ -2219,9 +2222,9 @@ __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
{
spin_lock(&ctx->completion_lock);
/* already activated or in progress */
if (ctx->poll_activated || ctx->poll_wq_task_work.func)
if ((ctx->int_flags & IO_RING_F_POLL_ACTIVATED) || ctx->poll_wq_task_work.func)
goto out;
if (WARN_ON_ONCE(!ctx->task_complete))
if (WARN_ON_ONCE(!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)))
goto out;
if (!ctx->submitter_task)
goto out;
@@ -2242,7 +2245,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
struct io_ring_ctx *ctx = file->private_data;
__poll_t mask = 0;
if (unlikely(!ctx->poll_activated))
if (unlikely(!(data_race(ctx->int_flags) & IO_RING_F_POLL_ACTIVATED)))
io_activate_pollwq(ctx);
/*
* provides mb() which pairs with barrier from wq_has_sleeper
@@ -2308,6 +2311,10 @@ static __cold void io_ring_exit_work(struct work_struct *work)
struct io_tctx_node *node;
int ret;
mutex_lock(&ctx->uring_lock);
io_terminate_zcrx(ctx);
mutex_unlock(&ctx->uring_lock);
/*
* If we're doing polled IO and end up having requests being
* submitted async (out-of-line), then completions can come in while
@@ -2539,6 +2546,40 @@ uaccess_end:
#endif
}
/*
* Given an 'fd' value, return the ctx associated with it. If 'registered' is
* true, then the registered index is used. Otherwise, the normal fd table.
* Caller must call fput() on the returned file if it isn't a registered file,
* unless it's an ERR_PTR.
*/
struct file *io_uring_ctx_get_file(unsigned int fd, bool registered)
{
struct file *file;
if (registered) {
/*
 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
 * need only dereference our task private array to find it.
 */
struct io_uring_task *tctx = current->io_uring;
/* no tctx means this task never registered any ring fds */
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
return ERR_PTR(-EINVAL);
/* clamp index under speculation; range was validated above */
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
/*
 * No reference is taken here: the entry lives in the task's
 * private array, hence the "no fput() for registered files"
 * rule in the function comment above.
 */
file = tctx->registered_rings[fd];
} else {
file = fget(fd);
}
if (unlikely(!file))
return ERR_PTR(-EBADF);
if (io_is_uring_fops(file))
return file;
/*
 * Not an io_uring file: drop the fget() reference and reject.
 * NOTE(review): assumes registered_rings[] entries are always
 * io_uring files, so this fput() only runs for the fget() path
 * — confirm against the registration code.
 */
fput(file);
return ERR_PTR(-EOPNOTSUPP);
}
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
u32, min_complete, u32, flags, const void __user *, argp,
size_t, argsz)
@@ -2550,28 +2591,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
if (unlikely(flags & ~IORING_ENTER_FLAGS))
return -EINVAL;
/*
* Ring fd has been registered via IORING_REGISTER_RING_FDS, we
* need only dereference our task private array to find it.
*/
if (flags & IORING_ENTER_REGISTERED_RING) {
struct io_uring_task *tctx = current->io_uring;
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
return -EINVAL;
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
file = tctx->registered_rings[fd];
if (unlikely(!file))
return -EBADF;
} else {
file = fget(fd);
if (unlikely(!file))
return -EBADF;
ret = -EOPNOTSUPP;
if (unlikely(!io_is_uring_fops(file)))
goto out;
}
file = io_uring_ctx_get_file(fd, flags & IORING_ENTER_REGISTERED_RING);
if (IS_ERR(file))
return PTR_ERR(file);
ctx = file->private_data;
ret = -EBADFD;
/*
@@ -2581,6 +2603,11 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
if (unlikely(smp_load_acquire(&ctx->flags) & IORING_SETUP_R_DISABLED))
goto out;
if (io_has_loop_ops(ctx)) {
ret = io_run_loop(ctx);
goto out;
}
/*
* For SQ polling, the thread will do all submissions and completions.
* Just return the requested submit count, and wake the thread if
@@ -2610,7 +2637,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
goto out;
}
if (flags & IORING_ENTER_GETEVENTS) {
if (ctx->syscall_iopoll)
if (ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL)
goto iopoll_locked;
/*
* Ignore errors, we'll soon call io_cqring_wait() and
@@ -2625,7 +2652,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
if (flags & IORING_ENTER_GETEVENTS) {
int ret2;
if (ctx->syscall_iopoll) {
if (ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL) {
/*
* We disallow the app entering submit/complete with
* polling, but we still need to lock the ring to
@@ -2926,9 +2953,9 @@ static void io_ctx_restriction_clone(struct io_ring_ctx *ctx,
if (dst->bpf_filters)
WRITE_ONCE(ctx->bpf_filters, dst->bpf_filters->filters);
if (dst->op_registered)
ctx->op_restricted = 1;
ctx->int_flags |= IO_RING_F_OP_RESTRICTED;
if (dst->reg_registered)
ctx->reg_restricted = 1;
ctx->int_flags |= IO_RING_F_REG_RESTRICTED;
}
static __cold int io_uring_create(struct io_ctx_config *config)
@@ -2955,17 +2982,18 @@ static __cold int io_uring_create(struct io_ctx_config *config)
if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
!(ctx->flags & IORING_SETUP_IOPOLL))
ctx->task_complete = true;
ctx->int_flags |= IO_RING_F_TASK_COMPLETE;
if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL))
ctx->lockless_cq = true;
if ((ctx->int_flags & IO_RING_F_TASK_COMPLETE) ||
(ctx->flags & IORING_SETUP_IOPOLL))
ctx->int_flags |= IO_RING_F_LOCKLESS_CQ;
/*
* lazy poll_wq activation relies on ->task_complete for synchronisation
* purposes, see io_activate_pollwq()
*/
if (!ctx->task_complete)
ctx->poll_activated = true;
if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE))
ctx->int_flags |= IO_RING_F_POLL_ACTIVATED;
/*
* When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
@@ -2975,9 +3003,10 @@ static __cold int io_uring_create(struct io_ctx_config *config)
*/
if (ctx->flags & IORING_SETUP_IOPOLL &&
!(ctx->flags & IORING_SETUP_SQPOLL))
ctx->syscall_iopoll = 1;
ctx->int_flags |= IO_RING_F_SYSCALL_IOPOLL;
ctx->compat = in_compat_syscall();
if (in_compat_syscall())
ctx->int_flags |= IO_RING_F_COMPAT;
if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK))
ctx->user = get_uid(current_user());

View File

@@ -185,6 +185,7 @@ void io_req_track_inflight(struct io_kiocb *req);
struct file *io_file_get_normal(struct io_kiocb *req, int fd);
struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags);
struct file *io_uring_ctx_get_file(unsigned int fd, bool registered);
void io_req_task_queue(struct io_kiocb *req);
void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw);
@@ -223,7 +224,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
if (ctx->flags & IORING_SETUP_IOPOLL) {
lockdep_assert_held(&ctx->uring_lock);
} else if (!ctx->task_complete) {
} else if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) {
lockdep_assert_held(&ctx->completion_lock);
} else if (ctx->submitter_task) {
/*
@@ -240,7 +241,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
static inline bool io_is_compat(struct io_ring_ctx *ctx)
{
return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->compat);
return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->int_flags & IO_RING_F_COMPAT);
}
static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
@@ -494,10 +495,12 @@ static inline void io_req_complete_defer(struct io_kiocb *req)
wq_list_add_tail(&req->comp_list, &state->compl_reqs);
}
#define SHOULD_FLUSH_MASK (IO_RING_F_OFF_TIMEOUT_USED | \
IO_RING_F_HAS_EVFD | IO_RING_F_POLL_ACTIVATED)
static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
if (unlikely(ctx->off_timeout_used ||
ctx->has_evfd || ctx->poll_activated))
if (unlikely(data_race(ctx->int_flags) & SHOULD_FLUSH_MASK))
__io_commit_cqring_flush(ctx);
}

View File

@@ -230,7 +230,7 @@ struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len,
struct io_br_sel sel = { };
struct io_buffer_list *bl;
io_ring_submit_lock(req->ctx, issue_flags);
io_ring_submit_lock(ctx, issue_flags);
bl = io_buffer_get_list(ctx, buf_group);
if (likely(bl)) {
@@ -239,7 +239,7 @@ struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len,
else
sel.addr = io_provided_buffer_select(req, len, bl);
}
io_ring_submit_unlock(req->ctx, issue_flags);
io_ring_submit_unlock(ctx, issue_flags);
return sel;
}

91
io_uring/loop.c Normal file
View File

@@ -0,0 +1,91 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include "io_uring.h"
#include "wait.h"
#include "loop.h"
static inline int io_loop_nr_cqes(const struct io_ring_ctx *ctx,
const struct iou_loop_params *lp)
{
return lp->cq_wait_idx - READ_ONCE(ctx->rings->cq.tail);
}
/*
 * Arm the CQ wait: publish how many completions we're waiting for and mark
 * the task as about to sleep. Paired with io_loop_wait_finish().
 */
static inline void io_loop_wait_start(struct io_ring_ctx *ctx, unsigned nr_wait)
{
/* make the wait threshold visible to CQE posters / wakers */
atomic_set(&ctx->cq_wait_nr, nr_wait);
/*
 * NOTE(review): ordering of the atomic_set vs set_current_state looks
 * intentional so a waker observing cq_wait_nr finds us sleepable —
 * confirm against the wakeup side before reordering.
 */
set_current_state(TASK_INTERRUPTIBLE);
}
/*
 * Disarm the CQ wait: restore the running state and reset cq_wait_nr to its
 * idle sentinel so wakers no longer consider this task waiting.
 */
static inline void io_loop_wait_finish(struct io_ring_ctx *ctx)
{
__set_current_state(TASK_RUNNING);
atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
}
static void io_loop_wait(struct io_ring_ctx *ctx, struct iou_loop_params *lp,
unsigned nr_wait)
{
io_loop_wait_start(ctx, nr_wait);
if (unlikely(io_local_work_pending(ctx) ||
io_loop_nr_cqes(ctx, lp) <= 0) ||
READ_ONCE(ctx->check_cq)) {
io_loop_wait_finish(ctx);
return;
}
mutex_unlock(&ctx->uring_lock);
schedule();
io_loop_wait_finish(ctx);
mutex_lock(&ctx->uring_lock);
}
/*
 * Core callback-driven event loop. Repeatedly invokes the installed
 * ->loop_step callback, waiting for CQEs and draining task work between
 * iterations, until the callback asks to stop or an error occurs.
 * Called with ctx->uring_lock held; may drop/reacquire it internally.
 */
static int __io_run_loop(struct io_ring_ctx *ctx)
{
struct iou_loop_params lp = {};
while (true) {
int nr_wait, step_res;
/* callback may be torn down (e.g. BPF unregister) mid-loop */
if (unlikely(!ctx->loop_step))
return -EFAULT;
step_res = ctx->loop_step(ctx, &lp);
if (step_res == IOU_LOOP_STOP)
break;
/* any value other than CONTINUE/STOP is a contract violation */
if (step_res != IOU_LOOP_CONTINUE)
return -EINVAL;
/* sleep only if the callback's CQE target isn't met yet */
nr_wait = io_loop_nr_cqes(ctx, &lp);
if (nr_wait > 0)
io_loop_wait(ctx, &lp, nr_wait);
else
nr_wait = 0;
/* task_work must run without uring_lock held */
if (task_work_pending(current)) {
mutex_unlock(&ctx->uring_lock);
io_run_task_work();
mutex_lock(&ctx->uring_lock);
}
/* let signals interrupt the loop, like a normal waiting syscall */
if (unlikely(task_sigpending(current)))
return -EINTR;
io_run_local_work_locked(ctx, nr_wait);
/* flush overflowed CQEs so the callback sees forward progress */
if (READ_ONCE(ctx->check_cq) & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
io_cqring_overflow_flush_locked(ctx);
}
return 0;
}
/*
 * Entry point for the callback-driven main loop, invoked from
 * io_uring_enter() when loop ops are installed. Takes uring_lock around
 * the core loop; refuses to run from a task that may not execute this
 * ring's task work.
 */
int io_run_loop(struct io_ring_ctx *ctx)
{
	int err;

	if (!io_allowed_run_tw(ctx))
		return -EEXIST;

	mutex_lock(&ctx->uring_lock);
	err = __io_run_loop(ctx);
	mutex_unlock(&ctx->uring_lock);

	return err;
}

27
io_uring/loop.h Normal file
View File

@@ -0,0 +1,27 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef IOU_LOOP_H
#define IOU_LOOP_H
#include <linux/io_uring_types.h>
/* Per-iteration state shared between the core loop and the step callback. */
struct iou_loop_params {
/*
 * The CQE index to wait for. Only serves as a hint and can still be
 * woken up earlier.
 */
__u32 cq_wait_idx;
};
/* Return values for the ->loop_step callback. */
enum {
IOU_LOOP_CONTINUE = 0, /* keep iterating the loop */
IOU_LOOP_STOP, /* terminate io_run_loop() with success */
};
/* True if a custom loop-step callback is installed on this ring. */
static inline bool io_has_loop_ops(struct io_ring_ctx *ctx)
{
/* lockless read; data_race() marks the unsynchronized check as intentional */
return data_race(ctx->loop_step);
}
int io_run_loop(struct io_ring_ctx *ctx);
#endif

View File

@@ -67,7 +67,7 @@ void io_msg_ring_cleanup(struct io_kiocb *req)
static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx)
{
return target_ctx->task_complete;
return target_ctx->int_flags & IO_RING_F_TASK_COMPLETE;
}
static void io_msg_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw)

View File

@@ -375,10 +375,13 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
kmsg->msg.msg_namelen = addr_len;
}
if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
if (sr->flags & IORING_SEND_VECTORIZED)
return -EINVAL;
req->flags |= REQ_F_IMPORT_BUFFER;
return 0;
if (!(sr->flags & IORING_SEND_VECTORIZED)) {
req->flags |= REQ_F_IMPORT_BUFFER;
return 0;
}
kmsg->msg.msg_iter.nr_segs = sr->len;
return io_prep_reg_iovec(req, &kmsg->vec, sr->buf, sr->len);
}
if (req->flags & REQ_F_BUFFER_SELECT)
return 0;
@@ -396,6 +399,7 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe
struct user_msghdr msg;
int ret;
sr->flags |= IORING_SEND_VECTORIZED;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL);
if (unlikely(ret))
@@ -1333,11 +1337,12 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_ring_ctx *ctx = req->ctx;
struct io_async_msghdr *iomsg;
struct io_kiocb *notif;
u64 user_data;
int ret;
zc->done_io = 0;
if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
if (unlikely(READ_ONCE(sqe->__pad2[0])))
return -EINVAL;
/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
if (req->flags & REQ_F_CQE_SKIP)
@@ -1346,7 +1351,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
notif = zc->notif = io_alloc_notif(ctx);
if (!notif)
return -ENOMEM;
notif->cqe.user_data = req->cqe.user_data;
user_data = READ_ONCE(sqe->addr3);
if (!user_data)
user_data = req->cqe.user_data;
notif->cqe.user_data = user_data;
notif->cqe.res = 0;
notif->cqe.flags = IORING_CQE_F_NOTIF;
req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY;
@@ -1370,7 +1379,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (zc->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
if (io_is_compat(req->ctx))
if (io_is_compat(ctx))
zc->msg_flags |= MSG_CMSG_COMPAT;
iomsg = io_msg_alloc_async(req);
@@ -1445,22 +1454,39 @@ static int io_sg_from_iter(struct sk_buff *skb,
return ret;
}
static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
static int io_send_zc_import(struct io_kiocb *req,
struct io_async_msghdr *kmsg,
unsigned int issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct io_async_msghdr *kmsg = req->async_data;
struct io_kiocb *notif = sr->notif;
int ret;
WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF));
sr->notif->buf_index = req->buf_index;
return io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter,
(u64)(uintptr_t)sr->buf, sr->len,
ITER_SOURCE, issue_flags);
notif->buf_index = req->buf_index;
if (!(sr->flags & IORING_SEND_VECTORIZED)) {
ret = io_import_reg_buf(notif, &kmsg->msg.msg_iter,
(u64)(uintptr_t)sr->buf, sr->len,
ITER_SOURCE, issue_flags);
} else {
unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter,
notif, &kmsg->vec, uvec_segs,
issue_flags);
}
if (unlikely(ret))
return ret;
req->flags &= ~REQ_F_IMPORT_BUFFER;
return 0;
}
int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct io_async_msghdr *kmsg = req->async_data;
struct socket *sock;
unsigned msg_flags;
@@ -1471,106 +1497,38 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
return -ENOTSOCK;
if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
return -EOPNOTSUPP;
if (!(req->flags & REQ_F_POLLED) &&
(zc->flags & IORING_RECVSEND_POLL_FIRST))
return -EAGAIN;
if (req->flags & REQ_F_IMPORT_BUFFER) {
req->flags &= ~REQ_F_IMPORT_BUFFER;
ret = io_send_zc_import(req, issue_flags);
if (unlikely(ret))
return ret;
}
msg_flags = zc->msg_flags;
if (issue_flags & IO_URING_F_NONBLOCK)
msg_flags |= MSG_DONTWAIT;
if (msg_flags & MSG_WAITALL)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
kmsg->msg.msg_flags = msg_flags;
kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
ret = sock_sendmsg(sock, &kmsg->msg);
if (unlikely(ret < min_ret)) {
if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
return -EAGAIN;
if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) {
zc->done_io += ret;
return -EAGAIN;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
req_set_fail(req);
}
if (ret >= 0)
ret += zc->done_io;
else if (zc->done_io)
ret = zc->done_io;
/*
* If we're in io-wq we can't rely on tw ordering guarantees, defer
* flushing notif to io_send_zc_cleanup()
*/
if (!(issue_flags & IO_URING_F_UNLOCKED)) {
io_notif_flush(zc->notif);
zc->notif = NULL;
io_req_msg_cleanup(req, 0);
}
io_req_set_res(req, ret, IORING_CQE_F_MORE);
return IOU_COMPLETE;
}
int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct io_async_msghdr *kmsg = req->async_data;
struct socket *sock;
unsigned flags;
int ret, min_ret = 0;
if (req->flags & REQ_F_IMPORT_BUFFER) {
unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
int ret;
sr->notif->buf_index = req->buf_index;
ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter,
sr->notif, &kmsg->vec, uvec_segs,
issue_flags);
if (unlikely(ret))
return ret;
req->flags &= ~REQ_F_IMPORT_BUFFER;
}
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
return -EOPNOTSUPP;
if (!(req->flags & REQ_F_POLLED) &&
(sr->flags & IORING_RECVSEND_POLL_FIRST))
return -EAGAIN;
flags = sr->msg_flags;
if (req->flags & REQ_F_IMPORT_BUFFER) {
ret = io_send_zc_import(req, kmsg, issue_flags);
if (unlikely(ret))
return ret;
}
msg_flags = sr->msg_flags;
if (issue_flags & IO_URING_F_NONBLOCK)
flags |= MSG_DONTWAIT;
if (flags & MSG_WAITALL)
msg_flags |= MSG_DONTWAIT;
if (msg_flags & MSG_WAITALL)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
kmsg->msg.msg_control_user = sr->msg_control;
kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
if (req->opcode == IORING_OP_SEND_ZC) {
msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
kmsg->msg.msg_flags = msg_flags;
ret = sock_sendmsg(sock, &kmsg->msg);
} else {
kmsg->msg.msg_control_user = sr->msg_control;
ret = __sys_sendmsg_sock(sock, &kmsg->msg, msg_flags);
}
if (unlikely(ret < min_ret)) {
if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
return -EAGAIN;
if (ret > 0 && io_net_retry(sock, flags)) {
if (ret > 0 && io_net_retry(sock, sr->msg_flags)) {
sr->done_io += ret;
return -EAGAIN;
}

View File

@@ -50,7 +50,6 @@ void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req)
int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_connect(struct io_kiocb *req, unsigned int issue_flags);
int io_send_zc(struct io_kiocb *req, unsigned int issue_flags);
int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags);
int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
void io_send_zc_cleanup(struct io_kiocb *req);

View File

@@ -67,7 +67,6 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
.vectored = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_readv,
@@ -82,7 +81,6 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
.vectored = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_writev,
@@ -102,7 +100,6 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_read_fixed,
.issue = io_read_fixed,
@@ -116,7 +113,6 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_write_fixed,
.issue = io_write_fixed,
@@ -250,7 +246,6 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_read,
.issue = io_read,
@@ -264,7 +259,6 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_write,
.issue = io_write,
@@ -423,7 +417,6 @@ const struct io_issue_def io_issue_defs[] = {
.needs_file = 1,
.plug = 1,
.iopoll = 1,
.iopoll_queue = 1,
.async_size = sizeof(struct io_async_cmd),
.prep = io_uring_cmd_prep,
.issue = io_uring_cmd,
@@ -437,7 +430,7 @@ const struct io_issue_def io_issue_defs[] = {
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep = io_send_zc_prep,
.issue = io_send_zc,
.issue = io_sendmsg_zc,
#else
.prep = io_eopnotsupp_prep,
#endif
@@ -556,7 +549,6 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
.vectored = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_readv_fixed,
@@ -571,7 +563,6 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
.vectored = 1,
.async_size = sizeof(struct io_async_rw),
.prep = io_prep_writev_fixed,
@@ -593,7 +584,6 @@ const struct io_issue_def io_issue_defs[] = {
.needs_file = 1,
.plug = 1,
.iopoll = 1,
.iopoll_queue = 1,
.is_128 = 1,
.async_size = sizeof(struct io_async_cmd),
.prep = io_uring_cmd_prep,

View File

@@ -25,8 +25,6 @@ struct io_issue_def {
unsigned poll_exclusive : 1;
/* skip auditing */
unsigned audit_skip : 1;
/* have to be put into the iopoll list */
unsigned iopoll_queue : 1;
/* vectored opcode, set if 1) vectored, and 2) handler needs to know */
unsigned vectored : 1;
/* set to 1 if this opcode uses 128b sqes in a mixed sq */

View File

@@ -277,8 +277,10 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw)
/* the mask was stashed in __io_poll_execute */
if (!req->cqe.res) {
struct poll_table_struct pt = { ._key = req->apoll_events };
req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
__poll_t events = req->apoll_events;
struct poll_table_struct pt = { ._key = events };
req->cqe.res = vfs_poll(req->file, &pt) & events;
/*
* We got woken with a mask, but someone else got to
* it first. The above vfs_poll() doesn't add us back
@@ -287,7 +289,7 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw)
*/
if (unlikely(!req->cqe.res)) {
/* Multishot armed need not reissue */
if (!(req->apoll_events & EPOLLONESHOT))
if (!(events & EPOLLONESHOT))
continue;
return IOU_POLL_REISSUE;
}

View File

@@ -34,12 +34,12 @@ static ssize_t io_query_zcrx(union io_query_data *data)
{
struct io_uring_query_zcrx *e = &data->zcrx;
e->register_flags = ZCRX_REG_IMPORT;
e->register_flags = ZCRX_SUPPORTED_REG_FLAGS;
e->area_flags = IORING_ZCRX_AREA_DMABUF;
e->nr_ctrl_opcodes = __ZCRX_CTRL_LAST;
e->rq_hdr_size = sizeof(struct io_uring);
e->rq_hdr_alignment = L1_CACHE_BYTES;
e->features = ZCRX_FEATURE_RX_PAGE_SIZE;
e->features = ZCRX_FEATURES;
e->__resv2 = 0;
return sizeof(*e);
}

View File

@@ -192,9 +192,9 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
return ret;
}
if (ctx->restrictions.op_registered)
ctx->op_restricted = 1;
ctx->int_flags |= IO_RING_F_OP_RESTRICTED;
if (ctx->restrictions.reg_registered)
ctx->reg_restricted = 1;
ctx->int_flags |= IO_RING_F_REG_RESTRICTED;
return 0;
}
@@ -392,7 +392,7 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
for (i = 0; i < ARRAY_SIZE(new_count); i++)
if (new_count[i])
ctx->iowq_limits[i] = new_count[i];
ctx->iowq_limits_set = true;
ctx->int_flags |= IO_RING_F_IOWQ_LIMITS_SET;
if (tctx && tctx->io_wq) {
ret = io_wq_max_workers(tctx->io_wq, new_count);
@@ -733,7 +733,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
if (ctx->submitter_task && ctx->submitter_task != current)
return -EEXIST;
if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
if ((ctx->int_flags & IO_RING_F_REG_RESTRICTED) && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
if (!test_bit(opcode, ctx->restrictions.register_op))
return -EACCES;
@@ -908,7 +908,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_zcrx_ifq(ctx, arg);
ret = io_register_zcrx(ctx, arg);
break;
case IORING_REGISTER_RESIZE_RINGS:
ret = -EINVAL;
@@ -946,40 +946,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
return ret;
}
/*
* Given an 'fd' value, return the ctx associated with it. If 'registered' is
* true, then the registered index is used. Otherwise, the normal fd table.
* Caller must call fput() on the returned file, unless it's an ERR_PTR.
*/
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
struct file *file;
if (registered) {
/*
* Ring fd has been registered via IORING_REGISTER_RING_FDS, we
* need only dereference our task private array to find it.
*/
struct io_uring_task *tctx = current->io_uring;
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
return ERR_PTR(-EINVAL);
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
file = tctx->registered_rings[fd];
if (file)
get_file(file);
} else {
file = fget(fd);
}
if (unlikely(!file))
return ERR_PTR(-EBADF);
if (io_is_uring_fops(file))
return file;
fput(file);
return ERR_PTR(-EOPNOTSUPP);
}
static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
{
struct io_uring_sqe sqe;
@@ -1034,7 +1000,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
if (fd == -1)
return io_uring_register_blind(opcode, arg, nr_args);
file = io_uring_register_get_file(fd, use_registered_ring);
file = io_uring_ctx_get_file(fd, use_registered_ring);
if (IS_ERR(file))
return PTR_ERR(file);
ctx = file->private_data;
@@ -1046,6 +1012,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
ctx->buf_table.nr, ret);
mutex_unlock(&ctx->uring_lock);
fput(file);
if (!use_registered_ring)
fput(file);
return ret;
}

View File

@@ -4,6 +4,5 @@
int io_eventfd_unregister(struct io_ring_ctx *ctx);
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
struct file *io_uring_register_get_file(unsigned int fd, bool registered);
#endif

View File

@@ -295,7 +295,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
u64 tag = 0;
uvec = u64_to_user_ptr(user_data);
iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx));
if (IS_ERR(iov)) {
err = PTR_ERR(iov);
break;
@@ -319,7 +319,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
io_reset_rsrc_node(ctx, &ctx->buf_table, i);
ctx->buf_table.nodes[i] = node;
if (ctx->compat)
if (io_is_compat(ctx))
user_data += sizeof(struct compat_iovec);
else
user_data += sizeof(struct iovec);
@@ -883,12 +883,12 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
if (arg) {
uvec = (struct iovec __user *) arg;
iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx));
if (IS_ERR(iov)) {
ret = PTR_ERR(iov);
break;
}
if (ctx->compat)
if (io_is_compat(ctx))
arg += sizeof(struct compat_iovec);
else
arg += sizeof(struct iovec);
@@ -961,7 +961,7 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
*/
imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq));
if (!imu) {
kfree(node);
io_cache_free(&ctx->node_cache, node);
ret = -ENOMEM;
goto unlock;
}
@@ -1273,7 +1273,7 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
return -EINVAL;
registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
file = io_uring_register_get_file(buf.src_fd, registered_src);
file = io_uring_ctx_get_file(buf.src_fd, registered_src);
if (IS_ERR(file))
return PTR_ERR(file);
@@ -1295,7 +1295,8 @@ out:
if (src_ctx != ctx)
mutex_unlock(&src_ctx->uring_lock);
fput(file);
if (!registered_src)
fput(file);
return ret;
}

View File

@@ -504,7 +504,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
if (!S_ISBLK(mode) && !S_ISREG(mode))
return false;
if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
!(ctx->flags & IORING_SETUP_IOPOLL)))
!(req->flags & REQ_F_IOPOLL)))
return false;
/*
* If ref is dying, we might be running poll reap from the exit work.
@@ -640,7 +640,7 @@ static inline void io_rw_done(struct io_kiocb *req, ssize_t ret)
}
}
if (req->ctx->flags & IORING_SETUP_IOPOLL)
if (req->flags & REQ_F_IOPOLL)
io_complete_rw_iopoll(&rw->kiocb, ret);
else
io_complete_rw(&rw->kiocb, ret);
@@ -654,7 +654,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
if (ret >= 0 && req->flags & REQ_F_CUR_POS)
req->file->f_pos = rw->kiocb.ki_pos;
if (ret >= 0 && !(req->ctx->flags & IORING_SETUP_IOPOLL)) {
if (ret >= 0 && !(req->flags & REQ_F_IOPOLL)) {
u32 cflags = 0;
__io_complete_rw_common(req, ret);
@@ -876,6 +876,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
if (ctx->flags & IORING_SETUP_IOPOLL) {
if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
return -EOPNOTSUPP;
req->flags |= REQ_F_IOPOLL;
kiocb->private = NULL;
kiocb->ki_flags |= IOCB_HIPRI;
req->iopoll_completed = 0;
@@ -899,7 +900,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
* We have a union of meta fields with wpq used for buffered-io
* in io_async_rw, so fail it here.
*/
if (!(req->file->f_flags & O_DIRECT))
if (!(file->f_flags & O_DIRECT))
return -EOPNOTSUPP;
kiocb->ki_flags |= IOCB_HAS_METADATA;
kiocb->private = &io->meta;
@@ -961,13 +962,13 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel,
if (ret == -EAGAIN) {
/* If we can poll, just do that. */
if (io_file_can_poll(req))
return -EAGAIN;
return ret;
/* IOPOLL retry should happen for io-wq threads */
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
goto done;
if (!force_nonblock && !(req->flags & REQ_F_IOPOLL))
return ret;
/* no retry on NONBLOCK nor RWF_NOWAIT */
if (req->flags & REQ_F_NOWAIT)
goto done;
return ret;
ret = 0;
} else if (ret == -EIOCBQUEUED) {
return IOU_ISSUE_SKIP_COMPLETE;
@@ -975,7 +976,7 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel,
(req->flags & REQ_F_NOWAIT) || !need_complete_io(req) ||
(issue_flags & IO_URING_F_MULTISHOT)) {
/* read all, failed, already did sync or don't want to retry */
goto done;
return ret;
}
/*
@@ -1018,8 +1019,7 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel,
kiocb->ki_flags &= ~IOCB_WAITQ;
iov_iter_restore(&io->iter, &io->iter_state);
} while (ret > 0);
done:
/* it's faster to check here than delegate to kfree */
return ret;
}
@@ -1188,7 +1188,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
goto done;
if (!force_nonblock || ret2 != -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
if (ret2 == -EAGAIN && (req->flags & REQ_F_IOPOLL))
goto ret_eagain;
if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {

View File

@@ -458,6 +458,7 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
return -EINVAL;
}
if (ctx->flags & IORING_SETUP_SQPOLL) {
struct io_uring_task *tctx;
struct task_struct *tsk;
struct io_sq_data *sqd;
bool attached;
@@ -524,8 +525,13 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
rcu_assign_pointer(sqd->thread, tsk);
mutex_unlock(&sqd->lock);
ret = 0;
get_task_struct(tsk);
ret = io_uring_alloc_task_context(tsk, ctx);
tctx = io_uring_alloc_task_context(tsk, ctx);
if (!IS_ERR(tctx))
tsk->io_uring = tctx;
else
ret = PTR_ERR(tctx);
wake_up_new_task(tsk);
if (ret)
goto err;

View File

@@ -74,20 +74,20 @@ void __io_uring_free(struct task_struct *tsk)
}
}
__cold int io_uring_alloc_task_context(struct task_struct *task,
struct io_ring_ctx *ctx)
__cold struct io_uring_task *io_uring_alloc_task_context(struct task_struct *task,
struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx;
int ret;
tctx = kzalloc_obj(*tctx);
if (unlikely(!tctx))
return -ENOMEM;
return ERR_PTR(-ENOMEM);
ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
if (unlikely(ret)) {
kfree(tctx);
return ret;
return ERR_PTR(ret);
}
tctx->io_wq = io_init_wq_offload(ctx, task);
@@ -95,7 +95,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
ret = PTR_ERR(tctx->io_wq);
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
return ret;
return ERR_PTR(ret);
}
tctx->task = task;
@@ -103,31 +103,56 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
init_waitqueue_head(&tctx->wait);
atomic_set(&tctx->in_cancel, 0);
atomic_set(&tctx->inflight_tracked, 0);
task->io_uring = tctx;
init_llist_head(&tctx->task_list);
init_task_work(&tctx->task_work, tctx_task_work);
return tctx;
}
/*
 * Link a task context to a ring: allocate an io_tctx_node, index it in the
 * tctx's xarray (keyed by ctx pointer) and add it to the ring's tctx_list.
 * Idempotent — returns 0 immediately if the node already exists.
 */
static int io_tctx_install_node(struct io_ring_ctx *ctx,
struct io_uring_task *tctx)
{
struct io_tctx_node *node;
int ret;
/* already linked to this ring, nothing to do */
if (xa_load(&tctx->xa, (unsigned long)ctx))
return 0;
node = kmalloc_obj(*node);
if (!node)
return -ENOMEM;
node->ctx = ctx;
node->task = current;
ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
node, GFP_KERNEL));
if (ret) {
/* xa_store failed; node was never published, safe to free */
kfree(node);
return ret;
}
/* ctx_node list is protected by the ring's tctx_lock */
mutex_lock(&ctx->tctx_lock);
list_add(&node->ctx_node, &ctx->tctx_list);
mutex_unlock(&ctx->tctx_lock);
return 0;
}
int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_node *node;
int ret;
if (unlikely(!tctx)) {
ret = io_uring_alloc_task_context(current, ctx);
if (unlikely(ret))
return ret;
tctx = io_uring_alloc_task_context(current, ctx);
if (IS_ERR(tctx))
return PTR_ERR(tctx);
tctx = current->io_uring;
if (ctx->iowq_limits_set) {
if (ctx->int_flags & IO_RING_F_IOWQ_LIMITS_SET) {
unsigned int limits[2] = { ctx->iowq_limits[0],
ctx->iowq_limits[1], };
ret = io_wq_max_workers(tctx->io_wq, limits);
if (ret)
return ret;
goto err_free;
}
}
@@ -138,25 +163,19 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
*/
if (tctx->io_wq)
io_wq_set_exit_on_idle(tctx->io_wq, false);
if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
node = kmalloc_obj(*node);
if (!node)
return -ENOMEM;
node->ctx = ctx;
node->task = current;
ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
node, GFP_KERNEL));
if (ret) {
kfree(node);
return ret;
}
mutex_lock(&ctx->tctx_lock);
list_add(&node->ctx_node, &ctx->tctx_list);
mutex_unlock(&ctx->tctx_lock);
ret = io_tctx_install_node(ctx, tctx);
if (!ret) {
current->io_uring = tctx;
return 0;
}
return 0;
if (!current->io_uring) {
err_free:
io_wq_put_and_exit(tctx->io_wq);
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
}
return ret;
}
int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx)

View File

@@ -6,8 +6,8 @@ struct io_tctx_node {
struct io_ring_ctx *ctx;
};
int io_uring_alloc_task_context(struct task_struct *task,
struct io_ring_ctx *ctx);
struct io_uring_task *io_uring_alloc_task_context(struct task_struct *task,
struct io_ring_ctx *ctx);
void io_uring_del_tctx_node(unsigned long index);
int __io_uring_add_tctx_node(struct io_ring_ctx *ctx);
int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx);

View File

@@ -30,11 +30,30 @@ struct io_timeout_rem {
u64 addr;
/* timeout update */
struct timespec64 ts;
ktime_t time;
u32 flags;
bool ltimeout;
};
/*
 * Convert a user-supplied timeout argument into a ktime_t.
 *
 * With IORING_TIMEOUT_IMMEDIATE_ARG, @arg carries the timeout in
 * nanoseconds directly in the SQE; otherwise @arg is a user pointer to
 * a struct timespec64 which is copied in.  Negative times are rejected
 * with -EINVAL; a faulting copy from userspace returns -EFAULT.
 */
static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags)
{
struct timespec64 ts;
if (flags & IORING_TIMEOUT_IMMEDIATE_ARG) {
*time = ns_to_ktime(arg);
/* a negative result means the u64 overflowed the signed ktime_t */
if (*time < 0)
return -EINVAL;
return 0;
}
if (get_timespec64(&ts, u64_to_user_ptr(arg)))
return -EFAULT;
if (ts.tv_sec < 0 || ts.tv_nsec < 0)
return -EINVAL;
*time = timespec64_to_ktime(ts);
return 0;
}
static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
struct io_kiocb *link);
@@ -80,7 +99,7 @@ static void io_timeout_complete(struct io_tw_req tw_req, io_tw_token_t tw)
/* re-arm timer */
raw_spin_lock_irq(&ctx->timeout_lock);
list_add(&timeout->list, ctx->timeout_list.prev);
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
hrtimer_start(&data->timer, data->time, data->mode);
raw_spin_unlock_irq(&ctx->timeout_lock);
return;
}
@@ -265,8 +284,8 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
raw_spin_lock_irqsave(&ctx->timeout_lock, flags);
list_del_init(&timeout->list);
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
atomic_set(&ctx->cq_timeouts,
atomic_read(&ctx->cq_timeouts) + 1);
raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags);
if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
@@ -395,7 +414,7 @@ static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
}
static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
struct timespec64 *ts, enum hrtimer_mode mode)
ktime_t ts, enum hrtimer_mode mode)
__must_hold(&ctx->timeout_lock)
{
struct io_timeout_data *io;
@@ -417,12 +436,12 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
if (hrtimer_try_to_cancel(&io->timer) == -1)
return -EALREADY;
hrtimer_setup(&io->timer, io_link_timeout_fn, io_timeout_get_clock(io), mode);
hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
hrtimer_start(&io->timer, ts, mode);
return 0;
}
static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
struct timespec64 *ts, enum hrtimer_mode mode)
ktime_t time, enum hrtimer_mode mode)
__must_hold(&ctx->timeout_lock)
{
struct io_cancel_data cd = { .ctx = ctx, .data = user_data, };
@@ -435,20 +454,23 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
timeout->off = 0; /* noseq */
data = req->async_data;
data->ts = *ts;
data->time = time;
list_add_tail(&timeout->list, &ctx->timeout_list);
hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), mode);
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), mode);
hrtimer_start(&data->timer, data->time, mode);
return 0;
}
int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_timeout_rem *tr = io_kiocb_to_cmd(req, struct io_timeout_rem);
int ret;
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
if (sqe->addr3 || sqe->__pad2[0])
return -EINVAL;
if (sqe->buf_index || sqe->len || sqe->splice_fd_in)
return -EINVAL;
@@ -460,12 +482,13 @@ int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL;
if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
tr->ltimeout = true;
if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
return -EINVAL;
if (get_timespec64(&tr->ts, u64_to_user_ptr(READ_ONCE(sqe->addr2))))
return -EFAULT;
if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK |
IORING_TIMEOUT_ABS |
IORING_TIMEOUT_IMMEDIATE_ARG))
return -EINVAL;
ret = io_parse_user_time(&tr->time, READ_ONCE(sqe->addr2), tr->flags);
if (ret)
return ret;
} else if (tr->flags) {
/* timeout removal doesn't support flags */
return -EINVAL;
@@ -500,9 +523,9 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
raw_spin_lock_irq(&ctx->timeout_lock);
if (tr->ltimeout)
ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
ret = io_linked_timeout_update(ctx, tr->addr, tr->time, mode);
else
ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
ret = io_timeout_update(ctx, tr->addr, tr->time, mode);
raw_spin_unlock_irq(&ctx->timeout_lock);
}
@@ -520,7 +543,10 @@ static int __io_timeout_prep(struct io_kiocb *req,
struct io_timeout_data *data;
unsigned flags;
u32 off = READ_ONCE(sqe->off);
int ret;
if (sqe->addr3 || sqe->__pad2[0])
return -EINVAL;
if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in)
return -EINVAL;
if (off && is_timeout_link)
@@ -528,7 +554,8 @@ static int __io_timeout_prep(struct io_kiocb *req,
flags = READ_ONCE(sqe->timeout_flags);
if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
IORING_TIMEOUT_ETIME_SUCCESS |
IORING_TIMEOUT_MULTISHOT))
IORING_TIMEOUT_MULTISHOT |
IORING_TIMEOUT_IMMEDIATE_ARG))
return -EINVAL;
/* more than one clock specified is invalid, obviously */
if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
@@ -539,8 +566,8 @@ static int __io_timeout_prep(struct io_kiocb *req,
INIT_LIST_HEAD(&timeout->list);
timeout->off = off;
if (unlikely(off && !req->ctx->off_timeout_used))
req->ctx->off_timeout_used = true;
if (unlikely(off && !(req->ctx->int_flags & IO_RING_F_OFF_TIMEOUT_USED)))
req->ctx->int_flags |= IO_RING_F_OFF_TIMEOUT_USED;
/*
* for multishot reqs w/ fixed nr of repeats, repeats tracks the
* remaining nr
@@ -557,11 +584,9 @@ static int __io_timeout_prep(struct io_kiocb *req,
data->req = req;
data->flags = flags;
if (get_timespec64(&data->ts, u64_to_user_ptr(READ_ONCE(sqe->addr))))
return -EFAULT;
if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
return -EINVAL;
ret = io_parse_user_time(&data->time, READ_ONCE(sqe->addr), flags);
if (ret)
return ret;
data->mode = io_translate_timeout_mode(flags);
@@ -637,7 +662,7 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
}
add:
list_add(&timeout->list, entry);
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
hrtimer_start(&data->timer, data->time, data->mode);
raw_spin_unlock_irq(&ctx->timeout_lock);
return IOU_ISSUE_SKIP_COMPLETE;
}
@@ -655,8 +680,7 @@ void io_queue_linked_timeout(struct io_kiocb *req)
if (timeout->head) {
struct io_timeout_data *data = req->async_data;
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
data->mode);
hrtimer_start(&data->timer, data->time, data->mode);
list_add_tail(&timeout->list, &ctx->ltimeout_list);
}
raw_spin_unlock_irq(&ctx->timeout_lock);

View File

@@ -3,7 +3,7 @@
struct io_timeout_data {
struct io_kiocb *req;
struct hrtimer timer;
struct timespec64 ts;
ktime_t time;
enum hrtimer_mode mode;
u32 flags;
};

View File

@@ -222,7 +222,7 @@ void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
if (!head) {
io_ctx_mark_taskrun(ctx);
if (ctx->has_evfd)
if (data_race(ctx->int_flags) & IO_RING_F_HAS_EVFD)
io_eventfd_signal(ctx, false);
}

View File

@@ -110,7 +110,7 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
* because iopoll completion data overlaps with the hash_node used
* for tracking.
*/
if (ctx->flags & IORING_SETUP_IOPOLL)
if (req->flags & REQ_F_IOPOLL)
return;
if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) {
@@ -167,7 +167,7 @@ void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2,
io_req_set_cqe32_extra(req, res2, 0);
}
io_req_uring_cleanup(req, issue_flags);
if (req->ctx->flags & IORING_SETUP_IOPOLL) {
if (req->flags & REQ_F_IOPOLL) {
/* order with io_iopoll_req_issued() checking ->iopoll_complete */
smp_store_release(&req->iopoll_completed, 1);
} else if (issue_flags & IO_URING_F_COMPLETE_DEFER) {
@@ -257,9 +257,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
issue_flags |= IO_URING_F_CQE32;
if (io_is_compat(ctx))
issue_flags |= IO_URING_F_COMPAT;
if (ctx->flags & IORING_SETUP_IOPOLL) {
if (!file->f_op->uring_cmd_iopoll)
return -EOPNOTSUPP;
if (ctx->flags & IORING_SETUP_IOPOLL && file->f_op->uring_cmd_iopoll) {
req->flags |= REQ_F_IOPOLL;
issue_flags |= IO_URING_F_IOPOLL;
req->iopoll_completed = 0;
if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {

View File

@@ -25,6 +25,7 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
struct ext_arg *ext_arg);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx);
void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx);
static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{

View File

@@ -63,7 +63,7 @@ static int io_area_max_shift(struct io_zcrx_mem *mem)
unsigned i;
for_each_sgtable_dma_sg(sgt, sg, i)
shift = min(shift, __ffs(sg->length));
shift = min(shift, __ffs(sg_dma_len(sg)));
return shift;
}
@@ -127,10 +127,10 @@ static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
int dmabuf_fd = area_reg->dmabuf_fd;
int i, ret;
if (!ifq->dev)
return -EINVAL;
if (off)
return -EINVAL;
if (WARN_ON_ONCE(!ifq->dev))
return -EFAULT;
if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
return -EINVAL;
@@ -194,6 +194,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
{
struct page **pages;
int nr_pages, ret;
bool mapped = false;
if (area_reg->dmabuf_fd)
return -EINVAL;
@@ -207,22 +208,37 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
ret = sg_alloc_table_from_pages(&mem->page_sg_table, pages, nr_pages,
0, (unsigned long)nr_pages << PAGE_SHIFT,
GFP_KERNEL_ACCOUNT);
if (ret) {
unpin_user_pages(pages, nr_pages);
kvfree(pages);
return ret;
if (ret)
goto out_err;
if (ifq->dev) {
ret = dma_map_sgtable(ifq->dev, &mem->page_sg_table,
DMA_FROM_DEVICE, IO_DMA_ATTR);
if (ret < 0)
goto out_err;
mapped = true;
}
mem->account_pages = io_count_account_pages(pages, nr_pages);
ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages);
if (ret < 0)
if (ret < 0) {
mem->account_pages = 0;
goto out_err;
}
mem->sgt = &mem->page_sg_table;
mem->pages = pages;
mem->nr_folios = nr_pages;
mem->size = area_reg->len;
return ret;
out_err:
if (mapped)
dma_unmap_sgtable(ifq->dev, &mem->page_sg_table,
DMA_FROM_DEVICE, IO_DMA_ATTR);
sg_free_table(&mem->page_sg_table);
unpin_user_pages(pages, nr_pages);
kvfree(pages);
return ret;
}
static void io_release_area_mem(struct io_zcrx_mem *mem)
@@ -273,8 +289,10 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
return;
area->is_mapped = false;
for (i = 0; i < area->nia.num_niovs; i++)
net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
if (area->nia.niovs) {
for (i = 0; i < area->nia.num_niovs; i++)
net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
}
if (area->mem.is_dmabuf) {
io_release_dmabuf(&area->mem);
@@ -284,45 +302,23 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
}
}
static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
int ret;
guard(mutex)(&ifq->pp_lock);
if (area->is_mapped)
return 0;
if (!area->mem.is_dmabuf) {
ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table,
DMA_FROM_DEVICE, IO_DMA_ATTR);
if (ret < 0)
return ret;
}
ret = io_populate_area_dma(ifq, area);
if (ret && !area->mem.is_dmabuf)
dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table,
DMA_FROM_DEVICE, IO_DMA_ATTR);
if (ret == 0)
area->is_mapped = true;
return ret;
}
/*
 * DMA-sync a batch of netmems for device access before they are handed
 * to the page pool.  Compiled out unless the platform can need explicit
 * syncing (CONFIG_DMA_NEED_SYNC), and a runtime no-op when the device
 * does not require it.
 */
static void zcrx_sync_for_device(struct page_pool *pp, struct io_zcrx_ifq *zcrx,
				 netmem_ref *netmems, unsigned nr)
{
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
	struct device *dev = pp->p.dev;
	dma_addr_t dma_addr;
	unsigned i, niov_size;

	if (!dma_dev_need_sync(dev))
		return;

	/* all niovs of an ifq share the same size */
	niov_size = 1U << zcrx->niov_shift;
	for (i = 0; i < nr; i++) {
		dma_addr = page_pool_get_dma_addr_netmem(netmems[i]);
		__dma_sync_single_for_device(dev, dma_addr + pp->p.offset,
					     niov_size, pp->p.dma_dir);
	}
#endif
}
@@ -390,24 +386,24 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
return -EINVAL;
mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
mmap_offset += id << IORING_OFF_PBUF_SHIFT;
mmap_offset += (u64)id << IORING_OFF_ZCRX_SHIFT;
ret = io_create_region(ctx, &ifq->region, rd, mmap_offset);
ret = io_create_region(ctx, &ifq->rq_region, rd, mmap_offset);
if (ret < 0)
return ret;
ptr = io_region_get_ptr(&ifq->region);
ifq->rq_ring = (struct io_uring *)ptr;
ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
ptr = io_region_get_ptr(&ifq->rq_region);
ifq->rq.ring = (struct io_uring *)ptr;
ifq->rq.rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
return 0;
}
static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
{
io_free_region(ifq->user, &ifq->region);
ifq->rq_ring = NULL;
ifq->rqes = NULL;
io_free_region(ifq->user, &ifq->rq_region);
ifq->rq.ring = NULL;
ifq->rq.rqes = NULL;
}
static void io_zcrx_free_area(struct io_zcrx_ifq *ifq,
@@ -429,8 +425,13 @@ static void io_zcrx_free_area(struct io_zcrx_ifq *ifq,
static int io_zcrx_append_area(struct io_zcrx_ifq *ifq,
struct io_zcrx_area *area)
{
if (ifq->area)
bool kern_readable = !area->mem.is_dmabuf;
if (WARN_ON_ONCE(ifq->area))
return -EINVAL;
if (WARN_ON_ONCE(ifq->kern_readable != kern_readable))
return -EINVAL;
ifq->area = area;
return 0;
}
@@ -450,6 +451,8 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
return -EINVAL;
buf_size_shift = ilog2(reg->rx_buf_len);
}
if (!ifq->dev && buf_size_shift != PAGE_SHIFT)
return -EOPNOTSUPP;
ret = -ENOMEM;
area = kzalloc_obj(*area);
@@ -460,8 +463,10 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
ret = io_import_area(ifq, &area->mem, area_reg);
if (ret)
goto err;
if (ifq->dev)
area->is_mapped = true;
if (buf_size_shift > io_area_max_shift(&area->mem)) {
if (ifq->dev && buf_size_shift > io_area_max_shift(&area->mem)) {
ret = -ERANGE;
goto err;
}
@@ -495,6 +500,12 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
niov->type = NET_IOV_IOURING;
}
if (ifq->dev) {
ret = io_populate_area_dma(ifq, area);
if (ret)
goto err;
}
area->free_count = nr_iovs;
/* we're only supporting one area per ifq for now */
area->area_id = 0;
@@ -519,7 +530,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
return NULL;
ifq->if_rxq = -1;
spin_lock_init(&ifq->rq_lock);
spin_lock_init(&ifq->rq.lock);
mutex_init(&ifq->pp_lock);
refcount_set(&ifq->refs, 1);
refcount_set(&ifq->user_refs, 1);
@@ -586,9 +597,21 @@ static void io_zcrx_return_niov_freelist(struct net_iov *niov)
{
struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
spin_lock_bh(&area->freelist_lock);
guard(spinlock_bh)(&area->freelist_lock);
area->freelist[area->free_count++] = net_iov_idx(niov);
spin_unlock_bh(&area->freelist_lock);
}
/*
 * Pop one free niov off the area's freelist, or NULL if it is empty.
 * The caller must hold area->freelist_lock.
 */
static struct net_iov *zcrx_get_free_niov(struct io_zcrx_area *area)
{
	unsigned idx;

	lockdep_assert_held(&area->freelist_lock);

	if (unlikely(!area->free_count))
		return NULL;

	area->free_count--;
	idx = area->freelist[area->free_count];
	return &area->nia.niovs[idx];
}
static void io_zcrx_return_niov(struct net_iov *niov)
@@ -624,12 +647,17 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
}
}
static void zcrx_unregister(struct io_zcrx_ifq *ifq)
static void zcrx_unregister_user(struct io_zcrx_ifq *ifq)
{
if (refcount_dec_and_test(&ifq->user_refs)) {
io_close_queue(ifq);
io_zcrx_scrub(ifq);
}
}
static void zcrx_unregister(struct io_zcrx_ifq *ifq)
{
zcrx_unregister_user(ifq);
io_put_zcrx_ifq(ifq);
}
@@ -640,7 +668,7 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
lockdep_assert_held(&ctx->mmap_lock);
return ifq ? &ifq->region : NULL;
return ifq ? &ifq->rq_region : NULL;
}
static int zcrx_box_release(struct inode *inode, struct file *file)
@@ -751,10 +779,50 @@ err:
return ret;
}
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg)
/*
 * Bind a zcrx ifq to a netdev RX queue and install the io_uring page
 * pool memory provider on it.
 *
 * Looks up the netdev by @reg->if_idx with its instance lock held
 * (paired with netdev_unlock() at the end, taken on success and error
 * alike), takes a tracked reference, resolves the queue's DMA device,
 * creates the buffer area and opens the memory-provider-backed RX
 * queue.  Returns 0 on success or a negative error code.
 */
static int zcrx_register_netdev(struct io_zcrx_ifq *ifq,
struct io_uring_zcrx_ifq_reg *reg,
struct io_uring_zcrx_area_reg *area)
{
struct pp_memory_provider_params mp_param = {};
unsigned if_rxq = reg->if_rxq;
int ret;
/* returns the netdev locked; unlocked at netdev_put_unlock below */
ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns,
reg->if_idx);
if (!ifq->netdev)
return -ENODEV;
netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL);
/* the queue must expose a DMA device for zero-copy RX mappings */
ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, if_rxq);
if (!ifq->dev) {
ret = -EOPNOTSUPP;
goto netdev_put_unlock;
}
get_device(ifq->dev);
ret = io_zcrx_create_area(ifq, area, reg);
if (ret)
goto netdev_put_unlock;
/* only pass an explicit page size if the user requested one */
if (reg->rx_buf_len)
mp_param.rx_page_size = 1U << ifq->niov_shift;
mp_param.mp_ops = &io_uring_pp_zc_ops;
mp_param.mp_priv = ifq;
ret = __net_mp_open_rxq(ifq->netdev, if_rxq, &mp_param, NULL);
if (ret)
goto netdev_put_unlock;
ifq->if_rxq = if_rxq;
ret = 0;
netdev_put_unlock:
netdev_unlock(ifq->netdev);
return ret;
}
int io_register_zcrx(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg)
{
struct io_uring_zcrx_area_reg area;
struct io_uring_zcrx_ifq_reg reg;
struct io_uring_region_desc rd;
@@ -778,11 +846,15 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
return -EFAULT;
if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) || reg.zcrx_id)
return -EINVAL;
if (reg.flags & ~ZCRX_SUPPORTED_REG_FLAGS)
return -EINVAL;
if (reg.flags & ZCRX_REG_IMPORT)
return import_zcrx(ctx, arg, &reg);
if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
return -EFAULT;
if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
if (reg.if_rxq == -1 || !reg.rq_entries)
return -EINVAL;
if ((reg.if_rxq || reg.if_idx) && (reg.flags & ZCRX_REG_NODEV))
return -EINVAL;
if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
if (!(ctx->flags & IORING_SETUP_CLAMP))
@@ -806,7 +878,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
mmgrab(ctx->mm_account);
ifq->mm_account = ctx->mm_account;
}
ifq->rq_entries = reg.rq_entries;
ifq->rq.nr_entries = reg.rq_entries;
scoped_guard(mutex, &ctx->mmap_lock) {
/* preallocate id */
@@ -819,33 +891,17 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
if (ret)
goto err;
ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns, reg.if_idx);
if (!ifq->netdev) {
ret = -ENODEV;
goto err;
ifq->kern_readable = !(area.flags & IORING_ZCRX_AREA_DMABUF);
if (!(reg.flags & ZCRX_REG_NODEV)) {
ret = zcrx_register_netdev(ifq, &reg, &area);
if (ret)
goto err;
} else {
ret = io_zcrx_create_area(ifq, &area, &reg);
if (ret)
goto err;
}
netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL);
ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq);
if (!ifq->dev) {
ret = -EOPNOTSUPP;
goto netdev_put_unlock;
}
get_device(ifq->dev);
ret = io_zcrx_create_area(ifq, &area, &reg);
if (ret)
goto netdev_put_unlock;
if (reg.rx_buf_len)
mp_param.rx_page_size = 1U << ifq->niov_shift;
mp_param.mp_ops = &io_uring_pp_zc_ops;
mp_param.mp_priv = ifq;
ret = __net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL);
if (ret)
goto netdev_put_unlock;
netdev_unlock(ifq->netdev);
ifq->if_rxq = reg.if_rxq;
reg.zcrx_id = id;
@@ -865,8 +921,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
goto err;
}
return 0;
netdev_put_unlock:
netdev_unlock(ifq->netdev);
err:
scoped_guard(mutex, &ctx->mmap_lock)
xa_erase(&ctx->zcrx_ctxs, id);
@@ -875,17 +929,37 @@ ifq_free:
return ret;
}
static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
static inline bool is_zcrx_entry_marked(struct io_ring_ctx *ctx, unsigned long id)
{
unsigned niov_idx;
lockdep_assert_held(&area->freelist_lock);
niov_idx = area->freelist[--area->free_count];
return &area->nia.niovs[niov_idx];
return xa_get_mark(&ctx->zcrx_ctxs, id, XA_MARK_1);
}
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
/* Flag the zcrx entry at @id as terminated; see io_terminate_zcrx(). */
static inline void set_zcrx_entry_mark(struct io_ring_ctx *ctx, unsigned long id)
{
xa_set_mark(&ctx->zcrx_ctxs, id, XA_MARK_1);
}
/*
 * Terminate all zcrx instances attached to @ctx.
 *
 * Walks the zcrx xarray, marks each entry (XA_MARK_1) so that the later
 * unregistration pass can tell it has already been terminated, and
 * drops the user reference, which closes the queue and scrubs buffers.
 * Entries are looked up under mmap_lock but not erased here; the final
 * erase and reference drop happen in io_unregister_zcrx().
 */
void io_terminate_zcrx(struct io_ring_ctx *ctx)
{
struct io_zcrx_ifq *ifq;
unsigned long id = 0;
lockdep_assert_held(&ctx->uring_lock);
while (1) {
scoped_guard(mutex, &ctx->mmap_lock)
ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
if (!ifq)
break;
/* finding an already-marked entry would mean we ran twice */
if (WARN_ON_ONCE(is_zcrx_entry_marked(ctx, id)))
break;
set_zcrx_entry_mark(ctx, id);
id++;
zcrx_unregister_user(ifq);
}
}
void io_unregister_zcrx(struct io_ring_ctx *ctx)
{
struct io_zcrx_ifq *ifq;
@@ -896,31 +970,35 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
unsigned long id = 0;
ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
if (ifq)
if (ifq) {
if (WARN_ON_ONCE(!is_zcrx_entry_marked(ctx, id))) {
ifq = NULL;
break;
}
xa_erase(&ctx->zcrx_ctxs, id);
}
}
if (!ifq)
break;
zcrx_unregister(ifq);
io_put_zcrx_ifq(ifq);
}
xa_destroy(&ctx->zcrx_ctxs);
}
static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
static inline u32 zcrx_rq_entries(struct zcrx_rq *rq)
{
u32 entries;
entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
return min(entries, ifq->rq_entries);
entries = smp_load_acquire(&rq->ring->tail) - rq->cached_head;
return min(entries, rq->nr_entries);
}
static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
unsigned mask)
static struct io_uring_zcrx_rqe *zcrx_next_rqe(struct zcrx_rq *rq, unsigned mask)
{
unsigned int idx = ifq->cached_rq_head++ & mask;
unsigned int idx = rq->cached_head++ & mask;
return &ifq->rqes[idx];
return &rq->rqes[idx];
}
static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe,
@@ -946,21 +1024,24 @@ static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe,
return true;
}
static void io_zcrx_ring_refill(struct page_pool *pp,
struct io_zcrx_ifq *ifq)
static unsigned io_zcrx_ring_refill(struct page_pool *pp,
struct io_zcrx_ifq *ifq,
netmem_ref *netmems, unsigned to_alloc)
{
unsigned int mask = ifq->rq_entries - 1;
struct zcrx_rq *rq = &ifq->rq;
unsigned int mask = rq->nr_entries - 1;
unsigned int entries;
unsigned allocated = 0;
guard(spinlock_bh)(&ifq->rq_lock);
guard(spinlock_bh)(&rq->lock);
entries = io_zcrx_rqring_entries(ifq);
entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL);
entries = zcrx_rq_entries(rq);
entries = min_t(unsigned, entries, to_alloc);
if (unlikely(!entries))
return;
return 0;
do {
struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask);
struct net_iov *niov;
netmem_ref netmem;
@@ -978,46 +1059,56 @@ static void io_zcrx_ring_refill(struct page_pool *pp,
continue;
}
io_zcrx_sync_for_device(pp, niov);
net_mp_netmem_place_in_cache(pp, netmem);
netmems[allocated] = netmem;
allocated++;
} while (--entries);
smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
smp_store_release(&rq->ring->head, rq->cached_head);
return allocated;
}
static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq,
netmem_ref *netmems, unsigned to_alloc)
{
struct io_zcrx_area *area = ifq->area;
unsigned allocated = 0;
spin_lock_bh(&area->freelist_lock);
while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
struct net_iov *niov = __io_zcrx_get_free_niov(area);
netmem_ref netmem = net_iov_to_netmem(niov);
guard(spinlock_bh)(&area->freelist_lock);
for (allocated = 0; allocated < to_alloc; allocated++) {
struct net_iov *niov = zcrx_get_free_niov(area);
if (!niov)
break;
net_mp_niov_set_page_pool(pp, niov);
io_zcrx_sync_for_device(pp, niov);
net_mp_netmem_place_in_cache(pp, netmem);
netmems[allocated] = net_iov_to_netmem(niov);
}
spin_unlock_bh(&area->freelist_lock);
return allocated;
}
/*
 * Page pool memory provider allocation hook.
 *
 * Refill a batch of netmems from the userspace refill ring first,
 * falling back to the area freelist, then DMA-sync the batch.  One
 * netmem is returned to the caller and the remainder is left in the
 * pp alloc cache for subsequent allocations.
 */
static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
	netmem_ref *netmems = pp->alloc.cache;
	unsigned to_alloc = PP_ALLOC_CACHE_REFILL;
	unsigned allocated;

	/* pp should already be ensuring that */
	if (WARN_ON_ONCE(pp->alloc.count))
		return 0;

	allocated = io_zcrx_ring_refill(pp, ifq, netmems, to_alloc);
	if (likely(allocated))
		goto out_return;

	allocated = io_zcrx_refill_slow(pp, ifq, netmems, to_alloc);
	if (!allocated)
		return 0;

out_return:
	zcrx_sync_for_device(pp, ifq, netmems, allocated);
	/* hand the last netmem back, cache the rest in the pp */
	allocated--;
	pp->alloc.count += allocated;
	return netmems[allocated];
}
static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
@@ -1036,7 +1127,6 @@ static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
static int io_pp_zc_init(struct page_pool *pp)
{
struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
int ret;
if (WARN_ON_ONCE(!ifq))
return -EINVAL;
@@ -1049,10 +1139,6 @@ static int io_pp_zc_init(struct page_pool *pp)
if (pp->p.dma_dir != DMA_FROM_DEVICE)
return -EOPNOTSUPP;
ret = io_zcrx_map_area(ifq, ifq->area);
if (ret)
return ret;
refcount_inc(&ifq->refs);
return 0;
}
@@ -1100,14 +1186,14 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = {
};
static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr,
struct io_zcrx_ifq *zcrx)
struct io_zcrx_ifq *zcrx, struct zcrx_rq *rq)
{
unsigned int mask = zcrx->rq_entries - 1;
unsigned int mask = rq->nr_entries - 1;
unsigned int i;
nr = min(nr, io_zcrx_rqring_entries(zcrx));
nr = min(nr, zcrx_rq_entries(rq));
for (i = 0; i < nr; i++) {
struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask);
struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask);
struct net_iov *niov;
if (!io_parse_rqe(rqe, zcrx, &niov))
@@ -1115,7 +1201,7 @@ static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr,
netmem_array[i] = net_iov_to_netmem(niov);
}
smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head);
smp_store_release(&rq->ring->head, rq->cached_head);
return i;
}
@@ -1149,8 +1235,10 @@ static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
return -EINVAL;
do {
scoped_guard(spinlock_bh, &zcrx->rq_lock) {
nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx);
struct zcrx_rq *rq = &zcrx->rq;
scoped_guard(spinlock_bh, &rq->lock) {
nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx, rq);
zcrx_return_buffers(netmems, nr);
}
@@ -1159,7 +1247,7 @@ static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
if (fatal_signal_pending(current))
break;
cond_resched();
} while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq_entries);
} while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq.nr_entries);
return 0;
}
@@ -1169,6 +1257,8 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
struct zcrx_ctrl ctrl;
struct io_zcrx_ifq *zcrx;
BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_flush));
if (nr_args)
return -EINVAL;
if (copy_from_user(&ctrl, arg, sizeof(ctrl)))
@@ -1221,13 +1311,11 @@ static struct net_iov *io_alloc_fallback_niov(struct io_zcrx_ifq *ifq)
struct io_zcrx_area *area = ifq->area;
struct net_iov *niov = NULL;
if (area->mem.is_dmabuf)
if (!ifq->kern_readable)
return NULL;
spin_lock_bh(&area->freelist_lock);
if (area->free_count)
niov = __io_zcrx_get_free_niov(area);
spin_unlock_bh(&area->freelist_lock);
scoped_guard(spinlock_bh, &area->freelist_lock)
niov = zcrx_get_free_niov(area);
if (niov)
page_pool_fragment_netmem(net_iov_to_netmem(niov), 1);

View File

@@ -8,6 +8,9 @@
#include <net/page_pool/types.h>
#include <net/net_trackers.h>
#define ZCRX_SUPPORTED_REG_FLAGS (ZCRX_REG_IMPORT | ZCRX_REG_NODEV)
#define ZCRX_FEATURES (ZCRX_FEATURE_RX_PAGE_SIZE)
struct io_zcrx_mem {
unsigned long size;
bool is_dmabuf;
@@ -38,17 +41,22 @@ struct io_zcrx_area {
struct io_zcrx_mem mem;
};
/*
 * Refill ring shared with userspace: userspace posts buffer entries in
 * ->rqes and advances ->ring->tail; the kernel consumes them and
 * publishes progress through ->ring->head.
 */
struct zcrx_rq {
/* serialises kernel-side consumption (cached_head / rqes) */
spinlock_t lock;
/* shared ring header; kernel reads tail, writes head */
struct io_uring *ring;
/* entry array; indexed with (head & (nr_entries - 1)) */
struct io_uring_zcrx_rqe *rqes;
/* kernel's private head copy, published via ->ring->head */
u32 cached_head;
u32 nr_entries;
};
struct io_zcrx_ifq {
struct io_zcrx_area *area;
unsigned niov_shift;
struct user_struct *user;
struct mm_struct *mm_account;
bool kern_readable;
spinlock_t rq_lock ____cacheline_aligned_in_smp;
struct io_uring *rq_ring;
struct io_uring_zcrx_rqe *rqes;
u32 cached_rq_head;
u32 rq_entries;
struct zcrx_rq rq ____cacheline_aligned_in_smp;
u32 if_rxq;
struct device *dev;
@@ -63,26 +71,30 @@ struct io_zcrx_ifq {
* net stack.
*/
struct mutex pp_lock;
struct io_mapped_region region;
struct io_mapped_region rq_region;
};
#if defined(CONFIG_IO_URING_ZCRX)
int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg);
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
int io_register_zcrx(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg);
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
void io_unregister_zcrx(struct io_ring_ctx *ctx);
void io_terminate_zcrx(struct io_ring_ctx *ctx);
int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
struct socket *sock, unsigned int flags,
unsigned issue_flags, unsigned int *len);
struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
unsigned int id);
#else
static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg)
static inline int io_register_zcrx(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg)
{
return -EOPNOTSUPP;
}
static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
static inline void io_unregister_zcrx(struct io_ring_ctx *ctx)
{
}
static inline void io_terminate_zcrx(struct io_ring_ctx *ctx)
{
}
static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,