mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 06:44:00 -04:00
Merge tag 'for-7.1/io_uring-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring updates from Jens Axboe:
- Add a callback driven main loop for io_uring, and BPF struct_ops
on top to allow implementing custom event loop logic
- Decouple IOPOLL from being a ring-wide all-or-nothing setting,
allowing IOPOLL use cases to also issue certain white listed
non-polled opcodes
- Timeout improvements. Migrate internal timeout storage from
timespec64 to ktime_t for simpler arithmetic and to avoid copying
timespec data
- Zero-copy receive (zcrx) updates:
- Add a device-less mode (ZCRX_REG_NODEV) for testing and
experimentation where data flows through the copy fallback path
- Fix two-step unregistration regression, DMA length calculations,
xarray mark usage, and a potential 32-bit overflow in id
shifting
- Refactoring toward multi-area support: dedicated refill queue
struct, consolidated DMA syncing, netmem array refilling format,
and guard-based locking
- Zero-copy transmit (zctx) cleanup:
- Unify io_send_zc() and io_sendmsg_zc() into a single function
- Add vectorized registered buffer send for IORING_OP_SEND_ZC
- Add separate notification user_data via sqe->addr3 so
notification and completion CQEs can be distinguished without
extra reference counting
- Switch struct io_ring_ctx internal bitfields to explicit flag bits
with atomic-safe accessors, and annotate the known harmless races on
those flags
- Various optimizations caching ctx and other request fields in local
variables to avoid repeated loads, and cleanups for tctx setup, ring
fd registration, and read path early returns
* tag 'for-7.1/io_uring-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (58 commits)
io_uring: unify getting ctx from passed in file descriptor
io_uring/register: don't get a reference to the registered ring fd
io_uring/tctx: clean up __io_uring_add_tctx_node() error handling
io_uring/tctx: have io_uring_alloc_task_context() return tctx
io_uring/timeout: use 'ctx' consistently
io_uring/rw: clean up __io_read() obsolete comment and early returns
io_uring/zcrx: use correct mmap off constants
io_uring/zcrx: use dma_len for chunk size calculation
io_uring/zcrx: don't clear not allocated niovs
io_uring/zcrx: don't use mark0 for allocating xarray
io_uring: cast id to u64 before shifting in io_allocate_rbuf_ring()
io_uring/zcrx: reject REG_NODEV with large rx_buf_size
io_uring/cancel: validate opcode for IORING_ASYNC_CANCEL_OP
io_uring/rsrc: use io_cache_free() to free node
io_uring/zcrx: rename zcrx [un]register functions
io_uring/zcrx: check ctrl op payload struct sizes
io_uring/zcrx: cache fallback availability in zcrx ctx
io_uring/zcrx: warn on a repeated area append
io_uring/zcrx: consolidate dma syncing
io_uring/zcrx: netmem array as refiling format
...
This commit is contained in:
@@ -786,10 +786,6 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
|
||||
struct nvme_ctrl *ctrl = ioucmd->file->private_data;
|
||||
int ret;
|
||||
|
||||
/* IOPOLL not supported yet */
|
||||
if (issue_flags & IO_URING_F_IOPOLL)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
ret = nvme_uring_cmd_checks(issue_flags);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@@ -8,6 +8,9 @@
|
||||
#include <linux/llist.h>
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
struct iou_loop_params;
|
||||
struct io_uring_bpf_ops;
|
||||
|
||||
enum {
|
||||
/*
|
||||
* A hint to not wake right away but delay until there are enough of
|
||||
@@ -41,6 +44,8 @@ enum io_uring_cmd_flags {
|
||||
IO_URING_F_COMPAT = (1 << 12),
|
||||
};
|
||||
|
||||
struct iou_loop_params;
|
||||
|
||||
struct io_wq_work_node {
|
||||
struct io_wq_work_node *next;
|
||||
};
|
||||
@@ -268,24 +273,30 @@ struct io_alloc_cache {
|
||||
unsigned int init_clear;
|
||||
};
|
||||
|
||||
enum {
|
||||
IO_RING_F_DRAIN_NEXT = BIT(0),
|
||||
IO_RING_F_OP_RESTRICTED = BIT(1),
|
||||
IO_RING_F_REG_RESTRICTED = BIT(2),
|
||||
IO_RING_F_OFF_TIMEOUT_USED = BIT(3),
|
||||
IO_RING_F_DRAIN_ACTIVE = BIT(4),
|
||||
IO_RING_F_HAS_EVFD = BIT(5),
|
||||
/* all CQEs should be posted only by the submitter task */
|
||||
IO_RING_F_TASK_COMPLETE = BIT(6),
|
||||
IO_RING_F_LOCKLESS_CQ = BIT(7),
|
||||
IO_RING_F_SYSCALL_IOPOLL = BIT(8),
|
||||
IO_RING_F_POLL_ACTIVATED = BIT(9),
|
||||
IO_RING_F_DRAIN_DISABLED = BIT(10),
|
||||
IO_RING_F_COMPAT = BIT(11),
|
||||
IO_RING_F_IOWQ_LIMITS_SET = BIT(12),
|
||||
};
|
||||
|
||||
struct io_ring_ctx {
|
||||
/* const or read-mostly hot data */
|
||||
struct {
|
||||
/* ring setup flags */
|
||||
unsigned int flags;
|
||||
unsigned int drain_next: 1;
|
||||
unsigned int op_restricted: 1;
|
||||
unsigned int reg_restricted: 1;
|
||||
unsigned int off_timeout_used: 1;
|
||||
unsigned int drain_active: 1;
|
||||
unsigned int has_evfd: 1;
|
||||
/* all CQEs should be posted only by the submitter task */
|
||||
unsigned int task_complete: 1;
|
||||
unsigned int lockless_cq: 1;
|
||||
unsigned int syscall_iopoll: 1;
|
||||
unsigned int poll_activated: 1;
|
||||
unsigned int drain_disabled: 1;
|
||||
unsigned int compat: 1;
|
||||
unsigned int iowq_limits_set : 1;
|
||||
/* internal state flags (IO_RING_F_*), mostly read-only */
|
||||
unsigned int int_flags;
|
||||
|
||||
struct task_struct *submitter_task;
|
||||
struct io_rings *rings;
|
||||
@@ -355,6 +366,9 @@ struct io_ring_ctx {
|
||||
struct io_alloc_cache rw_cache;
|
||||
struct io_alloc_cache cmd_cache;
|
||||
|
||||
int (*loop_step)(struct io_ring_ctx *ctx,
|
||||
struct iou_loop_params *);
|
||||
|
||||
/*
|
||||
* Any cancelable uring_cmd is added to this list in
|
||||
* ->uring_cmd() by io_uring_cmd_insert_cancelable()
|
||||
@@ -477,6 +491,8 @@ struct io_ring_ctx {
|
||||
DECLARE_HASHTABLE(napi_ht, 4);
|
||||
#endif
|
||||
|
||||
struct io_uring_bpf_ops *bpf_ops;
|
||||
|
||||
/*
|
||||
* Protection for resize vs mmap races - both the mmap and resize
|
||||
* side will need to grab this lock, to prevent either side from
|
||||
@@ -545,6 +561,7 @@ enum {
|
||||
REQ_F_HAS_METADATA_BIT,
|
||||
REQ_F_IMPORT_BUFFER_BIT,
|
||||
REQ_F_SQE_COPIED_BIT,
|
||||
REQ_F_IOPOLL_BIT,
|
||||
|
||||
/* not a real bit, just to check we're not overflowing the space */
|
||||
__REQ_F_LAST_BIT,
|
||||
@@ -638,6 +655,8 @@ enum {
|
||||
REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT),
|
||||
/* ->sqe_copy() has been called, if necessary */
|
||||
REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT),
|
||||
/* request must be iopolled to completion (set in ->issue()) */
|
||||
REQ_F_IOPOLL = IO_REQ_FLAG(REQ_F_IOPOLL_BIT),
|
||||
};
|
||||
|
||||
struct io_tw_req {
|
||||
|
||||
@@ -10,6 +10,8 @@
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/io_uring/zcrx.h>
|
||||
|
||||
/*
|
||||
* this file is shared with liburing and that has to autodetect
|
||||
* if linux/time_types.h is available or not, it can
|
||||
@@ -341,6 +343,10 @@ enum io_uring_op {
|
||||
|
||||
/*
|
||||
* sqe->timeout_flags
|
||||
*
|
||||
* IORING_TIMEOUT_IMMEDIATE_ARG: If set, sqe->addr stores the timeout
|
||||
* value in nanoseconds instead of
|
||||
* pointing to a timespec.
|
||||
*/
|
||||
#define IORING_TIMEOUT_ABS (1U << 0)
|
||||
#define IORING_TIMEOUT_UPDATE (1U << 1)
|
||||
@@ -349,6 +355,7 @@ enum io_uring_op {
|
||||
#define IORING_LINK_TIMEOUT_UPDATE (1U << 4)
|
||||
#define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5)
|
||||
#define IORING_TIMEOUT_MULTISHOT (1U << 6)
|
||||
#define IORING_TIMEOUT_IMMEDIATE_ARG (1U << 7)
|
||||
#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
|
||||
#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
|
||||
/*
|
||||
@@ -1050,100 +1057,6 @@ struct io_timespec {
|
||||
__u64 tv_nsec;
|
||||
};
|
||||
|
||||
/* Zero copy receive refill queue entry */
|
||||
struct io_uring_zcrx_rqe {
|
||||
__u64 off;
|
||||
__u32 len;
|
||||
__u32 __pad;
|
||||
};
|
||||
|
||||
struct io_uring_zcrx_cqe {
|
||||
__u64 off;
|
||||
__u64 __pad;
|
||||
};
|
||||
|
||||
/* The bit from which area id is encoded into offsets */
|
||||
#define IORING_ZCRX_AREA_SHIFT 48
|
||||
#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1))
|
||||
|
||||
struct io_uring_zcrx_offsets {
|
||||
__u32 head;
|
||||
__u32 tail;
|
||||
__u32 rqes;
|
||||
__u32 __resv2;
|
||||
__u64 __resv[2];
|
||||
};
|
||||
|
||||
enum io_uring_zcrx_area_flags {
|
||||
IORING_ZCRX_AREA_DMABUF = 1,
|
||||
};
|
||||
|
||||
struct io_uring_zcrx_area_reg {
|
||||
__u64 addr;
|
||||
__u64 len;
|
||||
__u64 rq_area_token;
|
||||
__u32 flags;
|
||||
__u32 dmabuf_fd;
|
||||
__u64 __resv2[2];
|
||||
};
|
||||
|
||||
enum zcrx_reg_flags {
|
||||
ZCRX_REG_IMPORT = 1,
|
||||
};
|
||||
|
||||
enum zcrx_features {
|
||||
/*
|
||||
* The user can ask for the desired rx page size by passing the
|
||||
* value in struct io_uring_zcrx_ifq_reg::rx_buf_len.
|
||||
*/
|
||||
ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0,
|
||||
};
|
||||
|
||||
/*
|
||||
* Argument for IORING_REGISTER_ZCRX_IFQ
|
||||
*/
|
||||
struct io_uring_zcrx_ifq_reg {
|
||||
__u32 if_idx;
|
||||
__u32 if_rxq;
|
||||
__u32 rq_entries;
|
||||
__u32 flags;
|
||||
|
||||
__u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */
|
||||
__u64 region_ptr; /* struct io_uring_region_desc * */
|
||||
|
||||
struct io_uring_zcrx_offsets offsets;
|
||||
__u32 zcrx_id;
|
||||
__u32 rx_buf_len;
|
||||
__u64 __resv[3];
|
||||
};
|
||||
|
||||
enum zcrx_ctrl_op {
|
||||
ZCRX_CTRL_FLUSH_RQ,
|
||||
ZCRX_CTRL_EXPORT,
|
||||
|
||||
__ZCRX_CTRL_LAST,
|
||||
};
|
||||
|
||||
struct zcrx_ctrl_flush_rq {
|
||||
__u64 __resv[6];
|
||||
};
|
||||
|
||||
struct zcrx_ctrl_export {
|
||||
__u32 zcrx_fd;
|
||||
__u32 __resv1[11];
|
||||
};
|
||||
|
||||
struct zcrx_ctrl {
|
||||
__u32 zcrx_id;
|
||||
__u32 op; /* see enum zcrx_ctrl_op */
|
||||
__u64 __resv[2];
|
||||
|
||||
union {
|
||||
struct zcrx_ctrl_export zc_export;
|
||||
struct zcrx_ctrl_flush_rq zc_flush;
|
||||
};
|
||||
};
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
115
include/uapi/linux/io_uring/zcrx.h
Normal file
115
include/uapi/linux/io_uring/zcrx.h
Normal file
@@ -0,0 +1,115 @@
|
||||
/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
|
||||
/*
|
||||
* Header file for the io_uring zerocopy receive (zcrx) interface.
|
||||
*
|
||||
* Copyright (C) 2026 Pavel Begunkov
|
||||
* Copyright (C) 2026 David Wei
|
||||
* Copyright (C) Meta Platforms, Inc.
|
||||
*/
|
||||
#ifndef LINUX_IO_ZCRX_H
|
||||
#define LINUX_IO_ZCRX_H
|
||||
|
||||
#include <linux/types.h>
|
||||
|
||||
/* Zero copy receive refill queue entry */
|
||||
struct io_uring_zcrx_rqe {
|
||||
__u64 off;
|
||||
__u32 len;
|
||||
__u32 __pad;
|
||||
};
|
||||
|
||||
struct io_uring_zcrx_cqe {
|
||||
__u64 off;
|
||||
__u64 __pad;
|
||||
};
|
||||
|
||||
/* The bit from which area id is encoded into offsets */
|
||||
#define IORING_ZCRX_AREA_SHIFT 48
|
||||
#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1))
|
||||
|
||||
struct io_uring_zcrx_offsets {
|
||||
__u32 head;
|
||||
__u32 tail;
|
||||
__u32 rqes;
|
||||
__u32 __resv2;
|
||||
__u64 __resv[2];
|
||||
};
|
||||
|
||||
enum io_uring_zcrx_area_flags {
|
||||
IORING_ZCRX_AREA_DMABUF = 1,
|
||||
};
|
||||
|
||||
struct io_uring_zcrx_area_reg {
|
||||
__u64 addr;
|
||||
__u64 len;
|
||||
__u64 rq_area_token;
|
||||
__u32 flags;
|
||||
__u32 dmabuf_fd;
|
||||
__u64 __resv2[2];
|
||||
};
|
||||
|
||||
enum zcrx_reg_flags {
|
||||
ZCRX_REG_IMPORT = 1,
|
||||
|
||||
/*
|
||||
* Register a zcrx instance without a net device. All data will be
|
||||
* copied. The refill queue entries might not be automatically
|
||||
* consumed and need to be flushed, see ZCRX_CTRL_FLUSH_RQ.
|
||||
*/
|
||||
ZCRX_REG_NODEV = 2,
|
||||
};
|
||||
|
||||
enum zcrx_features {
|
||||
/*
|
||||
* The user can ask for the desired rx page size by passing the
|
||||
* value in struct io_uring_zcrx_ifq_reg::rx_buf_len.
|
||||
*/
|
||||
ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0,
|
||||
};
|
||||
|
||||
/*
|
||||
* Argument for IORING_REGISTER_ZCRX_IFQ
|
||||
*/
|
||||
struct io_uring_zcrx_ifq_reg {
|
||||
__u32 if_idx;
|
||||
__u32 if_rxq;
|
||||
__u32 rq_entries;
|
||||
__u32 flags;
|
||||
|
||||
__u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */
|
||||
__u64 region_ptr; /* struct io_uring_region_desc * */
|
||||
|
||||
struct io_uring_zcrx_offsets offsets;
|
||||
__u32 zcrx_id;
|
||||
__u32 rx_buf_len;
|
||||
__u64 __resv[3];
|
||||
};
|
||||
|
||||
enum zcrx_ctrl_op {
|
||||
ZCRX_CTRL_FLUSH_RQ,
|
||||
ZCRX_CTRL_EXPORT,
|
||||
|
||||
__ZCRX_CTRL_LAST,
|
||||
};
|
||||
|
||||
struct zcrx_ctrl_flush_rq {
|
||||
__u64 __resv[6];
|
||||
};
|
||||
|
||||
struct zcrx_ctrl_export {
|
||||
__u32 zcrx_fd;
|
||||
__u32 __resv1[11];
|
||||
};
|
||||
|
||||
struct zcrx_ctrl {
|
||||
__u32 zcrx_id;
|
||||
__u32 op; /* see enum zcrx_ctrl_op */
|
||||
__u64 __resv[2];
|
||||
|
||||
union {
|
||||
struct zcrx_ctrl_export zc_export;
|
||||
struct zcrx_ctrl_flush_rq zc_flush;
|
||||
};
|
||||
};
|
||||
|
||||
#endif /* LINUX_IO_ZCRX_H */
|
||||
@@ -14,3 +14,8 @@ config IO_URING_BPF
|
||||
def_bool y
|
||||
depends on BPF
|
||||
depends on NET
|
||||
|
||||
config IO_URING_BPF_OPS
|
||||
def_bool y
|
||||
depends on IO_URING
|
||||
depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF
|
||||
|
||||
@@ -14,7 +14,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
|
||||
advise.o openclose.o statx.o timeout.o \
|
||||
cancel.o waitid.o register.o \
|
||||
truncate.o memmap.o alloc_cache.o \
|
||||
query.o
|
||||
query.o loop.o
|
||||
|
||||
obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
|
||||
obj-$(CONFIG_IO_WQ) += io-wq.o
|
||||
@@ -25,3 +25,4 @@ obj-$(CONFIG_NET) += net.o cmd_net.o
|
||||
obj-$(CONFIG_PROC_FS) += fdinfo.o
|
||||
obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o
|
||||
obj-$(CONFIG_IO_URING_BPF) += bpf_filter.o
|
||||
obj-$(CONFIG_IO_URING_BPF_OPS) += bpf-ops.o
|
||||
|
||||
270
io_uring/bpf-ops.c
Normal file
270
io_uring/bpf-ops.c
Normal file
@@ -0,0 +1,270 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/bpf.h>
|
||||
#include <linux/bpf_verifier.h>
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "register.h"
|
||||
#include "loop.h"
|
||||
#include "memmap.h"
|
||||
#include "bpf-ops.h"
|
||||
|
||||
static DEFINE_MUTEX(io_bpf_ctrl_mutex);
|
||||
static const struct btf_type *loop_params_type;
|
||||
|
||||
__bpf_kfunc_start_defs();
|
||||
|
||||
/*
 * BPF kfunc: forwards to io_submit_sqes() with the given ring context and
 * entry count, and returns whatever io_submit_sqes() returns.
 */
__bpf_kfunc int bpf_io_uring_submit_sqes(struct io_ring_ctx *ctx, u32 nr)
|
||||
{
|
||||
return io_submit_sqes(ctx, nr);
|
||||
}
|
||||
|
||||
__bpf_kfunc
|
||||
__u8 *bpf_io_uring_get_region(struct io_ring_ctx *ctx, __u32 region_id,
|
||||
const size_t rdwr_buf_size)
|
||||
{
|
||||
struct io_mapped_region *r;
|
||||
|
||||
lockdep_assert_held(&ctx->uring_lock);
|
||||
|
||||
switch (region_id) {
|
||||
case IOU_REGION_MEM:
|
||||
r = &ctx->param_region;
|
||||
break;
|
||||
case IOU_REGION_CQ:
|
||||
r = &ctx->ring_region;
|
||||
break;
|
||||
case IOU_REGION_SQ:
|
||||
r = &ctx->sq_region;
|
||||
break;
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (unlikely(rdwr_buf_size > io_region_size(r)))
|
||||
return NULL;
|
||||
return io_region_get_ptr(r);
|
||||
}
|
||||
|
||||
__bpf_kfunc_end_defs();
|
||||
|
||||
BTF_KFUNCS_START(io_uring_kfunc_set)
|
||||
BTF_ID_FLAGS(func, bpf_io_uring_submit_sqes, KF_SLEEPABLE);
|
||||
BTF_ID_FLAGS(func, bpf_io_uring_get_region, KF_RET_NULL);
|
||||
BTF_KFUNCS_END(io_uring_kfunc_set)
|
||||
|
||||
static const struct btf_kfunc_id_set bpf_io_uring_kfunc_set = {
|
||||
.owner = THIS_MODULE,
|
||||
.set = &io_uring_kfunc_set,
|
||||
};
|
||||
|
||||
/*
 * Stub ->loop_step() implementation installed in io_bpf_ops_stubs.
 * It ignores both arguments and always returns IOU_LOOP_STOP.
 */
static int io_bpf_ops__loop_step(struct io_ring_ctx *ctx,
|
||||
struct iou_loop_params *lp)
|
||||
{
|
||||
return IOU_LOOP_STOP;
|
||||
}
|
||||
|
||||
static struct io_uring_bpf_ops io_bpf_ops_stubs = {
|
||||
.loop_step = io_bpf_ops__loop_step,
|
||||
};
|
||||
|
||||
static bool bpf_io_is_valid_access(int off, int size,
|
||||
enum bpf_access_type type,
|
||||
const struct bpf_prog *prog,
|
||||
struct bpf_insn_access_aux *info)
|
||||
{
|
||||
if (type != BPF_READ)
|
||||
return false;
|
||||
if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
|
||||
return false;
|
||||
if (off % size != 0)
|
||||
return false;
|
||||
|
||||
return btf_ctx_access(off, size, type, prog, info);
|
||||
}
|
||||
|
||||
static int bpf_io_btf_struct_access(struct bpf_verifier_log *log,
|
||||
const struct bpf_reg_state *reg, int off,
|
||||
int size)
|
||||
{
|
||||
const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
|
||||
|
||||
if (t == loop_params_type) {
|
||||
if (off + size <= offsetofend(struct iou_loop_params, cq_wait_idx))
|
||||
return SCALAR_VALUE;
|
||||
}
|
||||
|
||||
return -EACCES;
|
||||
}
|
||||
|
||||
static const struct bpf_verifier_ops bpf_io_verifier_ops = {
|
||||
.get_func_proto = bpf_base_func_proto,
|
||||
.is_valid_access = bpf_io_is_valid_access,
|
||||
.btf_struct_access = bpf_io_btf_struct_access,
|
||||
};
|
||||
|
||||
static const struct btf_type *
|
||||
io_lookup_struct_type(struct btf *btf, const char *name)
|
||||
{
|
||||
s32 type_id;
|
||||
|
||||
type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT);
|
||||
if (type_id < 0)
|
||||
return NULL;
|
||||
return btf_type_by_id(btf, type_id);
|
||||
}
|
||||
|
||||
static int bpf_io_init(struct btf *btf)
|
||||
{
|
||||
int ret;
|
||||
|
||||
loop_params_type = io_lookup_struct_type(btf, "iou_loop_params");
|
||||
if (!loop_params_type) {
|
||||
pr_err("io_uring: Failed to locate iou_loop_params\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
|
||||
&bpf_io_uring_kfunc_set);
|
||||
if (ret) {
|
||||
pr_err("io_uring: Failed to register kfuncs (%d)\n", ret);
|
||||
return ret;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * struct_ops ->check_member() callback. No per-member restrictions are
 * applied here: every member is accepted and 0 is always returned.
 */
static int bpf_io_check_member(const struct btf_type *t,
|
||||
const struct btf_member *member,
|
||||
const struct bpf_prog *prog)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int bpf_io_init_member(const struct btf_type *t,
|
||||
const struct btf_member *member,
|
||||
void *kdata, const void *udata)
|
||||
{
|
||||
u32 moff = __btf_member_bit_offset(t, member) / 8;
|
||||
const struct io_uring_bpf_ops *uops = udata;
|
||||
struct io_uring_bpf_ops *ops = kdata;
|
||||
|
||||
switch (moff) {
|
||||
case offsetof(struct io_uring_bpf_ops, ring_fd):
|
||||
ops->ring_fd = uops->ring_fd;
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int io_install_bpf(struct io_ring_ctx *ctx, struct io_uring_bpf_ops *ops)
|
||||
{
|
||||
if (ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_IOPOLL))
|
||||
return -EOPNOTSUPP;
|
||||
if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
if (ctx->bpf_ops)
|
||||
return -EBUSY;
|
||||
if (WARN_ON_ONCE(!ops->loop_step))
|
||||
return -EINVAL;
|
||||
|
||||
ops->priv = ctx;
|
||||
ctx->bpf_ops = ops;
|
||||
ctx->loop_step = ops->loop_step;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int bpf_io_reg(void *kdata, struct bpf_link *link)
|
||||
{
|
||||
struct io_uring_bpf_ops *ops = kdata;
|
||||
struct io_ring_ctx *ctx;
|
||||
struct file *file;
|
||||
int ret = -EBUSY;
|
||||
|
||||
file = io_uring_ctx_get_file(ops->ring_fd, false);
|
||||
if (IS_ERR(file))
|
||||
return PTR_ERR(file);
|
||||
ctx = file->private_data;
|
||||
|
||||
scoped_guard(mutex, &io_bpf_ctrl_mutex) {
|
||||
guard(mutex)(&ctx->uring_lock);
|
||||
ret = io_install_bpf(ctx, ops);
|
||||
}
|
||||
|
||||
fput(file);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void io_eject_bpf(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_uring_bpf_ops *ops = ctx->bpf_ops;
|
||||
|
||||
if (WARN_ON_ONCE(!ops))
|
||||
return;
|
||||
if (WARN_ON_ONCE(ops->priv != ctx))
|
||||
return;
|
||||
|
||||
ops->priv = NULL;
|
||||
ctx->bpf_ops = NULL;
|
||||
ctx->loop_step = NULL;
|
||||
}
|
||||
|
||||
static void bpf_io_unreg(void *kdata, struct bpf_link *link)
|
||||
{
|
||||
struct io_uring_bpf_ops *ops = kdata;
|
||||
struct io_ring_ctx *ctx;
|
||||
|
||||
guard(mutex)(&io_bpf_ctrl_mutex);
|
||||
ctx = ops->priv;
|
||||
if (ctx) {
|
||||
guard(mutex)(&ctx->uring_lock);
|
||||
if (WARN_ON_ONCE(ctx->bpf_ops != ops))
|
||||
return;
|
||||
|
||||
io_eject_bpf(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
void io_unregister_bpf_ops(struct io_ring_ctx *ctx)
|
||||
{
|
||||
/*
|
||||
* ->bpf_ops is write protected by io_bpf_ctrl_mutex and uring_lock,
|
||||
* and read protected by either. Try to avoid taking the global lock
|
||||
* for rings that never had any bpf installed.
|
||||
*/
|
||||
scoped_guard(mutex, &ctx->uring_lock) {
|
||||
if (!ctx->bpf_ops)
|
||||
return;
|
||||
}
|
||||
|
||||
guard(mutex)(&io_bpf_ctrl_mutex);
|
||||
guard(mutex)(&ctx->uring_lock);
|
||||
if (ctx->bpf_ops)
|
||||
io_eject_bpf(ctx);
|
||||
}
|
||||
|
||||
static struct bpf_struct_ops bpf_ring_ops = {
|
||||
.verifier_ops = &bpf_io_verifier_ops,
|
||||
.reg = bpf_io_reg,
|
||||
.unreg = bpf_io_unreg,
|
||||
.check_member = bpf_io_check_member,
|
||||
.init_member = bpf_io_init_member,
|
||||
.init = bpf_io_init,
|
||||
.cfi_stubs = &io_bpf_ops_stubs,
|
||||
.name = "io_uring_bpf_ops",
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
/*
 * Init-time registration of the io_uring_bpf_ops struct_ops type.
 * Logs an error and returns the failure code if registration fails,
 * otherwise returns 0.
 */
static int __init io_uring_bpf_init(void)
{
	int err = register_bpf_struct_ops(&bpf_ring_ops, io_uring_bpf_ops);

	if (err)
		pr_err("io_uring: Failed to register struct_ops (%d)\n", err);

	return err;
}
|
||||
__initcall(io_uring_bpf_init);
|
||||
28
io_uring/bpf-ops.h
Normal file
28
io_uring/bpf-ops.h
Normal file
@@ -0,0 +1,28 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef IOU_BPF_OPS_H
|
||||
#define IOU_BPF_OPS_H
|
||||
|
||||
#include <linux/io_uring_types.h>
|
||||
|
||||
enum {
|
||||
IOU_REGION_MEM,
|
||||
IOU_REGION_CQ,
|
||||
IOU_REGION_SQ,
|
||||
};
|
||||
|
||||
struct io_uring_bpf_ops {
|
||||
int (*loop_step)(struct io_ring_ctx *ctx, struct iou_loop_params *lp);
|
||||
|
||||
__u32 ring_fd;
|
||||
void *priv;
|
||||
};
|
||||
|
||||
#ifdef CONFIG_IO_URING_BPF_OPS
|
||||
void io_unregister_bpf_ops(struct io_ring_ctx *ctx);
|
||||
#else
|
||||
static inline void io_unregister_bpf_ops(struct io_ring_ctx *ctx)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* IOU_BPF_OPS_H */
|
||||
@@ -156,9 +156,16 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
cancel->fd = READ_ONCE(sqe->fd);
|
||||
}
|
||||
if (cancel->flags & IORING_ASYNC_CANCEL_OP) {
|
||||
u32 op;
|
||||
|
||||
if (cancel->flags & IORING_ASYNC_CANCEL_ANY)
|
||||
return -EINVAL;
|
||||
cancel->opcode = READ_ONCE(sqe->len);
|
||||
|
||||
op = READ_ONCE(sqe->len);
|
||||
if (op >= IORING_OP_LAST)
|
||||
return -EINVAL;
|
||||
|
||||
cancel->opcode = op;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -7,6 +7,21 @@
|
||||
#include "uring_cmd.h"
|
||||
#include "io_uring.h"
|
||||
|
||||
static int io_uring_cmd_get_sock_ioctl(struct socket *sock, int op)
|
||||
{
|
||||
struct sock *sk = sock->sk;
|
||||
struct proto *prot = READ_ONCE(sk->sk_prot);
|
||||
int ret, arg = 0;
|
||||
|
||||
if (!prot || !prot->ioctl)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
ret = prot->ioctl(sk, op, &arg);
|
||||
if (ret)
|
||||
return ret;
|
||||
return arg;
|
||||
}
|
||||
|
||||
static inline int io_uring_cmd_getsockopt(struct socket *sock,
|
||||
struct io_uring_cmd *cmd,
|
||||
unsigned int issue_flags)
|
||||
@@ -156,27 +171,12 @@ static int io_uring_cmd_getsockname(struct socket *sock,
|
||||
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
|
||||
{
|
||||
struct socket *sock = cmd->file->private_data;
|
||||
struct sock *sk = sock->sk;
|
||||
struct proto *prot = READ_ONCE(sk->sk_prot);
|
||||
int ret, arg = 0;
|
||||
|
||||
switch (cmd->cmd_op) {
|
||||
case SOCKET_URING_OP_SIOCINQ:
|
||||
if (!prot || !prot->ioctl)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
ret = prot->ioctl(sk, SIOCINQ, &arg);
|
||||
if (ret)
|
||||
return ret;
|
||||
return arg;
|
||||
return io_uring_cmd_get_sock_ioctl(sock, SIOCINQ);
|
||||
case SOCKET_URING_OP_SIOCOUTQ:
|
||||
if (!prot || !prot->ioctl)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
ret = prot->ioctl(sk, SIOCOUTQ, &arg);
|
||||
if (ret)
|
||||
return ret;
|
||||
return arg;
|
||||
return io_uring_cmd_get_sock_ioctl(sock, SIOCOUTQ);
|
||||
case SOCKET_URING_OP_GETSOCKOPT:
|
||||
return io_uring_cmd_getsockopt(sock, cmd, issue_flags);
|
||||
case SOCKET_URING_OP_SETSOCKOPT:
|
||||
|
||||
@@ -148,7 +148,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
|
||||
spin_unlock(&ctx->completion_lock);
|
||||
|
||||
ev_fd->eventfd_async = eventfd_async;
|
||||
ctx->has_evfd = true;
|
||||
ctx->int_flags |= IO_RING_F_HAS_EVFD;
|
||||
refcount_set(&ev_fd->refs, 1);
|
||||
atomic_set(&ev_fd->ops, 0);
|
||||
rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
|
||||
@@ -162,7 +162,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx)
|
||||
ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
|
||||
lockdep_is_held(&ctx->uring_lock));
|
||||
if (ev_fd) {
|
||||
ctx->has_evfd = false;
|
||||
ctx->int_flags &= ~IO_RING_F_HAS_EVFD;
|
||||
rcu_assign_pointer(ctx->io_ev_fd, NULL);
|
||||
io_eventfd_put(ev_fd);
|
||||
return 0;
|
||||
|
||||
@@ -87,6 +87,7 @@
|
||||
#include "msg_ring.h"
|
||||
#include "memmap.h"
|
||||
#include "zcrx.h"
|
||||
#include "bpf-ops.h"
|
||||
|
||||
#include "timeout.h"
|
||||
#include "poll.h"
|
||||
@@ -95,6 +96,7 @@
|
||||
#include "eventfd.h"
|
||||
#include "wait.h"
|
||||
#include "bpf_filter.h"
|
||||
#include "loop.h"
|
||||
|
||||
#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
|
||||
IOSQE_IO_HARDLINK | IOSQE_ASYNC)
|
||||
@@ -356,7 +358,6 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
|
||||
static void io_prep_async_work(struct io_kiocb *req)
|
||||
{
|
||||
const struct io_issue_def *def = &io_issue_defs[req->opcode];
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
|
||||
if (!(req->flags & REQ_F_CREDS)) {
|
||||
req->flags |= REQ_F_CREDS;
|
||||
@@ -378,7 +379,7 @@ static void io_prep_async_work(struct io_kiocb *req)
|
||||
if (should_hash && (req->file->f_flags & O_DIRECT) &&
|
||||
(req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE))
|
||||
should_hash = false;
|
||||
if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL))
|
||||
if (should_hash || (req->flags & REQ_F_IOPOLL))
|
||||
io_wq_hash_work(&req->work, file_inode(req->file));
|
||||
} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
|
||||
if (def->unbound_nonreg_file)
|
||||
@@ -477,17 +478,17 @@ static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
|
||||
|
||||
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
|
||||
{
|
||||
if (ctx->poll_activated)
|
||||
if (ctx->int_flags & IO_RING_F_POLL_ACTIVATED)
|
||||
io_poll_wq_wake(ctx);
|
||||
if (ctx->off_timeout_used)
|
||||
if (ctx->int_flags & IO_RING_F_OFF_TIMEOUT_USED)
|
||||
io_flush_timeouts(ctx);
|
||||
if (ctx->has_evfd)
|
||||
if (ctx->int_flags & IO_RING_F_HAS_EVFD)
|
||||
io_eventfd_signal(ctx, true);
|
||||
}
|
||||
|
||||
static inline void __io_cq_lock(struct io_ring_ctx *ctx)
|
||||
{
|
||||
if (!ctx->lockless_cq)
|
||||
if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ))
|
||||
spin_lock(&ctx->completion_lock);
|
||||
}
|
||||
|
||||
@@ -500,11 +501,11 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx)
|
||||
static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
|
||||
{
|
||||
io_commit_cqring(ctx);
|
||||
if (!ctx->task_complete) {
|
||||
if (!ctx->lockless_cq)
|
||||
if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) {
|
||||
if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ))
|
||||
spin_unlock(&ctx->completion_lock);
|
||||
/* IOPOLL rings only need to wake up if it's also SQPOLL */
|
||||
if (!ctx->syscall_iopoll)
|
||||
if (!(ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL))
|
||||
io_cqring_wake(ctx);
|
||||
}
|
||||
io_commit_cqring_flush(ctx);
|
||||
@@ -589,6 +590,11 @@ void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
}
|
||||
|
||||
/*
 * Flush overflowed CQEs by calling __io_cqring_overflow_flush(ctx, false).
 * NOTE(review): the "_locked" suffix suggests the caller must already hold
 * ctx->uring_lock - confirm against callers.
 */
void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx)
|
||||
{
|
||||
__io_cqring_overflow_flush(ctx, false);
|
||||
}
|
||||
|
||||
/* must be called somewhat shortly after putting a request */
|
||||
static inline void io_put_task(struct io_kiocb *req)
|
||||
{
|
||||
@@ -830,7 +836,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags
|
||||
void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
|
||||
{
|
||||
lockdep_assert_held(&ctx->uring_lock);
|
||||
lockdep_assert(ctx->lockless_cq);
|
||||
lockdep_assert(ctx->int_flags & IO_RING_F_LOCKLESS_CQ);
|
||||
|
||||
if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) {
|
||||
struct io_cqe cqe = io_init_cqe(user_data, res, cflags);
|
||||
@@ -860,7 +866,7 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
|
||||
lockdep_assert(!io_wq_current_is_worker());
|
||||
lockdep_assert_held(&ctx->uring_lock);
|
||||
|
||||
if (!ctx->lockless_cq) {
|
||||
if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) {
|
||||
spin_lock(&ctx->completion_lock);
|
||||
posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags);
|
||||
spin_unlock(&ctx->completion_lock);
|
||||
@@ -885,7 +891,7 @@ bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe cqe[2])
|
||||
lockdep_assert_held(&ctx->uring_lock);
|
||||
|
||||
cqe[0].user_data = req->cqe.user_data;
|
||||
if (!ctx->lockless_cq) {
|
||||
if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) {
|
||||
spin_lock(&ctx->completion_lock);
|
||||
posted = io_fill_cqe_aux32(ctx, cqe);
|
||||
spin_unlock(&ctx->completion_lock);
|
||||
@@ -913,7 +919,7 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
|
||||
* Handle special CQ sync cases via task_work. DEFER_TASKRUN requires
|
||||
* the submitter task context, IOPOLL protects with uring_lock.
|
||||
*/
|
||||
if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) {
|
||||
if ((ctx->int_flags & IO_RING_F_LOCKLESS_CQ) || (req->flags & REQ_F_REISSUE)) {
|
||||
defer_complete:
|
||||
req->io_task_work.func = io_req_task_complete;
|
||||
io_req_task_work_add(req);
|
||||
@@ -1067,12 +1073,14 @@ void io_queue_next(struct io_kiocb *req)
|
||||
|
||||
static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
|
||||
{
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
|
||||
if (req->file_node) {
|
||||
io_put_rsrc_node(req->ctx, req->file_node);
|
||||
io_put_rsrc_node(ctx, req->file_node);
|
||||
req->file_node = NULL;
|
||||
}
|
||||
if (req->flags & REQ_F_BUF_NODE)
|
||||
io_put_rsrc_node(req->ctx, req->buf_node);
|
||||
io_put_rsrc_node(ctx, req->buf_node);
|
||||
}
|
||||
|
||||
static void io_free_batch_list(struct io_ring_ctx *ctx,
|
||||
@@ -1135,7 +1143,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
|
||||
*/
|
||||
if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) &&
|
||||
unlikely(!io_fill_cqe_req(ctx, req))) {
|
||||
if (ctx->lockless_cq)
|
||||
if (ctx->int_flags & IO_RING_F_LOCKLESS_CQ)
|
||||
io_cqe_overflow(ctx, &req->cqe, &req->big_cqe);
|
||||
else
|
||||
io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe);
|
||||
@@ -1148,7 +1156,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
|
||||
INIT_WQ_LIST(&state->compl_reqs);
|
||||
}
|
||||
|
||||
if (unlikely(ctx->drain_active))
|
||||
if (unlikely(ctx->int_flags & IO_RING_F_DRAIN_ACTIVE))
|
||||
io_queue_deferred(ctx);
|
||||
|
||||
ctx->submit_state.cq_flush = false;
|
||||
@@ -1187,7 +1195,6 @@ __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
|
||||
|
||||
static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
|
||||
{
|
||||
unsigned int nr_events = 0;
|
||||
unsigned long check_cq;
|
||||
|
||||
min_events = min(min_events, ctx->cq_entries);
|
||||
@@ -1230,8 +1237,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
|
||||
* very same mutex.
|
||||
*/
|
||||
if (list_empty(&ctx->iopoll_list) || io_task_work_pending(ctx)) {
|
||||
u32 tail = ctx->cached_cq_tail;
|
||||
|
||||
(void) io_run_local_work_locked(ctx, min_events);
|
||||
|
||||
if (task_work_pending(current) || list_empty(&ctx->iopoll_list)) {
|
||||
@@ -1240,7 +1245,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
}
|
||||
/* some requests don't go through iopoll_list */
|
||||
if (tail != ctx->cached_cq_tail || list_empty(&ctx->iopoll_list))
|
||||
if (list_empty(&ctx->iopoll_list))
|
||||
break;
|
||||
}
|
||||
ret = io_do_iopoll(ctx, !min_events);
|
||||
@@ -1251,9 +1256,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
|
||||
return -EINTR;
|
||||
if (need_resched())
|
||||
break;
|
||||
|
||||
nr_events += ret;
|
||||
} while (nr_events < min_events);
|
||||
} while (io_cqring_events(ctx) < min_events);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1344,7 +1347,7 @@ static __cold void io_drain_req(struct io_kiocb *req)
|
||||
list_add_tail(&de->list, &ctx->defer_list);
|
||||
io_queue_deferred(ctx);
|
||||
if (!drain && list_empty(&ctx->defer_list))
|
||||
ctx->drain_active = false;
|
||||
ctx->int_flags &= ~IO_RING_F_DRAIN_ACTIVE;
|
||||
}
|
||||
|
||||
static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
|
||||
@@ -1418,8 +1421,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
|
||||
if (ret == IOU_ISSUE_SKIP_COMPLETE) {
|
||||
ret = 0;
|
||||
|
||||
/* If the op doesn't have a file, we're not polling for it */
|
||||
if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
|
||||
if (req->flags & REQ_F_IOPOLL)
|
||||
io_iopoll_req_issued(req, issue_flags);
|
||||
}
|
||||
return ret;
|
||||
@@ -1435,7 +1437,7 @@ int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw)
|
||||
io_tw_lock(req->ctx, tw);
|
||||
|
||||
WARN_ON_ONCE(!req->file);
|
||||
if (WARN_ON_ONCE(req->ctx->flags & IORING_SETUP_IOPOLL))
|
||||
if (WARN_ON_ONCE(req->flags & REQ_F_IOPOLL))
|
||||
return -EFAULT;
|
||||
|
||||
ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]);
|
||||
@@ -1533,7 +1535,7 @@ fail:
|
||||
* wait for request slots on the block side.
|
||||
*/
|
||||
if (!needs_poll) {
|
||||
if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
|
||||
if (!(req->flags & REQ_F_IOPOLL))
|
||||
break;
|
||||
if (io_wq_worker_stopped())
|
||||
break;
|
||||
@@ -1655,7 +1657,7 @@ static void io_queue_sqe_fallback(struct io_kiocb *req)
|
||||
} else {
|
||||
/* can't fail with IO_URING_F_INLINE */
|
||||
io_req_sqe_copy(req, IO_URING_F_INLINE);
|
||||
if (unlikely(req->ctx->drain_active))
|
||||
if (unlikely(req->ctx->int_flags & IO_RING_F_DRAIN_ACTIVE))
|
||||
io_drain_req(req);
|
||||
else
|
||||
io_queue_iowq(req);
|
||||
@@ -1671,7 +1673,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
|
||||
struct io_kiocb *req,
|
||||
unsigned int sqe_flags)
|
||||
{
|
||||
if (!ctx->op_restricted)
|
||||
if (!(ctx->int_flags & IO_RING_F_OP_RESTRICTED))
|
||||
return true;
|
||||
if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
|
||||
return false;
|
||||
@@ -1691,7 +1693,7 @@ static void io_init_drain(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_kiocb *head = ctx->submit_state.link.head;
|
||||
|
||||
ctx->drain_active = true;
|
||||
ctx->int_flags |= IO_RING_F_DRAIN_ACTIVE;
|
||||
if (head) {
|
||||
/*
|
||||
* If we need to drain a request in the middle of a link, drain
|
||||
@@ -1701,7 +1703,7 @@ static void io_init_drain(struct io_ring_ctx *ctx)
|
||||
* link.
|
||||
*/
|
||||
head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
|
||||
ctx->drain_next = true;
|
||||
ctx->int_flags |= IO_RING_F_DRAIN_NEXT;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1767,23 +1769,23 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
|
||||
req->buf_index = READ_ONCE(sqe->buf_group);
|
||||
}
|
||||
if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
|
||||
ctx->drain_disabled = true;
|
||||
ctx->int_flags |= IO_RING_F_DRAIN_DISABLED;
|
||||
if (sqe_flags & IOSQE_IO_DRAIN) {
|
||||
if (ctx->drain_disabled)
|
||||
if (ctx->int_flags & IO_RING_F_DRAIN_DISABLED)
|
||||
return io_init_fail_req(req, -EOPNOTSUPP);
|
||||
io_init_drain(ctx);
|
||||
}
|
||||
}
|
||||
if (unlikely(ctx->op_restricted || ctx->drain_active || ctx->drain_next)) {
|
||||
if (unlikely(ctx->int_flags & (IO_RING_F_OP_RESTRICTED | IO_RING_F_DRAIN_ACTIVE | IO_RING_F_DRAIN_NEXT))) {
|
||||
if (!io_check_restriction(ctx, req, sqe_flags))
|
||||
return io_init_fail_req(req, -EACCES);
|
||||
/* knock it to the slow queue path, will be drained there */
|
||||
if (ctx->drain_active)
|
||||
if (ctx->int_flags & IO_RING_F_DRAIN_ACTIVE)
|
||||
req->flags |= REQ_F_FORCE_ASYNC;
|
||||
/* if there is no link, we're at "next" request and need to drain */
|
||||
if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
|
||||
ctx->drain_next = false;
|
||||
ctx->drain_active = true;
|
||||
if (unlikely(ctx->int_flags & IO_RING_F_DRAIN_NEXT) && !ctx->submit_state.link.head) {
|
||||
ctx->int_flags &= ~IO_RING_F_DRAIN_NEXT;
|
||||
ctx->int_flags |= IO_RING_F_DRAIN_ACTIVE;
|
||||
req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
|
||||
}
|
||||
}
|
||||
@@ -2148,12 +2150,13 @@ static __cold void io_req_caches_free(struct io_ring_ctx *ctx)
|
||||
|
||||
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
|
||||
{
|
||||
io_unregister_bpf_ops(ctx);
|
||||
io_sq_thread_finish(ctx);
|
||||
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
io_sqe_buffers_unregister(ctx);
|
||||
io_sqe_files_unregister(ctx);
|
||||
io_unregister_zcrx_ifqs(ctx);
|
||||
io_unregister_zcrx(ctx);
|
||||
io_cqring_overflow_kill(ctx);
|
||||
io_eventfd_unregister(ctx);
|
||||
io_free_alloc_caches(ctx);
|
||||
@@ -2204,7 +2207,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb)
|
||||
poll_wq_task_work);
|
||||
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
ctx->poll_activated = true;
|
||||
ctx->int_flags |= IO_RING_F_POLL_ACTIVATED;
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
|
||||
/*
|
||||
@@ -2219,9 +2222,9 @@ __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
|
||||
{
|
||||
spin_lock(&ctx->completion_lock);
|
||||
/* already activated or in progress */
|
||||
if (ctx->poll_activated || ctx->poll_wq_task_work.func)
|
||||
if ((ctx->int_flags & IO_RING_F_POLL_ACTIVATED) || ctx->poll_wq_task_work.func)
|
||||
goto out;
|
||||
if (WARN_ON_ONCE(!ctx->task_complete))
|
||||
if (WARN_ON_ONCE(!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)))
|
||||
goto out;
|
||||
if (!ctx->submitter_task)
|
||||
goto out;
|
||||
@@ -2242,7 +2245,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
|
||||
struct io_ring_ctx *ctx = file->private_data;
|
||||
__poll_t mask = 0;
|
||||
|
||||
if (unlikely(!ctx->poll_activated))
|
||||
if (unlikely(!(data_race(ctx->int_flags) & IO_RING_F_POLL_ACTIVATED)))
|
||||
io_activate_pollwq(ctx);
|
||||
/*
|
||||
* provides mb() which pairs with barrier from wq_has_sleeper
|
||||
@@ -2308,6 +2311,10 @@ static __cold void io_ring_exit_work(struct work_struct *work)
|
||||
struct io_tctx_node *node;
|
||||
int ret;
|
||||
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
io_terminate_zcrx(ctx);
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
|
||||
/*
|
||||
* If we're doing polled IO and end up having requests being
|
||||
* submitted async (out-of-line), then completions can come in while
|
||||
@@ -2539,6 +2546,40 @@ uaccess_end:
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Given an 'fd' value, return the io_uring file associated with it. If
|
||||
* 'registered' is true, then 'fd' is an index into the current task's
|
||||
* registered ring array. Otherwise, the normal fd table is used via fget().
|
||||
* Caller must call fput() on the returned file if it isn't a registered file,
|
||||
* unless it's an ERR_PTR.
|
||||
*
* Return: the file on success, or
*   ERR_PTR(-EINVAL)     'registered' is set and the task has no io_uring
*                        task context, or 'fd' >= IO_RINGFD_REG_MAX
*   ERR_PTR(-EBADF)      no file was found for 'fd'
*   ERR_PTR(-EOPNOTSUPP) the file is not an io_uring file
*
* NOTE(review): the fput() on the -EOPNOTSUPP path is reached for both lookup
* modes, but this function takes no reference for the registered lookup.
* Presumably registered entries are always io_uring files, so the path is
* only taken after fget() - confirm against the registration code.
*/
|
||||
struct file *io_uring_ctx_get_file(unsigned int fd, bool registered)
|
||||
{
|
||||
struct file *file;
|
||||
|
||||
if (registered) {
|
||||
/*
|
||||
* Ring fd has been registered via IORING_REGISTER_RING_FDS, we
|
||||
* need only dereference our task private array to find it.
|
||||
*/
|
||||
struct io_uring_task *tctx = current->io_uring;
|
||||
|
||||
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
|
||||
return ERR_PTR(-EINVAL);
|
||||
/* clamp the already range-checked index before using it as an array index */
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
|
||||
file = tctx->registered_rings[fd];
|
||||
} else {
|
||||
file = fget(fd);
|
||||
}
|
||||
|
||||
if (unlikely(!file))
|
||||
return ERR_PTR(-EBADF);
|
||||
if (io_is_uring_fops(file))
|
||||
return file;
|
||||
fput(file);
|
||||
return ERR_PTR(-EOPNOTSUPP);
|
||||
}
|
||||
|
||||
|
||||
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
|
||||
u32, min_complete, u32, flags, const void __user *, argp,
|
||||
size_t, argsz)
|
||||
@@ -2550,28 +2591,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
|
||||
if (unlikely(flags & ~IORING_ENTER_FLAGS))
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* Ring fd has been registered via IORING_REGISTER_RING_FDS, we
|
||||
* need only dereference our task private array to find it.
|
||||
*/
|
||||
if (flags & IORING_ENTER_REGISTERED_RING) {
|
||||
struct io_uring_task *tctx = current->io_uring;
|
||||
|
||||
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
|
||||
return -EINVAL;
|
||||
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
|
||||
file = tctx->registered_rings[fd];
|
||||
if (unlikely(!file))
|
||||
return -EBADF;
|
||||
} else {
|
||||
file = fget(fd);
|
||||
if (unlikely(!file))
|
||||
return -EBADF;
|
||||
ret = -EOPNOTSUPP;
|
||||
if (unlikely(!io_is_uring_fops(file)))
|
||||
goto out;
|
||||
}
|
||||
|
||||
file = io_uring_ctx_get_file(fd, flags & IORING_ENTER_REGISTERED_RING);
|
||||
if (IS_ERR(file))
|
||||
return PTR_ERR(file);
|
||||
ctx = file->private_data;
|
||||
ret = -EBADFD;
|
||||
/*
|
||||
@@ -2581,6 +2603,11 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
|
||||
if (unlikely(smp_load_acquire(&ctx->flags) & IORING_SETUP_R_DISABLED))
|
||||
goto out;
|
||||
|
||||
if (io_has_loop_ops(ctx)) {
|
||||
ret = io_run_loop(ctx);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* For SQ polling, the thread will do all submissions and completions.
|
||||
* Just return the requested submit count, and wake the thread if
|
||||
@@ -2610,7 +2637,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
|
||||
goto out;
|
||||
}
|
||||
if (flags & IORING_ENTER_GETEVENTS) {
|
||||
if (ctx->syscall_iopoll)
|
||||
if (ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL)
|
||||
goto iopoll_locked;
|
||||
/*
|
||||
* Ignore errors, we'll soon call io_cqring_wait() and
|
||||
@@ -2625,7 +2652,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
|
||||
if (flags & IORING_ENTER_GETEVENTS) {
|
||||
int ret2;
|
||||
|
||||
if (ctx->syscall_iopoll) {
|
||||
if (ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL) {
|
||||
/*
|
||||
* We disallow the app entering submit/complete with
|
||||
* polling, but we still need to lock the ring to
|
||||
@@ -2926,9 +2953,9 @@ static void io_ctx_restriction_clone(struct io_ring_ctx *ctx,
|
||||
if (dst->bpf_filters)
|
||||
WRITE_ONCE(ctx->bpf_filters, dst->bpf_filters->filters);
|
||||
if (dst->op_registered)
|
||||
ctx->op_restricted = 1;
|
||||
ctx->int_flags |= IO_RING_F_OP_RESTRICTED;
|
||||
if (dst->reg_registered)
|
||||
ctx->reg_restricted = 1;
|
||||
ctx->int_flags |= IO_RING_F_REG_RESTRICTED;
|
||||
}
|
||||
|
||||
static __cold int io_uring_create(struct io_ctx_config *config)
|
||||
@@ -2955,17 +2982,18 @@ static __cold int io_uring_create(struct io_ctx_config *config)
|
||||
|
||||
if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
|
||||
!(ctx->flags & IORING_SETUP_IOPOLL))
|
||||
ctx->task_complete = true;
|
||||
ctx->int_flags |= IO_RING_F_TASK_COMPLETE;
|
||||
|
||||
if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL))
|
||||
ctx->lockless_cq = true;
|
||||
if ((ctx->int_flags & IO_RING_F_TASK_COMPLETE) ||
|
||||
(ctx->flags & IORING_SETUP_IOPOLL))
|
||||
ctx->int_flags |= IO_RING_F_LOCKLESS_CQ;
|
||||
|
||||
/*
|
||||
* lazy poll_wq activation relies on ->task_complete for synchronisation
|
||||
* purposes, see io_activate_pollwq()
|
||||
*/
|
||||
if (!ctx->task_complete)
|
||||
ctx->poll_activated = true;
|
||||
if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE))
|
||||
ctx->int_flags |= IO_RING_F_POLL_ACTIVATED;
|
||||
|
||||
/*
|
||||
* When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
|
||||
@@ -2975,9 +3003,10 @@ static __cold int io_uring_create(struct io_ctx_config *config)
|
||||
*/
|
||||
if (ctx->flags & IORING_SETUP_IOPOLL &&
|
||||
!(ctx->flags & IORING_SETUP_SQPOLL))
|
||||
ctx->syscall_iopoll = 1;
|
||||
ctx->int_flags |= IO_RING_F_SYSCALL_IOPOLL;
|
||||
|
||||
ctx->compat = in_compat_syscall();
|
||||
if (in_compat_syscall())
|
||||
ctx->int_flags |= IO_RING_F_COMPAT;
|
||||
if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK))
|
||||
ctx->user = get_uid(current_user());
|
||||
|
||||
|
||||
@@ -185,6 +185,7 @@ void io_req_track_inflight(struct io_kiocb *req);
|
||||
struct file *io_file_get_normal(struct io_kiocb *req, int fd);
|
||||
struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
|
||||
unsigned issue_flags);
|
||||
struct file *io_uring_ctx_get_file(unsigned int fd, bool registered);
|
||||
|
||||
void io_req_task_queue(struct io_kiocb *req);
|
||||
void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw);
|
||||
@@ -223,7 +224,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
|
||||
|
||||
if (ctx->flags & IORING_SETUP_IOPOLL) {
|
||||
lockdep_assert_held(&ctx->uring_lock);
|
||||
} else if (!ctx->task_complete) {
|
||||
} else if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) {
|
||||
lockdep_assert_held(&ctx->completion_lock);
|
||||
} else if (ctx->submitter_task) {
|
||||
/*
|
||||
@@ -240,7 +241,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
|
||||
|
||||
static inline bool io_is_compat(struct io_ring_ctx *ctx)
|
||||
{
|
||||
return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->compat);
|
||||
return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->int_flags & IO_RING_F_COMPAT);
|
||||
}
|
||||
|
||||
static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
|
||||
@@ -494,10 +495,12 @@ static inline void io_req_complete_defer(struct io_kiocb *req)
|
||||
wq_list_add_tail(&req->comp_list, &state->compl_reqs);
|
||||
}
|
||||
|
||||
#define SHOULD_FLUSH_MASK (IO_RING_F_OFF_TIMEOUT_USED | \
|
||||
IO_RING_F_HAS_EVFD | IO_RING_F_POLL_ACTIVATED)
|
||||
|
||||
static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
|
||||
{
|
||||
if (unlikely(ctx->off_timeout_used ||
|
||||
ctx->has_evfd || ctx->poll_activated))
|
||||
if (unlikely(data_race(ctx->int_flags) & SHOULD_FLUSH_MASK))
|
||||
__io_commit_cqring_flush(ctx);
|
||||
}
|
||||
|
||||
|
||||
@@ -230,7 +230,7 @@ struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len,
|
||||
struct io_br_sel sel = { };
|
||||
struct io_buffer_list *bl;
|
||||
|
||||
io_ring_submit_lock(req->ctx, issue_flags);
|
||||
io_ring_submit_lock(ctx, issue_flags);
|
||||
|
||||
bl = io_buffer_get_list(ctx, buf_group);
|
||||
if (likely(bl)) {
|
||||
@@ -239,7 +239,7 @@ struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len,
|
||||
else
|
||||
sel.addr = io_provided_buffer_select(req, len, bl);
|
||||
}
|
||||
io_ring_submit_unlock(req->ctx, issue_flags);
|
||||
io_ring_submit_unlock(ctx, issue_flags);
|
||||
return sel;
|
||||
}
|
||||
|
||||
|
||||
91
io_uring/loop.c
Normal file
91
io_uring/loop.c
Normal file
@@ -0,0 +1,91 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include "io_uring.h"
|
||||
#include "wait.h"
|
||||
#include "loop.h"
|
||||
|
||||
/*
* Distance from the current CQ tail (read once from ctx->rings->cq.tail) to
* the index the loop wants to wait for, lp->cq_wait_idx. The difference is
* returned as a signed int: callers treat a value > 0 as "that many CQEs are
* still missing" and a value <= 0 as "the wait index has been reached".
*/
static inline int io_loop_nr_cqes(const struct io_ring_ctx *ctx,
|
||||
const struct iou_loop_params *lp)
|
||||
{
|
||||
return lp->cq_wait_idx - READ_ONCE(ctx->rings->cq.tail);
|
||||
}
|
||||
|
||||
/*
* Prepare the current task for sleeping: store the number of CQEs being
* waited for in ctx->cq_wait_nr, then mark the task TASK_INTERRUPTIBLE.
* Paired with io_loop_wait_finish().
*/
static inline void io_loop_wait_start(struct io_ring_ctx *ctx, unsigned nr_wait)
|
||||
{
|
||||
atomic_set(&ctx->cq_wait_nr, nr_wait);
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
}
|
||||
|
||||
/*
* Undo io_loop_wait_start(): put the task back into TASK_RUNNING and reset
* ctx->cq_wait_nr to IO_CQ_WAKE_INIT.
*/
static inline void io_loop_wait_finish(struct io_ring_ctx *ctx)
|
||||
{
|
||||
__set_current_state(TASK_RUNNING);
|
||||
atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
|
||||
}
|
||||
|
||||
/*
* Sleep until woken, waiting for 'nr_wait' more CQEs.
*
* Called with ctx->uring_lock held; the lock is dropped around schedule() and
* re-taken before returning, so it is held again on return.
*
* After the task state has been set up, the function returns without sleeping
* if local work is pending, the CQ tail has already reached lp->cq_wait_idx,
* or ctx->check_cq is non-zero.
* NOTE(review): the re-check happens after set_current_state(), presumably so
* that a wakeup arriving between the check and schedule() is not lost -
* confirm against the waker side.
*/
static void io_loop_wait(struct io_ring_ctx *ctx, struct iou_loop_params *lp,
|
||||
unsigned nr_wait)
|
||||
{
|
||||
io_loop_wait_start(ctx, nr_wait);
|
||||
|
||||
if (unlikely(io_local_work_pending(ctx) ||
|
||||
io_loop_nr_cqes(ctx, lp) <= 0) ||
|
||||
READ_ONCE(ctx->check_cq)) {
|
||||
io_loop_wait_finish(ctx);
|
||||
return;
|
||||
}
|
||||
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
schedule();
|
||||
io_loop_wait_finish(ctx);
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
}
|
||||
|
||||
/*
* Callback driven main loop. Runs with ctx->uring_lock held (it is dropped
* temporarily while sleeping and while running task work).
*
* Each iteration calls ctx->loop_step() with a pointer to the loop
* parameters, which start out zeroed and persist across iterations. After a
* step that returns IOU_LOOP_CONTINUE the loop:
*   - sleeps in io_loop_wait() if lp.cq_wait_idx is still ahead of the CQ tail,
*   - runs pending task work with the lock dropped,
*   - runs local work, and
*   - flushes the CQ overflow list if the overflow bit is set in check_cq.
*
* Return:
*   0        the callback returned IOU_LOOP_STOP
*   -EFAULT  no loop_step callback is set (checked on every iteration)
*   -EINVAL  the callback returned something other than CONTINUE/STOP
*   -EINTR   a signal is pending for the current task
*/
static int __io_run_loop(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct iou_loop_params lp = {};
|
||||
|
||||
while (true) {
|
||||
int nr_wait, step_res;
|
||||
|
||||
if (unlikely(!ctx->loop_step))
|
||||
return -EFAULT;
|
||||
|
||||
step_res = ctx->loop_step(ctx, &lp);
|
||||
if (step_res == IOU_LOOP_STOP)
|
||||
break;
|
||||
if (step_res != IOU_LOOP_CONTINUE)
|
||||
return -EINVAL;
|
||||
|
||||
/* number of CQEs still missing before cq_wait_idx; <= 0 means no wait */
nr_wait = io_loop_nr_cqes(ctx, &lp);
|
||||
if (nr_wait > 0)
|
||||
io_loop_wait(ctx, &lp, nr_wait);
|
||||
else
|
||||
nr_wait = 0;
|
||||
|
||||
if (task_work_pending(current)) {
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
io_run_task_work();
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
}
|
||||
if (unlikely(task_sigpending(current)))
|
||||
return -EINTR;
|
||||
io_run_local_work_locked(ctx, nr_wait);
|
||||
|
||||
if (READ_ONCE(ctx->check_cq) & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
|
||||
io_cqring_overflow_flush_locked(ctx);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
* Entry point for the callback driven loop.
*
* Returns -EEXIST if io_allowed_run_tw() rejects the current context.
* Otherwise takes ctx->uring_lock, runs __io_run_loop() and returns its
* result after dropping the lock again.
*/
int io_run_loop(struct io_ring_ctx *ctx)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (!io_allowed_run_tw(ctx))
|
||||
return -EEXIST;
|
||||
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
ret = __io_run_loop(ctx);
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
return ret;
|
||||
}
|
||||
27
io_uring/loop.h
Normal file
27
io_uring/loop.h
Normal file
@@ -0,0 +1,27 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef IOU_LOOP_H
|
||||
#define IOU_LOOP_H
|
||||
|
||||
#include <linux/io_uring_types.h>
|
||||
|
||||
/* Parameters passed by pointer to the loop step callback on every iteration. */
struct iou_loop_params {
|
||||
/*
|
||||
* The CQE index to wait for. It only serves as a hint: the waiter
|
||||
* can still be woken up earlier.
|
||||
*/
|
||||
__u32 cq_wait_idx;
|
||||
};
|
||||
|
||||
/*
* Result codes of a loop step: IOU_LOOP_CONTINUE asks for another iteration,
* IOU_LOOP_STOP ends the loop.
*/
enum {
|
||||
IOU_LOOP_CONTINUE = 0,
|
||||
IOU_LOOP_STOP,
|
||||
};
|
||||
|
||||
/*
* Returns true if a loop_step callback is set on the ring. The pointer is
* read inside data_race(), i.e. as a deliberately unsynchronized read; callers
* must not rely on the callback still being set afterwards.
*/
static inline bool io_has_loop_ops(struct io_ring_ctx *ctx)
|
||||
{
|
||||
return data_race(ctx->loop_step);
|
||||
}
|
||||
|
||||
int io_run_loop(struct io_ring_ctx *ctx);
|
||||
|
||||
#endif
|
||||
@@ -67,7 +67,7 @@ void io_msg_ring_cleanup(struct io_kiocb *req)
|
||||
|
||||
static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx)
|
||||
{
|
||||
return target_ctx->task_complete;
|
||||
return target_ctx->int_flags & IO_RING_F_TASK_COMPLETE;
|
||||
}
|
||||
|
||||
static void io_msg_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw)
|
||||
|
||||
162
io_uring/net.c
162
io_uring/net.c
@@ -375,10 +375,13 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
kmsg->msg.msg_namelen = addr_len;
|
||||
}
|
||||
if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
|
||||
if (sr->flags & IORING_SEND_VECTORIZED)
|
||||
return -EINVAL;
|
||||
req->flags |= REQ_F_IMPORT_BUFFER;
|
||||
return 0;
|
||||
if (!(sr->flags & IORING_SEND_VECTORIZED)) {
|
||||
req->flags |= REQ_F_IMPORT_BUFFER;
|
||||
return 0;
|
||||
}
|
||||
|
||||
kmsg->msg.msg_iter.nr_segs = sr->len;
|
||||
return io_prep_reg_iovec(req, &kmsg->vec, sr->buf, sr->len);
|
||||
}
|
||||
if (req->flags & REQ_F_BUFFER_SELECT)
|
||||
return 0;
|
||||
@@ -396,6 +399,7 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe
|
||||
struct user_msghdr msg;
|
||||
int ret;
|
||||
|
||||
sr->flags |= IORING_SEND_VECTORIZED;
|
||||
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
|
||||
ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL);
|
||||
if (unlikely(ret))
|
||||
@@ -1333,11 +1337,12 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
struct io_async_msghdr *iomsg;
|
||||
struct io_kiocb *notif;
|
||||
u64 user_data;
|
||||
int ret;
|
||||
|
||||
zc->done_io = 0;
|
||||
|
||||
if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
|
||||
if (unlikely(READ_ONCE(sqe->__pad2[0])))
|
||||
return -EINVAL;
|
||||
/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
|
||||
if (req->flags & REQ_F_CQE_SKIP)
|
||||
@@ -1346,7 +1351,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
notif = zc->notif = io_alloc_notif(ctx);
|
||||
if (!notif)
|
||||
return -ENOMEM;
|
||||
notif->cqe.user_data = req->cqe.user_data;
|
||||
user_data = READ_ONCE(sqe->addr3);
|
||||
if (!user_data)
|
||||
user_data = req->cqe.user_data;
|
||||
|
||||
notif->cqe.user_data = user_data;
|
||||
notif->cqe.res = 0;
|
||||
notif->cqe.flags = IORING_CQE_F_NOTIF;
|
||||
req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY;
|
||||
@@ -1370,7 +1379,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
if (zc->msg_flags & MSG_DONTWAIT)
|
||||
req->flags |= REQ_F_NOWAIT;
|
||||
|
||||
if (io_is_compat(req->ctx))
|
||||
if (io_is_compat(ctx))
|
||||
zc->msg_flags |= MSG_CMSG_COMPAT;
|
||||
|
||||
iomsg = io_msg_alloc_async(req);
|
||||
@@ -1445,22 +1454,39 @@ static int io_sg_from_iter(struct sk_buff *skb,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
|
||||
static int io_send_zc_import(struct io_kiocb *req,
|
||||
struct io_async_msghdr *kmsg,
|
||||
unsigned int issue_flags)
|
||||
{
|
||||
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
|
||||
struct io_async_msghdr *kmsg = req->async_data;
|
||||
struct io_kiocb *notif = sr->notif;
|
||||
int ret;
|
||||
|
||||
WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF));
|
||||
|
||||
sr->notif->buf_index = req->buf_index;
|
||||
return io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter,
|
||||
(u64)(uintptr_t)sr->buf, sr->len,
|
||||
ITER_SOURCE, issue_flags);
|
||||
notif->buf_index = req->buf_index;
|
||||
|
||||
if (!(sr->flags & IORING_SEND_VECTORIZED)) {
|
||||
ret = io_import_reg_buf(notif, &kmsg->msg.msg_iter,
|
||||
(u64)(uintptr_t)sr->buf, sr->len,
|
||||
ITER_SOURCE, issue_flags);
|
||||
} else {
|
||||
unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
|
||||
|
||||
ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter,
|
||||
notif, &kmsg->vec, uvec_segs,
|
||||
issue_flags);
|
||||
}
|
||||
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
req->flags &= ~REQ_F_IMPORT_BUFFER;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
|
||||
int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
|
||||
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
|
||||
struct io_async_msghdr *kmsg = req->async_data;
|
||||
struct socket *sock;
|
||||
unsigned msg_flags;
|
||||
@@ -1471,106 +1497,38 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
|
||||
return -ENOTSOCK;
|
||||
if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
if (!(req->flags & REQ_F_POLLED) &&
|
||||
(zc->flags & IORING_RECVSEND_POLL_FIRST))
|
||||
return -EAGAIN;
|
||||
|
||||
if (req->flags & REQ_F_IMPORT_BUFFER) {
|
||||
req->flags &= ~REQ_F_IMPORT_BUFFER;
|
||||
ret = io_send_zc_import(req, issue_flags);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
}
|
||||
|
||||
msg_flags = zc->msg_flags;
|
||||
if (issue_flags & IO_URING_F_NONBLOCK)
|
||||
msg_flags |= MSG_DONTWAIT;
|
||||
if (msg_flags & MSG_WAITALL)
|
||||
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
|
||||
msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
|
||||
|
||||
kmsg->msg.msg_flags = msg_flags;
|
||||
kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
|
||||
ret = sock_sendmsg(sock, &kmsg->msg);
|
||||
|
||||
if (unlikely(ret < min_ret)) {
|
||||
if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
|
||||
return -EAGAIN;
|
||||
|
||||
if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) {
|
||||
zc->done_io += ret;
|
||||
return -EAGAIN;
|
||||
}
|
||||
if (ret == -ERESTARTSYS)
|
||||
ret = -EINTR;
|
||||
req_set_fail(req);
|
||||
}
|
||||
|
||||
if (ret >= 0)
|
||||
ret += zc->done_io;
|
||||
else if (zc->done_io)
|
||||
ret = zc->done_io;
|
||||
|
||||
/*
|
||||
* If we're in io-wq we can't rely on tw ordering guarantees, defer
|
||||
* flushing notif to io_send_zc_cleanup()
|
||||
*/
|
||||
if (!(issue_flags & IO_URING_F_UNLOCKED)) {
|
||||
io_notif_flush(zc->notif);
|
||||
zc->notif = NULL;
|
||||
io_req_msg_cleanup(req, 0);
|
||||
}
|
||||
io_req_set_res(req, ret, IORING_CQE_F_MORE);
|
||||
return IOU_COMPLETE;
|
||||
}
|
||||
|
||||
int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
|
||||
struct io_async_msghdr *kmsg = req->async_data;
|
||||
struct socket *sock;
|
||||
unsigned flags;
|
||||
int ret, min_ret = 0;
|
||||
|
||||
if (req->flags & REQ_F_IMPORT_BUFFER) {
|
||||
unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
|
||||
int ret;
|
||||
|
||||
sr->notif->buf_index = req->buf_index;
|
||||
ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter,
|
||||
sr->notif, &kmsg->vec, uvec_segs,
|
||||
issue_flags);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
req->flags &= ~REQ_F_IMPORT_BUFFER;
|
||||
}
|
||||
|
||||
sock = sock_from_file(req->file);
|
||||
if (unlikely(!sock))
|
||||
return -ENOTSOCK;
|
||||
if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
if (!(req->flags & REQ_F_POLLED) &&
|
||||
(sr->flags & IORING_RECVSEND_POLL_FIRST))
|
||||
return -EAGAIN;
|
||||
|
||||
flags = sr->msg_flags;
|
||||
if (req->flags & REQ_F_IMPORT_BUFFER) {
|
||||
ret = io_send_zc_import(req, kmsg, issue_flags);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
}
|
||||
|
||||
msg_flags = sr->msg_flags;
|
||||
if (issue_flags & IO_URING_F_NONBLOCK)
|
||||
flags |= MSG_DONTWAIT;
|
||||
if (flags & MSG_WAITALL)
|
||||
msg_flags |= MSG_DONTWAIT;
|
||||
if (msg_flags & MSG_WAITALL)
|
||||
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
|
||||
|
||||
kmsg->msg.msg_control_user = sr->msg_control;
|
||||
kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
|
||||
ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
|
||||
|
||||
if (req->opcode == IORING_OP_SEND_ZC) {
|
||||
msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
|
||||
kmsg->msg.msg_flags = msg_flags;
|
||||
ret = sock_sendmsg(sock, &kmsg->msg);
|
||||
} else {
|
||||
kmsg->msg.msg_control_user = sr->msg_control;
|
||||
ret = __sys_sendmsg_sock(sock, &kmsg->msg, msg_flags);
|
||||
}
|
||||
|
||||
if (unlikely(ret < min_ret)) {
|
||||
if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
|
||||
return -EAGAIN;
|
||||
|
||||
if (ret > 0 && io_net_retry(sock, flags)) {
|
||||
if (ret > 0 && io_net_retry(sock, sr->msg_flags)) {
|
||||
sr->done_io += ret;
|
||||
return -EAGAIN;
|
||||
}
|
||||
|
||||
@@ -50,7 +50,6 @@ void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req)
|
||||
int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_connect(struct io_kiocb *req, unsigned int issue_flags);
|
||||
|
||||
int io_send_zc(struct io_kiocb *req, unsigned int issue_flags);
|
||||
int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags);
|
||||
int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
void io_send_zc_cleanup(struct io_kiocb *req);
|
||||
|
||||
@@ -67,7 +67,6 @@ const struct io_issue_def io_issue_defs[] = {
|
||||
.audit_skip = 1,
|
||||
.ioprio = 1,
|
||||
.iopoll = 1,
|
||||
.iopoll_queue = 1,
|
||||
.vectored = 1,
|
||||
.async_size = sizeof(struct io_async_rw),
|
||||
.prep = io_prep_readv,
|
||||
@@ -82,7 +81,6 @@ const struct io_issue_def io_issue_defs[] = {
|
||||
.audit_skip = 1,
|
||||
.ioprio = 1,
|
||||
.iopoll = 1,
|
||||
.iopoll_queue = 1,
|
||||
.vectored = 1,
|
||||
.async_size = sizeof(struct io_async_rw),
|
||||
.prep = io_prep_writev,
|
||||
@@ -102,7 +100,6 @@ const struct io_issue_def io_issue_defs[] = {
|
||||
.audit_skip = 1,
|
||||
.ioprio = 1,
|
||||
.iopoll = 1,
|
||||
.iopoll_queue = 1,
|
||||
.async_size = sizeof(struct io_async_rw),
|
||||
.prep = io_prep_read_fixed,
|
||||
.issue = io_read_fixed,
|
||||
@@ -116,7 +113,6 @@ const struct io_issue_def io_issue_defs[] = {
|
||||
.audit_skip = 1,
|
||||
.ioprio = 1,
|
||||
.iopoll = 1,
|
||||
.iopoll_queue = 1,
|
||||
.async_size = sizeof(struct io_async_rw),
|
||||
.prep = io_prep_write_fixed,
|
||||
.issue = io_write_fixed,
|
||||
@@ -250,7 +246,6 @@ const struct io_issue_def io_issue_defs[] = {
|
||||
.audit_skip = 1,
|
||||
.ioprio = 1,
|
||||
.iopoll = 1,
|
||||
.iopoll_queue = 1,
|
||||
.async_size = sizeof(struct io_async_rw),
|
||||
.prep = io_prep_read,
|
||||
.issue = io_read,
|
||||
@@ -264,7 +259,6 @@ const struct io_issue_def io_issue_defs[] = {
|
||||
.audit_skip = 1,
|
||||
.ioprio = 1,
|
||||
.iopoll = 1,
|
||||
.iopoll_queue = 1,
|
||||
.async_size = sizeof(struct io_async_rw),
|
||||
.prep = io_prep_write,
|
||||
.issue = io_write,
|
||||
@@ -423,7 +417,6 @@ const struct io_issue_def io_issue_defs[] = {
|
||||
.needs_file = 1,
|
||||
.plug = 1,
|
||||
.iopoll = 1,
|
||||
.iopoll_queue = 1,
|
||||
.async_size = sizeof(struct io_async_cmd),
|
||||
.prep = io_uring_cmd_prep,
|
||||
.issue = io_uring_cmd,
|
||||
@@ -437,7 +430,7 @@ const struct io_issue_def io_issue_defs[] = {
|
||||
#if defined(CONFIG_NET)
|
||||
.async_size = sizeof(struct io_async_msghdr),
|
||||
.prep = io_send_zc_prep,
|
||||
.issue = io_send_zc,
|
||||
.issue = io_sendmsg_zc,
|
||||
#else
|
||||
.prep = io_eopnotsupp_prep,
|
||||
#endif
|
||||
@@ -556,7 +549,6 @@ const struct io_issue_def io_issue_defs[] = {
|
||||
.audit_skip = 1,
|
||||
.ioprio = 1,
|
||||
.iopoll = 1,
|
||||
.iopoll_queue = 1,
|
||||
.vectored = 1,
|
||||
.async_size = sizeof(struct io_async_rw),
|
||||
.prep = io_prep_readv_fixed,
|
||||
@@ -571,7 +563,6 @@ const struct io_issue_def io_issue_defs[] = {
|
||||
.audit_skip = 1,
|
||||
.ioprio = 1,
|
||||
.iopoll = 1,
|
||||
.iopoll_queue = 1,
|
||||
.vectored = 1,
|
||||
.async_size = sizeof(struct io_async_rw),
|
||||
.prep = io_prep_writev_fixed,
|
||||
@@ -593,7 +584,6 @@ const struct io_issue_def io_issue_defs[] = {
|
||||
.needs_file = 1,
|
||||
.plug = 1,
|
||||
.iopoll = 1,
|
||||
.iopoll_queue = 1,
|
||||
.is_128 = 1,
|
||||
.async_size = sizeof(struct io_async_cmd),
|
||||
.prep = io_uring_cmd_prep,
|
||||
|
||||
@@ -25,8 +25,6 @@ struct io_issue_def {
|
||||
unsigned poll_exclusive : 1;
|
||||
/* skip auditing */
|
||||
unsigned audit_skip : 1;
|
||||
/* have to be put into the iopoll list */
|
||||
unsigned iopoll_queue : 1;
|
||||
/* vectored opcode, set if 1) vectored, and 2) handler needs to know */
|
||||
unsigned vectored : 1;
|
||||
/* set to 1 if this opcode uses 128b sqes in a mixed sq */
|
||||
|
||||
@@ -277,8 +277,10 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw)
|
||||
|
||||
/* the mask was stashed in __io_poll_execute */
|
||||
if (!req->cqe.res) {
|
||||
struct poll_table_struct pt = { ._key = req->apoll_events };
|
||||
req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
|
||||
__poll_t events = req->apoll_events;
|
||||
struct poll_table_struct pt = { ._key = events };
|
||||
|
||||
req->cqe.res = vfs_poll(req->file, &pt) & events;
|
||||
/*
|
||||
* We got woken with a mask, but someone else got to
|
||||
* it first. The above vfs_poll() doesn't add us back
|
||||
@@ -287,7 +289,7 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw)
|
||||
*/
|
||||
if (unlikely(!req->cqe.res)) {
|
||||
/* Multishot armed need not reissue */
|
||||
if (!(req->apoll_events & EPOLLONESHOT))
|
||||
if (!(events & EPOLLONESHOT))
|
||||
continue;
|
||||
return IOU_POLL_REISSUE;
|
||||
}
|
||||
|
||||
@@ -34,12 +34,12 @@ static ssize_t io_query_zcrx(union io_query_data *data)
|
||||
{
|
||||
struct io_uring_query_zcrx *e = &data->zcrx;
|
||||
|
||||
e->register_flags = ZCRX_REG_IMPORT;
|
||||
e->register_flags = ZCRX_SUPPORTED_REG_FLAGS;
|
||||
e->area_flags = IORING_ZCRX_AREA_DMABUF;
|
||||
e->nr_ctrl_opcodes = __ZCRX_CTRL_LAST;
|
||||
e->rq_hdr_size = sizeof(struct io_uring);
|
||||
e->rq_hdr_alignment = L1_CACHE_BYTES;
|
||||
e->features = ZCRX_FEATURE_RX_PAGE_SIZE;
|
||||
e->features = ZCRX_FEATURES;
|
||||
e->__resv2 = 0;
|
||||
return sizeof(*e);
|
||||
}
|
||||
|
||||
@@ -192,9 +192,9 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
|
||||
return ret;
|
||||
}
|
||||
if (ctx->restrictions.op_registered)
|
||||
ctx->op_restricted = 1;
|
||||
ctx->int_flags |= IO_RING_F_OP_RESTRICTED;
|
||||
if (ctx->restrictions.reg_registered)
|
||||
ctx->reg_restricted = 1;
|
||||
ctx->int_flags |= IO_RING_F_REG_RESTRICTED;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -392,7 +392,7 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
|
||||
for (i = 0; i < ARRAY_SIZE(new_count); i++)
|
||||
if (new_count[i])
|
||||
ctx->iowq_limits[i] = new_count[i];
|
||||
ctx->iowq_limits_set = true;
|
||||
ctx->int_flags |= IO_RING_F_IOWQ_LIMITS_SET;
|
||||
|
||||
if (tctx && tctx->io_wq) {
|
||||
ret = io_wq_max_workers(tctx->io_wq, new_count);
|
||||
@@ -733,7 +733,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
|
||||
if (ctx->submitter_task && ctx->submitter_task != current)
|
||||
return -EEXIST;
|
||||
|
||||
if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
|
||||
if ((ctx->int_flags & IO_RING_F_REG_RESTRICTED) && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
|
||||
opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
|
||||
if (!test_bit(opcode, ctx->restrictions.register_op))
|
||||
return -EACCES;
|
||||
@@ -908,7 +908,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
|
||||
ret = -EINVAL;
|
||||
if (!arg || nr_args != 1)
|
||||
break;
|
||||
ret = io_register_zcrx_ifq(ctx, arg);
|
||||
ret = io_register_zcrx(ctx, arg);
|
||||
break;
|
||||
case IORING_REGISTER_RESIZE_RINGS:
|
||||
ret = -EINVAL;
|
||||
@@ -946,40 +946,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Given an 'fd' value, return the io_uring file associated with it. If 'registered' is
|
||||
* true, then the registered index is used. Otherwise, the normal fd table.
|
||||
* Caller must call fput() on the returned file, unless it's an ERR_PTR.
|
||||
*
* Returns ERR_PTR(-EINVAL) if 'registered' is set but the current task has
* no io_uring task context or 'fd' is not below IO_RINGFD_REG_MAX,
* ERR_PTR(-EBADF) if no file is found for 'fd', and ERR_PTR(-EOPNOTSUPP) if
* io_is_uring_fops() rejects the file that was found.
*/
|
||||
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
|
||||
{
|
||||
struct file *file;
|
||||
|
||||
if (registered) {
|
||||
/*
|
||||
* Ring fd has been registered via IORING_REGISTER_RING_FDS, we
|
||||
* need only dereference our task private array to find it.
|
||||
*/
|
||||
struct io_uring_task *tctx = current->io_uring;
|
||||
|
||||
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
|
||||
return ERR_PTR(-EINVAL);
|
||||
/* range was checked above; sanitize the index before the array access */
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
|
||||
file = tctx->registered_rings[fd];
|
||||
/* take a reference so both branches hand back a file the caller must fput() */
if (file)
|
||||
get_file(file);
|
||||
} else {
|
||||
file = fget(fd);
|
||||
}
|
||||
|
||||
/* empty registered slot, or fget() returned nothing */
if (unlikely(!file))
|
||||
return ERR_PTR(-EBADF);
|
||||
if (io_is_uring_fops(file))
|
||||
return file;
|
||||
/* rejected by io_is_uring_fops(): drop the reference taken above */
fput(file);
|
||||
return ERR_PTR(-EOPNOTSUPP);
|
||||
}
|
||||
|
||||
static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
|
||||
{
|
||||
struct io_uring_sqe sqe;
|
||||
@@ -1034,7 +1000,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
|
||||
if (fd == -1)
|
||||
return io_uring_register_blind(opcode, arg, nr_args);
|
||||
|
||||
file = io_uring_register_get_file(fd, use_registered_ring);
|
||||
file = io_uring_ctx_get_file(fd, use_registered_ring);
|
||||
if (IS_ERR(file))
|
||||
return PTR_ERR(file);
|
||||
ctx = file->private_data;
|
||||
@@ -1046,6 +1012,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
|
||||
ctx->buf_table.nr, ret);
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
|
||||
fput(file);
|
||||
if (!use_registered_ring)
|
||||
fput(file);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -4,6 +4,5 @@
|
||||
|
||||
int io_eventfd_unregister(struct io_ring_ctx *ctx);
|
||||
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
|
||||
struct file *io_uring_register_get_file(unsigned int fd, bool registered);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -295,7 +295,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
|
||||
u64 tag = 0;
|
||||
|
||||
uvec = u64_to_user_ptr(user_data);
|
||||
iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
|
||||
iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx));
|
||||
if (IS_ERR(iov)) {
|
||||
err = PTR_ERR(iov);
|
||||
break;
|
||||
@@ -319,7 +319,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
|
||||
i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
|
||||
io_reset_rsrc_node(ctx, &ctx->buf_table, i);
|
||||
ctx->buf_table.nodes[i] = node;
|
||||
if (ctx->compat)
|
||||
if (io_is_compat(ctx))
|
||||
user_data += sizeof(struct compat_iovec);
|
||||
else
|
||||
user_data += sizeof(struct iovec);
|
||||
@@ -883,12 +883,12 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
|
||||
|
||||
if (arg) {
|
||||
uvec = (struct iovec __user *) arg;
|
||||
iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
|
||||
iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx));
|
||||
if (IS_ERR(iov)) {
|
||||
ret = PTR_ERR(iov);
|
||||
break;
|
||||
}
|
||||
if (ctx->compat)
|
||||
if (io_is_compat(ctx))
|
||||
arg += sizeof(struct compat_iovec);
|
||||
else
|
||||
arg += sizeof(struct iovec);
|
||||
@@ -961,7 +961,7 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
|
||||
*/
|
||||
imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq));
|
||||
if (!imu) {
|
||||
kfree(node);
|
||||
io_cache_free(&ctx->node_cache, node);
|
||||
ret = -ENOMEM;
|
||||
goto unlock;
|
||||
}
|
||||
@@ -1273,7 +1273,7 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
|
||||
return -EINVAL;
|
||||
|
||||
registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
|
||||
file = io_uring_register_get_file(buf.src_fd, registered_src);
|
||||
file = io_uring_ctx_get_file(buf.src_fd, registered_src);
|
||||
if (IS_ERR(file))
|
||||
return PTR_ERR(file);
|
||||
|
||||
@@ -1295,7 +1295,8 @@ out:
|
||||
if (src_ctx != ctx)
|
||||
mutex_unlock(&src_ctx->uring_lock);
|
||||
|
||||
fput(file);
|
||||
if (!registered_src)
|
||||
fput(file);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@@ -504,7 +504,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
|
||||
if (!S_ISBLK(mode) && !S_ISREG(mode))
|
||||
return false;
|
||||
if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
|
||||
!(ctx->flags & IORING_SETUP_IOPOLL)))
|
||||
!(req->flags & REQ_F_IOPOLL)))
|
||||
return false;
|
||||
/*
|
||||
* If ref is dying, we might be running poll reap from the exit work.
|
||||
@@ -640,7 +640,7 @@ static inline void io_rw_done(struct io_kiocb *req, ssize_t ret)
|
||||
}
|
||||
}
|
||||
|
||||
if (req->ctx->flags & IORING_SETUP_IOPOLL)
|
||||
if (req->flags & REQ_F_IOPOLL)
|
||||
io_complete_rw_iopoll(&rw->kiocb, ret);
|
||||
else
|
||||
io_complete_rw(&rw->kiocb, ret);
|
||||
@@ -654,7 +654,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
|
||||
|
||||
if (ret >= 0 && req->flags & REQ_F_CUR_POS)
|
||||
req->file->f_pos = rw->kiocb.ki_pos;
|
||||
if (ret >= 0 && !(req->ctx->flags & IORING_SETUP_IOPOLL)) {
|
||||
if (ret >= 0 && !(req->flags & REQ_F_IOPOLL)) {
|
||||
u32 cflags = 0;
|
||||
|
||||
__io_complete_rw_common(req, ret);
|
||||
@@ -876,6 +876,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
|
||||
if (ctx->flags & IORING_SETUP_IOPOLL) {
|
||||
if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
|
||||
return -EOPNOTSUPP;
|
||||
req->flags |= REQ_F_IOPOLL;
|
||||
kiocb->private = NULL;
|
||||
kiocb->ki_flags |= IOCB_HIPRI;
|
||||
req->iopoll_completed = 0;
|
||||
@@ -899,7 +900,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
|
||||
* We have a union of meta fields with wpq used for buffered-io
|
||||
* in io_async_rw, so fail it here.
|
||||
*/
|
||||
if (!(req->file->f_flags & O_DIRECT))
|
||||
if (!(file->f_flags & O_DIRECT))
|
||||
return -EOPNOTSUPP;
|
||||
kiocb->ki_flags |= IOCB_HAS_METADATA;
|
||||
kiocb->private = &io->meta;
|
||||
@@ -961,13 +962,13 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel,
|
||||
if (ret == -EAGAIN) {
|
||||
/* If we can poll, just do that. */
|
||||
if (io_file_can_poll(req))
|
||||
return -EAGAIN;
|
||||
return ret;
|
||||
/* IOPOLL retry should happen for io-wq threads */
|
||||
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
|
||||
goto done;
|
||||
if (!force_nonblock && !(req->flags & REQ_F_IOPOLL))
|
||||
return ret;
|
||||
/* no retry on NONBLOCK nor RWF_NOWAIT */
|
||||
if (req->flags & REQ_F_NOWAIT)
|
||||
goto done;
|
||||
return ret;
|
||||
ret = 0;
|
||||
} else if (ret == -EIOCBQUEUED) {
|
||||
return IOU_ISSUE_SKIP_COMPLETE;
|
||||
@@ -975,7 +976,7 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel,
|
||||
(req->flags & REQ_F_NOWAIT) || !need_complete_io(req) ||
|
||||
(issue_flags & IO_URING_F_MULTISHOT)) {
|
||||
/* read all, failed, already did sync or don't want to retry */
|
||||
goto done;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1018,8 +1019,7 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel,
|
||||
kiocb->ki_flags &= ~IOCB_WAITQ;
|
||||
iov_iter_restore(&io->iter, &io->iter_state);
|
||||
} while (ret > 0);
|
||||
done:
|
||||
/* it's faster to check here than delegate to kfree */
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -1188,7 +1188,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
|
||||
goto done;
|
||||
if (!force_nonblock || ret2 != -EAGAIN) {
|
||||
/* IOPOLL retry should happen for io-wq threads */
|
||||
if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
|
||||
if (ret2 == -EAGAIN && (req->flags & REQ_F_IOPOLL))
|
||||
goto ret_eagain;
|
||||
|
||||
if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
|
||||
|
||||
@@ -458,6 +458,7 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
|
||||
return -EINVAL;
|
||||
}
|
||||
if (ctx->flags & IORING_SETUP_SQPOLL) {
|
||||
struct io_uring_task *tctx;
|
||||
struct task_struct *tsk;
|
||||
struct io_sq_data *sqd;
|
||||
bool attached;
|
||||
@@ -524,8 +525,13 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
|
||||
rcu_assign_pointer(sqd->thread, tsk);
|
||||
mutex_unlock(&sqd->lock);
|
||||
|
||||
ret = 0;
|
||||
get_task_struct(tsk);
|
||||
ret = io_uring_alloc_task_context(tsk, ctx);
|
||||
tctx = io_uring_alloc_task_context(tsk, ctx);
|
||||
if (!IS_ERR(tctx))
|
||||
tsk->io_uring = tctx;
|
||||
else
|
||||
ret = PTR_ERR(tctx);
|
||||
wake_up_new_task(tsk);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
@@ -74,20 +74,20 @@ void __io_uring_free(struct task_struct *tsk)
|
||||
}
|
||||
}
|
||||
|
||||
__cold int io_uring_alloc_task_context(struct task_struct *task,
|
||||
struct io_ring_ctx *ctx)
|
||||
__cold struct io_uring_task *io_uring_alloc_task_context(struct task_struct *task,
|
||||
struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_uring_task *tctx;
|
||||
int ret;
|
||||
|
||||
tctx = kzalloc_obj(*tctx);
|
||||
if (unlikely(!tctx))
|
||||
return -ENOMEM;
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
|
||||
if (unlikely(ret)) {
|
||||
kfree(tctx);
|
||||
return ret;
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
tctx->io_wq = io_init_wq_offload(ctx, task);
|
||||
@@ -95,7 +95,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
|
||||
ret = PTR_ERR(tctx->io_wq);
|
||||
percpu_counter_destroy(&tctx->inflight);
|
||||
kfree(tctx);
|
||||
return ret;
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
tctx->task = task;
|
||||
@@ -103,31 +103,56 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
|
||||
init_waitqueue_head(&tctx->wait);
|
||||
atomic_set(&tctx->in_cancel, 0);
|
||||
atomic_set(&tctx->inflight_tracked, 0);
|
||||
task->io_uring = tctx;
|
||||
init_llist_head(&tctx->task_list);
|
||||
init_task_work(&tctx->task_work, tctx_task_work);
|
||||
return tctx;
|
||||
}
|
||||
|
||||
/*
 * Make sure @tctx has an io_tctx_node for @ctx.
 *
 * If tctx->xa already holds an entry keyed by the ctx pointer, nothing is
 * done. Otherwise a node is allocated, stored in tctx->xa under that key and
 * linked onto ctx->tctx_list while holding ctx->tctx_lock.
 *
 * Returns 0 on success (including when the entry already existed), -ENOMEM
 * if the node cannot be allocated, or the error reported by xa_store().
 */
static int io_tctx_install_node(struct io_ring_ctx *ctx,
|
||||
struct io_uring_task *tctx)
|
||||
{
|
||||
struct io_tctx_node *node;
|
||||
int ret;
|
||||
|
||||
/* already installed for this ctx */
if (xa_load(&tctx->xa, (unsigned long)ctx))
|
||||
return 0;
|
||||
|
||||
node = kmalloc_obj(*node);
|
||||
if (!node)
|
||||
return -ENOMEM;
|
||||
node->ctx = ctx;
|
||||
node->task = current;
|
||||
|
||||
ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
|
||||
node, GFP_KERNEL));
|
||||
/* the node was never published, so it can simply be freed */
if (ret) {
|
||||
kfree(node);
|
||||
return ret;
|
||||
}
|
||||
|
||||
mutex_lock(&ctx->tctx_lock);
|
||||
list_add(&node->ctx_node, &ctx->tctx_list);
|
||||
mutex_unlock(&ctx->tctx_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_uring_task *tctx = current->io_uring;
|
||||
struct io_tctx_node *node;
|
||||
int ret;
|
||||
|
||||
if (unlikely(!tctx)) {
|
||||
ret = io_uring_alloc_task_context(current, ctx);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
tctx = io_uring_alloc_task_context(current, ctx);
|
||||
if (IS_ERR(tctx))
|
||||
return PTR_ERR(tctx);
|
||||
|
||||
tctx = current->io_uring;
|
||||
if (ctx->iowq_limits_set) {
|
||||
if (ctx->int_flags & IO_RING_F_IOWQ_LIMITS_SET) {
|
||||
unsigned int limits[2] = { ctx->iowq_limits[0],
|
||||
ctx->iowq_limits[1], };
|
||||
|
||||
ret = io_wq_max_workers(tctx->io_wq, limits);
|
||||
if (ret)
|
||||
return ret;
|
||||
goto err_free;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -138,25 +163,19 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
|
||||
*/
|
||||
if (tctx->io_wq)
|
||||
io_wq_set_exit_on_idle(tctx->io_wq, false);
|
||||
if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
|
||||
node = kmalloc_obj(*node);
|
||||
if (!node)
|
||||
return -ENOMEM;
|
||||
node->ctx = ctx;
|
||||
node->task = current;
|
||||
|
||||
ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
|
||||
node, GFP_KERNEL));
|
||||
if (ret) {
|
||||
kfree(node);
|
||||
return ret;
|
||||
}
|
||||
|
||||
mutex_lock(&ctx->tctx_lock);
|
||||
list_add(&node->ctx_node, &ctx->tctx_list);
|
||||
mutex_unlock(&ctx->tctx_lock);
|
||||
ret = io_tctx_install_node(ctx, tctx);
|
||||
if (!ret) {
|
||||
current->io_uring = tctx;
|
||||
return 0;
|
||||
}
|
||||
return 0;
|
||||
if (!current->io_uring) {
|
||||
err_free:
|
||||
io_wq_put_and_exit(tctx->io_wq);
|
||||
percpu_counter_destroy(&tctx->inflight);
|
||||
kfree(tctx);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx)
|
||||
|
||||
@@ -6,8 +6,8 @@ struct io_tctx_node {
|
||||
struct io_ring_ctx *ctx;
|
||||
};
|
||||
|
||||
int io_uring_alloc_task_context(struct task_struct *task,
|
||||
struct io_ring_ctx *ctx);
|
||||
struct io_uring_task *io_uring_alloc_task_context(struct task_struct *task,
|
||||
struct io_ring_ctx *ctx);
|
||||
void io_uring_del_tctx_node(unsigned long index);
|
||||
int __io_uring_add_tctx_node(struct io_ring_ctx *ctx);
|
||||
int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx);
|
||||
|
||||
@@ -30,11 +30,30 @@ struct io_timeout_rem {
|
||||
u64 addr;
|
||||
|
||||
/* timeout update */
|
||||
struct timespec64 ts;
|
||||
ktime_t time;
|
||||
u32 flags;
|
||||
bool ltimeout;
|
||||
};
|
||||
|
||||
/*
 * Turn a user-supplied timeout argument into a ktime_t.
 *
 * @time:  where the converted value is stored
 * @arg:   with IORING_TIMEOUT_IMMEDIATE_ARG set in @flags, the value itself,
 *         passed to ns_to_ktime(); otherwise a user pointer from which a
 *         timespec64 is read with get_timespec64()
 * @flags: timeout flags; only IORING_TIMEOUT_IMMEDIATE_ARG is examined here
 *
 * Returns 0 on success, -EFAULT if get_timespec64() fails, or -EINVAL if the
 * resulting time (immediate form) or either timespec field is negative.
 * Note that in the immediate form *@time is written before it is validated.
 */
static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags)
|
||||
{
|
||||
struct timespec64 ts;
|
||||
|
||||
if (flags & IORING_TIMEOUT_IMMEDIATE_ARG) {
|
||||
*time = ns_to_ktime(arg);
|
||||
/* reject values that come out negative once converted to ktime_t */
if (*time < 0)
|
||||
return -EINVAL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (get_timespec64(&ts, u64_to_user_ptr(arg)))
|
||||
return -EFAULT;
|
||||
if (ts.tv_sec < 0 || ts.tv_nsec < 0)
|
||||
return -EINVAL;
|
||||
*time = timespec64_to_ktime(ts);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
|
||||
struct io_kiocb *link);
|
||||
|
||||
@@ -80,7 +99,7 @@ static void io_timeout_complete(struct io_tw_req tw_req, io_tw_token_t tw)
|
||||
/* re-arm timer */
|
||||
raw_spin_lock_irq(&ctx->timeout_lock);
|
||||
list_add(&timeout->list, ctx->timeout_list.prev);
|
||||
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
|
||||
hrtimer_start(&data->timer, data->time, data->mode);
|
||||
raw_spin_unlock_irq(&ctx->timeout_lock);
|
||||
return;
|
||||
}
|
||||
@@ -265,8 +284,8 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
|
||||
|
||||
raw_spin_lock_irqsave(&ctx->timeout_lock, flags);
|
||||
list_del_init(&timeout->list);
|
||||
atomic_set(&req->ctx->cq_timeouts,
|
||||
atomic_read(&req->ctx->cq_timeouts) + 1);
|
||||
atomic_set(&ctx->cq_timeouts,
|
||||
atomic_read(&ctx->cq_timeouts) + 1);
|
||||
raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags);
|
||||
|
||||
if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
|
||||
@@ -395,7 +414,7 @@ static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
|
||||
}
|
||||
|
||||
static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
|
||||
struct timespec64 *ts, enum hrtimer_mode mode)
|
||||
ktime_t ts, enum hrtimer_mode mode)
|
||||
__must_hold(&ctx->timeout_lock)
|
||||
{
|
||||
struct io_timeout_data *io;
|
||||
@@ -417,12 +436,12 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
|
||||
if (hrtimer_try_to_cancel(&io->timer) == -1)
|
||||
return -EALREADY;
|
||||
hrtimer_setup(&io->timer, io_link_timeout_fn, io_timeout_get_clock(io), mode);
|
||||
hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
|
||||
hrtimer_start(&io->timer, ts, mode);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
|
||||
struct timespec64 *ts, enum hrtimer_mode mode)
|
||||
ktime_t time, enum hrtimer_mode mode)
|
||||
__must_hold(&ctx->timeout_lock)
|
||||
{
|
||||
struct io_cancel_data cd = { .ctx = ctx, .data = user_data, };
|
||||
@@ -435,20 +454,23 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
|
||||
|
||||
timeout->off = 0; /* noseq */
|
||||
data = req->async_data;
|
||||
data->ts = *ts;
|
||||
data->time = time;
|
||||
|
||||
list_add_tail(&timeout->list, &ctx->timeout_list);
|
||||
hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), mode);
|
||||
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), mode);
|
||||
hrtimer_start(&data->timer, data->time, mode);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_timeout_rem *tr = io_kiocb_to_cmd(req, struct io_timeout_rem);
|
||||
int ret;
|
||||
|
||||
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
|
||||
return -EINVAL;
|
||||
if (sqe->addr3 || sqe->__pad2[0])
|
||||
return -EINVAL;
|
||||
if (sqe->buf_index || sqe->len || sqe->splice_fd_in)
|
||||
return -EINVAL;
|
||||
|
||||
@@ -460,12 +482,13 @@ int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
return -EINVAL;
|
||||
if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
|
||||
tr->ltimeout = true;
|
||||
if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
|
||||
return -EINVAL;
|
||||
if (get_timespec64(&tr->ts, u64_to_user_ptr(READ_ONCE(sqe->addr2))))
|
||||
return -EFAULT;
|
||||
if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
|
||||
if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK |
|
||||
IORING_TIMEOUT_ABS |
|
||||
IORING_TIMEOUT_IMMEDIATE_ARG))
|
||||
return -EINVAL;
|
||||
ret = io_parse_user_time(&tr->time, READ_ONCE(sqe->addr2), tr->flags);
|
||||
if (ret)
|
||||
return ret;
|
||||
} else if (tr->flags) {
|
||||
/* timeout removal doesn't support flags */
|
||||
return -EINVAL;
|
||||
@@ -500,9 +523,9 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
|
||||
|
||||
raw_spin_lock_irq(&ctx->timeout_lock);
|
||||
if (tr->ltimeout)
|
||||
ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
|
||||
ret = io_linked_timeout_update(ctx, tr->addr, tr->time, mode);
|
||||
else
|
||||
ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
|
||||
ret = io_timeout_update(ctx, tr->addr, tr->time, mode);
|
||||
raw_spin_unlock_irq(&ctx->timeout_lock);
|
||||
}
|
||||
|
||||
@@ -520,7 +543,10 @@ static int __io_timeout_prep(struct io_kiocb *req,
|
||||
struct io_timeout_data *data;
|
||||
unsigned flags;
|
||||
u32 off = READ_ONCE(sqe->off);
|
||||
int ret;
|
||||
|
||||
if (sqe->addr3 || sqe->__pad2[0])
|
||||
return -EINVAL;
|
||||
if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in)
|
||||
return -EINVAL;
|
||||
if (off && is_timeout_link)
|
||||
@@ -528,7 +554,8 @@ static int __io_timeout_prep(struct io_kiocb *req,
|
||||
flags = READ_ONCE(sqe->timeout_flags);
|
||||
if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
|
||||
IORING_TIMEOUT_ETIME_SUCCESS |
|
||||
IORING_TIMEOUT_MULTISHOT))
|
||||
IORING_TIMEOUT_MULTISHOT |
|
||||
IORING_TIMEOUT_IMMEDIATE_ARG))
|
||||
return -EINVAL;
|
||||
/* more than one clock specified is invalid, obviously */
|
||||
if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
|
||||
@@ -539,8 +566,8 @@ static int __io_timeout_prep(struct io_kiocb *req,
|
||||
|
||||
INIT_LIST_HEAD(&timeout->list);
|
||||
timeout->off = off;
|
||||
if (unlikely(off && !req->ctx->off_timeout_used))
|
||||
req->ctx->off_timeout_used = true;
|
||||
if (unlikely(off && !(req->ctx->int_flags & IO_RING_F_OFF_TIMEOUT_USED)))
|
||||
req->ctx->int_flags |= IO_RING_F_OFF_TIMEOUT_USED;
|
||||
/*
|
||||
* for multishot reqs w/ fixed nr of repeats, repeats tracks the
|
||||
* remaining nr
|
||||
@@ -557,11 +584,9 @@ static int __io_timeout_prep(struct io_kiocb *req,
|
||||
data->req = req;
|
||||
data->flags = flags;
|
||||
|
||||
if (get_timespec64(&data->ts, u64_to_user_ptr(READ_ONCE(sqe->addr))))
|
||||
return -EFAULT;
|
||||
|
||||
if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
|
||||
return -EINVAL;
|
||||
ret = io_parse_user_time(&data->time, READ_ONCE(sqe->addr), flags);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
data->mode = io_translate_timeout_mode(flags);
|
||||
|
||||
@@ -637,7 +662,7 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
|
||||
}
|
||||
add:
|
||||
list_add(&timeout->list, entry);
|
||||
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
|
||||
hrtimer_start(&data->timer, data->time, data->mode);
|
||||
raw_spin_unlock_irq(&ctx->timeout_lock);
|
||||
return IOU_ISSUE_SKIP_COMPLETE;
|
||||
}
|
||||
@@ -655,8 +680,7 @@ void io_queue_linked_timeout(struct io_kiocb *req)
|
||||
if (timeout->head) {
|
||||
struct io_timeout_data *data = req->async_data;
|
||||
|
||||
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
|
||||
data->mode);
|
||||
hrtimer_start(&data->timer, data->time, data->mode);
|
||||
list_add_tail(&timeout->list, &ctx->ltimeout_list);
|
||||
}
|
||||
raw_spin_unlock_irq(&ctx->timeout_lock);
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
struct io_timeout_data {
|
||||
struct io_kiocb *req;
|
||||
struct hrtimer timer;
|
||||
struct timespec64 ts;
|
||||
ktime_t time;
|
||||
enum hrtimer_mode mode;
|
||||
u32 flags;
|
||||
};
|
||||
|
||||
@@ -222,7 +222,7 @@ void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
|
||||
|
||||
if (!head) {
|
||||
io_ctx_mark_taskrun(ctx);
|
||||
if (ctx->has_evfd)
|
||||
if (data_race(ctx->int_flags) & IO_RING_F_HAS_EVFD)
|
||||
io_eventfd_signal(ctx, false);
|
||||
}
|
||||
|
||||
|
||||
@@ -110,7 +110,7 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
|
||||
* because iopoll completion data overlaps with the hash_node used
|
||||
* for tracking.
|
||||
*/
|
||||
if (ctx->flags & IORING_SETUP_IOPOLL)
|
||||
if (req->flags & REQ_F_IOPOLL)
|
||||
return;
|
||||
|
||||
if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) {
|
||||
@@ -167,7 +167,7 @@ void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2,
|
||||
io_req_set_cqe32_extra(req, res2, 0);
|
||||
}
|
||||
io_req_uring_cleanup(req, issue_flags);
|
||||
if (req->ctx->flags & IORING_SETUP_IOPOLL) {
|
||||
if (req->flags & REQ_F_IOPOLL) {
|
||||
/* order with io_iopoll_req_issued() checking ->iopoll_completed */
|
||||
smp_store_release(&req->iopoll_completed, 1);
|
||||
} else if (issue_flags & IO_URING_F_COMPLETE_DEFER) {
|
||||
@@ -257,9 +257,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
|
||||
issue_flags |= IO_URING_F_CQE32;
|
||||
if (io_is_compat(ctx))
|
||||
issue_flags |= IO_URING_F_COMPAT;
|
||||
if (ctx->flags & IORING_SETUP_IOPOLL) {
|
||||
if (!file->f_op->uring_cmd_iopoll)
|
||||
return -EOPNOTSUPP;
|
||||
if (ctx->flags & IORING_SETUP_IOPOLL && file->f_op->uring_cmd_iopoll) {
|
||||
req->flags |= REQ_F_IOPOLL;
|
||||
issue_flags |= IO_URING_F_IOPOLL;
|
||||
req->iopoll_completed = 0;
|
||||
if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
|
||||
|
||||
@@ -25,6 +25,7 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
|
||||
struct ext_arg *ext_arg);
|
||||
int io_run_task_work_sig(struct io_ring_ctx *ctx);
|
||||
void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx);
|
||||
void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx);
|
||||
|
||||
static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
|
||||
{
|
||||
|
||||
392
io_uring/zcrx.c
392
io_uring/zcrx.c
@@ -63,7 +63,7 @@ static int io_area_max_shift(struct io_zcrx_mem *mem)
|
||||
unsigned i;
|
||||
|
||||
for_each_sgtable_dma_sg(sgt, sg, i)
|
||||
shift = min(shift, __ffs(sg->length));
|
||||
shift = min(shift, __ffs(sg_dma_len(sg)));
|
||||
return shift;
|
||||
}
|
||||
|
||||
@@ -127,10 +127,10 @@ static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
|
||||
int dmabuf_fd = area_reg->dmabuf_fd;
|
||||
int i, ret;
|
||||
|
||||
if (!ifq->dev)
|
||||
return -EINVAL;
|
||||
if (off)
|
||||
return -EINVAL;
|
||||
if (WARN_ON_ONCE(!ifq->dev))
|
||||
return -EFAULT;
|
||||
if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
|
||||
return -EINVAL;
|
||||
|
||||
@@ -194,6 +194,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
|
||||
{
|
||||
struct page **pages;
|
||||
int nr_pages, ret;
|
||||
bool mapped = false;
|
||||
|
||||
if (area_reg->dmabuf_fd)
|
||||
return -EINVAL;
|
||||
@@ -207,22 +208,37 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
|
||||
ret = sg_alloc_table_from_pages(&mem->page_sg_table, pages, nr_pages,
|
||||
0, (unsigned long)nr_pages << PAGE_SHIFT,
|
||||
GFP_KERNEL_ACCOUNT);
|
||||
if (ret) {
|
||||
unpin_user_pages(pages, nr_pages);
|
||||
kvfree(pages);
|
||||
return ret;
|
||||
if (ret)
|
||||
goto out_err;
|
||||
|
||||
if (ifq->dev) {
|
||||
ret = dma_map_sgtable(ifq->dev, &mem->page_sg_table,
|
||||
DMA_FROM_DEVICE, IO_DMA_ATTR);
|
||||
if (ret < 0)
|
||||
goto out_err;
|
||||
mapped = true;
|
||||
}
|
||||
|
||||
mem->account_pages = io_count_account_pages(pages, nr_pages);
|
||||
ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages);
|
||||
if (ret < 0)
|
||||
if (ret < 0) {
|
||||
mem->account_pages = 0;
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
mem->sgt = &mem->page_sg_table;
|
||||
mem->pages = pages;
|
||||
mem->nr_folios = nr_pages;
|
||||
mem->size = area_reg->len;
|
||||
return ret;
|
||||
out_err:
|
||||
if (mapped)
|
||||
dma_unmap_sgtable(ifq->dev, &mem->page_sg_table,
|
||||
DMA_FROM_DEVICE, IO_DMA_ATTR);
|
||||
sg_free_table(&mem->page_sg_table);
|
||||
unpin_user_pages(pages, nr_pages);
|
||||
kvfree(pages);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void io_release_area_mem(struct io_zcrx_mem *mem)
|
||||
@@ -273,8 +289,10 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
|
||||
return;
|
||||
area->is_mapped = false;
|
||||
|
||||
for (i = 0; i < area->nia.num_niovs; i++)
|
||||
net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
|
||||
if (area->nia.niovs) {
|
||||
for (i = 0; i < area->nia.num_niovs; i++)
|
||||
net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
|
||||
}
|
||||
|
||||
if (area->mem.is_dmabuf) {
|
||||
io_release_dmabuf(&area->mem);
|
||||
@@ -284,45 +302,23 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Map @area for DMA unless area->is_mapped is already set.
 *
 * ifq->pp_lock is taken via guard(mutex). For memory that is not a dmabuf the
 * area's page_sg_table is mapped with dma_map_sgtable() first; then
 * io_populate_area_dma() is called for both kinds of memory. If that call
 * fails, the sg table mapping made here is undone again.
 *
 * Returns 0 on success (or if the area was already mapped), otherwise the
 * error from dma_map_sgtable() or io_populate_area_dma().
 */
static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
|
||||
{
|
||||
int ret;
|
||||
|
||||
guard(mutex)(&ifq->pp_lock);
|
||||
if (area->is_mapped)
|
||||
return 0;
|
||||
|
||||
if (!area->mem.is_dmabuf) {
|
||||
ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table,
|
||||
DMA_FROM_DEVICE, IO_DMA_ATTR);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = io_populate_area_dma(ifq, area);
|
||||
/* only undo the mapping this function created (non-dmabuf case) */
if (ret && !area->mem.is_dmabuf)
|
||||
dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table,
|
||||
DMA_FROM_DEVICE, IO_DMA_ATTR);
|
||||
if (ret == 0)
|
||||
area->is_mapped = true;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void io_zcrx_sync_for_device(struct page_pool *pool,
|
||||
struct net_iov *niov)
|
||||
static void zcrx_sync_for_device(struct page_pool *pp, struct io_zcrx_ifq *zcrx,
|
||||
netmem_ref *netmems, unsigned nr)
|
||||
{
|
||||
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
|
||||
struct device *dev = pp->p.dev;
|
||||
unsigned i, niov_size;
|
||||
dma_addr_t dma_addr;
|
||||
|
||||
unsigned niov_size;
|
||||
|
||||
if (!dma_dev_need_sync(pool->p.dev))
|
||||
if (!dma_dev_need_sync(dev))
|
||||
return;
|
||||
niov_size = 1U << zcrx->niov_shift;
|
||||
|
||||
niov_size = 1U << io_pp_to_ifq(pool)->niov_shift;
|
||||
dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
|
||||
__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
|
||||
niov_size, pool->p.dma_dir);
|
||||
for (i = 0; i < nr; i++) {
|
||||
dma_addr = page_pool_get_dma_addr_netmem(netmems[i]);
|
||||
__dma_sync_single_for_device(dev, dma_addr + pp->p.offset,
|
||||
niov_size, pp->p.dma_dir);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -390,24 +386,24 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
|
||||
return -EINVAL;
|
||||
|
||||
mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
|
||||
mmap_offset += id << IORING_OFF_PBUF_SHIFT;
|
||||
mmap_offset += (u64)id << IORING_OFF_ZCRX_SHIFT;
|
||||
|
||||
ret = io_create_region(ctx, &ifq->region, rd, mmap_offset);
|
||||
ret = io_create_region(ctx, &ifq->rq_region, rd, mmap_offset);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
ptr = io_region_get_ptr(&ifq->region);
|
||||
ifq->rq_ring = (struct io_uring *)ptr;
|
||||
ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
|
||||
ptr = io_region_get_ptr(&ifq->rq_region);
|
||||
ifq->rq.ring = (struct io_uring *)ptr;
|
||||
ifq->rq.rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
|
||||
{
|
||||
io_free_region(ifq->user, &ifq->region);
|
||||
ifq->rq_ring = NULL;
|
||||
ifq->rqes = NULL;
|
||||
io_free_region(ifq->user, &ifq->rq_region);
|
||||
ifq->rq.ring = NULL;
|
||||
ifq->rq.rqes = NULL;
|
||||
}
|
||||
|
||||
static void io_zcrx_free_area(struct io_zcrx_ifq *ifq,
|
||||
@@ -429,8 +425,13 @@ static void io_zcrx_free_area(struct io_zcrx_ifq *ifq,
|
||||
static int io_zcrx_append_area(struct io_zcrx_ifq *ifq,
|
||||
struct io_zcrx_area *area)
|
||||
{
|
||||
if (ifq->area)
|
||||
bool kern_readable = !area->mem.is_dmabuf;
|
||||
|
||||
if (WARN_ON_ONCE(ifq->area))
|
||||
return -EINVAL;
|
||||
if (WARN_ON_ONCE(ifq->kern_readable != kern_readable))
|
||||
return -EINVAL;
|
||||
|
||||
ifq->area = area;
|
||||
return 0;
|
||||
}
|
||||
@@ -450,6 +451,8 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
|
||||
return -EINVAL;
|
||||
buf_size_shift = ilog2(reg->rx_buf_len);
|
||||
}
|
||||
if (!ifq->dev && buf_size_shift != PAGE_SHIFT)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
ret = -ENOMEM;
|
||||
area = kzalloc_obj(*area);
|
||||
@@ -460,8 +463,10 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
|
||||
ret = io_import_area(ifq, &area->mem, area_reg);
|
||||
if (ret)
|
||||
goto err;
|
||||
if (ifq->dev)
|
||||
area->is_mapped = true;
|
||||
|
||||
if (buf_size_shift > io_area_max_shift(&area->mem)) {
|
||||
if (ifq->dev && buf_size_shift > io_area_max_shift(&area->mem)) {
|
||||
ret = -ERANGE;
|
||||
goto err;
|
||||
}
|
||||
@@ -495,6 +500,12 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
|
||||
niov->type = NET_IOV_IOURING;
|
||||
}
|
||||
|
||||
if (ifq->dev) {
|
||||
ret = io_populate_area_dma(ifq, area);
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
area->free_count = nr_iovs;
|
||||
/* we're only supporting one area per ifq for now */
|
||||
area->area_id = 0;
|
||||
@@ -519,7 +530,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
|
||||
return NULL;
|
||||
|
||||
ifq->if_rxq = -1;
|
||||
spin_lock_init(&ifq->rq_lock);
|
||||
spin_lock_init(&ifq->rq.lock);
|
||||
mutex_init(&ifq->pp_lock);
|
||||
refcount_set(&ifq->refs, 1);
|
||||
refcount_set(&ifq->user_refs, 1);
|
||||
@@ -586,9 +597,21 @@ static void io_zcrx_return_niov_freelist(struct net_iov *niov)
|
||||
{
|
||||
struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
|
||||
|
||||
spin_lock_bh(&area->freelist_lock);
|
||||
guard(spinlock_bh)(&area->freelist_lock);
|
||||
area->freelist[area->free_count++] = net_iov_idx(niov);
|
||||
spin_unlock_bh(&area->freelist_lock);
|
||||
}
|
||||
|
||||
static struct net_iov *zcrx_get_free_niov(struct io_zcrx_area *area)
|
||||
{
|
||||
unsigned niov_idx;
|
||||
|
||||
lockdep_assert_held(&area->freelist_lock);
|
||||
|
||||
if (unlikely(!area->free_count))
|
||||
return NULL;
|
||||
|
||||
niov_idx = area->freelist[--area->free_count];
|
||||
return &area->nia.niovs[niov_idx];
|
||||
}
|
||||
|
||||
static void io_zcrx_return_niov(struct net_iov *niov)
|
||||
@@ -624,12 +647,17 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
|
||||
}
|
||||
}
|
||||
|
||||
static void zcrx_unregister(struct io_zcrx_ifq *ifq)
|
||||
static void zcrx_unregister_user(struct io_zcrx_ifq *ifq)
|
||||
{
|
||||
if (refcount_dec_and_test(&ifq->user_refs)) {
|
||||
io_close_queue(ifq);
|
||||
io_zcrx_scrub(ifq);
|
||||
}
|
||||
}
|
||||
|
||||
static void zcrx_unregister(struct io_zcrx_ifq *ifq)
|
||||
{
|
||||
zcrx_unregister_user(ifq);
|
||||
io_put_zcrx_ifq(ifq);
|
||||
}
|
||||
|
||||
@@ -640,7 +668,7 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
|
||||
|
||||
lockdep_assert_held(&ctx->mmap_lock);
|
||||
|
||||
return ifq ? &ifq->region : NULL;
|
||||
return ifq ? &ifq->rq_region : NULL;
|
||||
}
|
||||
|
||||
static int zcrx_box_release(struct inode *inode, struct file *file)
|
||||
@@ -751,10 +779,50 @@ err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
|
||||
struct io_uring_zcrx_ifq_reg __user *arg)
|
||||
static int zcrx_register_netdev(struct io_zcrx_ifq *ifq,
|
||||
struct io_uring_zcrx_ifq_reg *reg,
|
||||
struct io_uring_zcrx_area_reg *area)
|
||||
{
|
||||
struct pp_memory_provider_params mp_param = {};
|
||||
unsigned if_rxq = reg->if_rxq;
|
||||
int ret;
|
||||
|
||||
ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns,
|
||||
reg->if_idx);
|
||||
if (!ifq->netdev)
|
||||
return -ENODEV;
|
||||
|
||||
netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL);
|
||||
|
||||
ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, if_rxq);
|
||||
if (!ifq->dev) {
|
||||
ret = -EOPNOTSUPP;
|
||||
goto netdev_put_unlock;
|
||||
}
|
||||
get_device(ifq->dev);
|
||||
|
||||
ret = io_zcrx_create_area(ifq, area, reg);
|
||||
if (ret)
|
||||
goto netdev_put_unlock;
|
||||
|
||||
if (reg->rx_buf_len)
|
||||
mp_param.rx_page_size = 1U << ifq->niov_shift;
|
||||
mp_param.mp_ops = &io_uring_pp_zc_ops;
|
||||
mp_param.mp_priv = ifq;
|
||||
ret = __net_mp_open_rxq(ifq->netdev, if_rxq, &mp_param, NULL);
|
||||
if (ret)
|
||||
goto netdev_put_unlock;
|
||||
|
||||
ifq->if_rxq = if_rxq;
|
||||
ret = 0;
|
||||
netdev_put_unlock:
|
||||
netdev_unlock(ifq->netdev);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int io_register_zcrx(struct io_ring_ctx *ctx,
|
||||
struct io_uring_zcrx_ifq_reg __user *arg)
|
||||
{
|
||||
struct io_uring_zcrx_area_reg area;
|
||||
struct io_uring_zcrx_ifq_reg reg;
|
||||
struct io_uring_region_desc rd;
|
||||
@@ -778,11 +846,15 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
|
||||
return -EFAULT;
|
||||
if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) || reg.zcrx_id)
|
||||
return -EINVAL;
|
||||
if (reg.flags & ~ZCRX_SUPPORTED_REG_FLAGS)
|
||||
return -EINVAL;
|
||||
if (reg.flags & ZCRX_REG_IMPORT)
|
||||
return import_zcrx(ctx, arg, &reg);
|
||||
if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
|
||||
return -EFAULT;
|
||||
if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
|
||||
if (reg.if_rxq == -1 || !reg.rq_entries)
|
||||
return -EINVAL;
|
||||
if ((reg.if_rxq || reg.if_idx) && (reg.flags & ZCRX_REG_NODEV))
|
||||
return -EINVAL;
|
||||
if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
|
||||
if (!(ctx->flags & IORING_SETUP_CLAMP))
|
||||
@@ -806,7 +878,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
|
||||
mmgrab(ctx->mm_account);
|
||||
ifq->mm_account = ctx->mm_account;
|
||||
}
|
||||
ifq->rq_entries = reg.rq_entries;
|
||||
ifq->rq.nr_entries = reg.rq_entries;
|
||||
|
||||
scoped_guard(mutex, &ctx->mmap_lock) {
|
||||
/* preallocate id */
|
||||
@@ -819,33 +891,17 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns, reg.if_idx);
|
||||
if (!ifq->netdev) {
|
||||
ret = -ENODEV;
|
||||
goto err;
|
||||
ifq->kern_readable = !(area.flags & IORING_ZCRX_AREA_DMABUF);
|
||||
|
||||
if (!(reg.flags & ZCRX_REG_NODEV)) {
|
||||
ret = zcrx_register_netdev(ifq, &reg, &area);
|
||||
if (ret)
|
||||
goto err;
|
||||
} else {
|
||||
ret = io_zcrx_create_area(ifq, &area, &reg);
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL);
|
||||
|
||||
ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq);
|
||||
if (!ifq->dev) {
|
||||
ret = -EOPNOTSUPP;
|
||||
goto netdev_put_unlock;
|
||||
}
|
||||
get_device(ifq->dev);
|
||||
|
||||
ret = io_zcrx_create_area(ifq, &area, &reg);
|
||||
if (ret)
|
||||
goto netdev_put_unlock;
|
||||
|
||||
if (reg.rx_buf_len)
|
||||
mp_param.rx_page_size = 1U << ifq->niov_shift;
|
||||
mp_param.mp_ops = &io_uring_pp_zc_ops;
|
||||
mp_param.mp_priv = ifq;
|
||||
ret = __net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL);
|
||||
if (ret)
|
||||
goto netdev_put_unlock;
|
||||
netdev_unlock(ifq->netdev);
|
||||
ifq->if_rxq = reg.if_rxq;
|
||||
|
||||
reg.zcrx_id = id;
|
||||
|
||||
@@ -865,8 +921,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
|
||||
goto err;
|
||||
}
|
||||
return 0;
|
||||
netdev_put_unlock:
|
||||
netdev_unlock(ifq->netdev);
|
||||
err:
|
||||
scoped_guard(mutex, &ctx->mmap_lock)
|
||||
xa_erase(&ctx->zcrx_ctxs, id);
|
||||
@@ -875,17 +929,37 @@ ifq_free:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
|
||||
static inline bool is_zcrx_entry_marked(struct io_ring_ctx *ctx, unsigned long id)
|
||||
{
|
||||
unsigned niov_idx;
|
||||
|
||||
lockdep_assert_held(&area->freelist_lock);
|
||||
|
||||
niov_idx = area->freelist[--area->free_count];
|
||||
return &area->nia.niovs[niov_idx];
|
||||
return xa_get_mark(&ctx->zcrx_ctxs, id, XA_MARK_1);
|
||||
}
|
||||
|
||||
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
|
||||
static inline void set_zcrx_entry_mark(struct io_ring_ctx *ctx, unsigned long id)
|
||||
{
|
||||
xa_set_mark(&ctx->zcrx_ctxs, id, XA_MARK_1);
|
||||
}
|
||||
|
||||
void io_terminate_zcrx(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_zcrx_ifq *ifq;
|
||||
unsigned long id = 0;
|
||||
|
||||
lockdep_assert_held(&ctx->uring_lock);
|
||||
|
||||
while (1) {
|
||||
scoped_guard(mutex, &ctx->mmap_lock)
|
||||
ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
|
||||
if (!ifq)
|
||||
break;
|
||||
if (WARN_ON_ONCE(is_zcrx_entry_marked(ctx, id)))
|
||||
break;
|
||||
set_zcrx_entry_mark(ctx, id);
|
||||
id++;
|
||||
zcrx_unregister_user(ifq);
|
||||
}
|
||||
}
|
||||
|
||||
void io_unregister_zcrx(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_zcrx_ifq *ifq;
|
||||
|
||||
@@ -896,31 +970,35 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
|
||||
unsigned long id = 0;
|
||||
|
||||
ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
|
||||
if (ifq)
|
||||
if (ifq) {
|
||||
if (WARN_ON_ONCE(!is_zcrx_entry_marked(ctx, id))) {
|
||||
ifq = NULL;
|
||||
break;
|
||||
}
|
||||
xa_erase(&ctx->zcrx_ctxs, id);
|
||||
}
|
||||
}
|
||||
if (!ifq)
|
||||
break;
|
||||
zcrx_unregister(ifq);
|
||||
io_put_zcrx_ifq(ifq);
|
||||
}
|
||||
|
||||
xa_destroy(&ctx->zcrx_ctxs);
|
||||
}
|
||||
|
||||
static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
|
||||
static inline u32 zcrx_rq_entries(struct zcrx_rq *rq)
|
||||
{
|
||||
u32 entries;
|
||||
|
||||
entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
|
||||
return min(entries, ifq->rq_entries);
|
||||
entries = smp_load_acquire(&rq->ring->tail) - rq->cached_head;
|
||||
return min(entries, rq->nr_entries);
|
||||
}
|
||||
|
||||
static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
|
||||
unsigned mask)
|
||||
static struct io_uring_zcrx_rqe *zcrx_next_rqe(struct zcrx_rq *rq, unsigned mask)
|
||||
{
|
||||
unsigned int idx = ifq->cached_rq_head++ & mask;
|
||||
unsigned int idx = rq->cached_head++ & mask;
|
||||
|
||||
return &ifq->rqes[idx];
|
||||
return &rq->rqes[idx];
|
||||
}
|
||||
|
||||
static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe,
|
||||
@@ -946,21 +1024,24 @@ static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe,
|
||||
return true;
|
||||
}
|
||||
|
||||
static void io_zcrx_ring_refill(struct page_pool *pp,
|
||||
struct io_zcrx_ifq *ifq)
|
||||
static unsigned io_zcrx_ring_refill(struct page_pool *pp,
|
||||
struct io_zcrx_ifq *ifq,
|
||||
netmem_ref *netmems, unsigned to_alloc)
|
||||
{
|
||||
unsigned int mask = ifq->rq_entries - 1;
|
||||
struct zcrx_rq *rq = &ifq->rq;
|
||||
unsigned int mask = rq->nr_entries - 1;
|
||||
unsigned int entries;
|
||||
unsigned allocated = 0;
|
||||
|
||||
guard(spinlock_bh)(&ifq->rq_lock);
|
||||
guard(spinlock_bh)(&rq->lock);
|
||||
|
||||
entries = io_zcrx_rqring_entries(ifq);
|
||||
entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL);
|
||||
entries = zcrx_rq_entries(rq);
|
||||
entries = min_t(unsigned, entries, to_alloc);
|
||||
if (unlikely(!entries))
|
||||
return;
|
||||
return 0;
|
||||
|
||||
do {
|
||||
struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
|
||||
struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask);
|
||||
struct net_iov *niov;
|
||||
netmem_ref netmem;
|
||||
|
||||
@@ -978,46 +1059,56 @@ static void io_zcrx_ring_refill(struct page_pool *pp,
|
||||
continue;
|
||||
}
|
||||
|
||||
io_zcrx_sync_for_device(pp, niov);
|
||||
net_mp_netmem_place_in_cache(pp, netmem);
|
||||
netmems[allocated] = netmem;
|
||||
allocated++;
|
||||
} while (--entries);
|
||||
|
||||
smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
|
||||
smp_store_release(&rq->ring->head, rq->cached_head);
|
||||
return allocated;
|
||||
}
|
||||
|
||||
static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
|
||||
static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq,
|
||||
netmem_ref *netmems, unsigned to_alloc)
|
||||
{
|
||||
struct io_zcrx_area *area = ifq->area;
|
||||
unsigned allocated = 0;
|
||||
|
||||
spin_lock_bh(&area->freelist_lock);
|
||||
while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
|
||||
struct net_iov *niov = __io_zcrx_get_free_niov(area);
|
||||
netmem_ref netmem = net_iov_to_netmem(niov);
|
||||
guard(spinlock_bh)(&area->freelist_lock);
|
||||
|
||||
for (allocated = 0; allocated < to_alloc; allocated++) {
|
||||
struct net_iov *niov = zcrx_get_free_niov(area);
|
||||
|
||||
if (!niov)
|
||||
break;
|
||||
net_mp_niov_set_page_pool(pp, niov);
|
||||
io_zcrx_sync_for_device(pp, niov);
|
||||
net_mp_netmem_place_in_cache(pp, netmem);
|
||||
netmems[allocated] = net_iov_to_netmem(niov);
|
||||
}
|
||||
spin_unlock_bh(&area->freelist_lock);
|
||||
return allocated;
|
||||
}
|
||||
|
||||
static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
|
||||
{
|
||||
struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
|
||||
netmem_ref *netmems = pp->alloc.cache;
|
||||
unsigned to_alloc = PP_ALLOC_CACHE_REFILL;
|
||||
unsigned allocated;
|
||||
|
||||
/* pp should already be ensuring that */
|
||||
if (unlikely(pp->alloc.count))
|
||||
if (WARN_ON_ONCE(pp->alloc.count))
|
||||
return 0;
|
||||
|
||||
allocated = io_zcrx_ring_refill(pp, ifq, netmems, to_alloc);
|
||||
if (likely(allocated))
|
||||
goto out_return;
|
||||
|
||||
io_zcrx_ring_refill(pp, ifq);
|
||||
if (likely(pp->alloc.count))
|
||||
goto out_return;
|
||||
|
||||
io_zcrx_refill_slow(pp, ifq);
|
||||
if (!pp->alloc.count)
|
||||
allocated = io_zcrx_refill_slow(pp, ifq, netmems, to_alloc);
|
||||
if (!allocated)
|
||||
return 0;
|
||||
out_return:
|
||||
return pp->alloc.cache[--pp->alloc.count];
|
||||
zcrx_sync_for_device(pp, ifq, netmems, allocated);
|
||||
allocated--;
|
||||
pp->alloc.count += allocated;
|
||||
return netmems[allocated];
|
||||
}
|
||||
|
||||
static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
|
||||
@@ -1036,7 +1127,6 @@ static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
|
||||
static int io_pp_zc_init(struct page_pool *pp)
|
||||
{
|
||||
struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
|
||||
int ret;
|
||||
|
||||
if (WARN_ON_ONCE(!ifq))
|
||||
return -EINVAL;
|
||||
@@ -1049,10 +1139,6 @@ static int io_pp_zc_init(struct page_pool *pp)
|
||||
if (pp->p.dma_dir != DMA_FROM_DEVICE)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
ret = io_zcrx_map_area(ifq, ifq->area);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
refcount_inc(&ifq->refs);
|
||||
return 0;
|
||||
}
|
||||
@@ -1100,14 +1186,14 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = {
|
||||
};
|
||||
|
||||
static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr,
|
||||
struct io_zcrx_ifq *zcrx)
|
||||
struct io_zcrx_ifq *zcrx, struct zcrx_rq *rq)
|
||||
{
|
||||
unsigned int mask = zcrx->rq_entries - 1;
|
||||
unsigned int mask = rq->nr_entries - 1;
|
||||
unsigned int i;
|
||||
|
||||
nr = min(nr, io_zcrx_rqring_entries(zcrx));
|
||||
nr = min(nr, zcrx_rq_entries(rq));
|
||||
for (i = 0; i < nr; i++) {
|
||||
struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask);
|
||||
struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask);
|
||||
struct net_iov *niov;
|
||||
|
||||
if (!io_parse_rqe(rqe, zcrx, &niov))
|
||||
@@ -1115,7 +1201,7 @@ static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr,
|
||||
netmem_array[i] = net_iov_to_netmem(niov);
|
||||
}
|
||||
|
||||
smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head);
|
||||
smp_store_release(&rq->ring->head, rq->cached_head);
|
||||
return i;
|
||||
}
|
||||
|
||||
@@ -1149,8 +1235,10 @@ static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
|
||||
return -EINVAL;
|
||||
|
||||
do {
|
||||
scoped_guard(spinlock_bh, &zcrx->rq_lock) {
|
||||
nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx);
|
||||
struct zcrx_rq *rq = &zcrx->rq;
|
||||
|
||||
scoped_guard(spinlock_bh, &rq->lock) {
|
||||
nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx, rq);
|
||||
zcrx_return_buffers(netmems, nr);
|
||||
}
|
||||
|
||||
@@ -1159,7 +1247,7 @@ static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
|
||||
if (fatal_signal_pending(current))
|
||||
break;
|
||||
cond_resched();
|
||||
} while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq_entries);
|
||||
} while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq.nr_entries);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1169,6 +1257,8 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
|
||||
struct zcrx_ctrl ctrl;
|
||||
struct io_zcrx_ifq *zcrx;
|
||||
|
||||
BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_flush));
|
||||
|
||||
if (nr_args)
|
||||
return -EINVAL;
|
||||
if (copy_from_user(&ctrl, arg, sizeof(ctrl)))
|
||||
@@ -1221,13 +1311,11 @@ static struct net_iov *io_alloc_fallback_niov(struct io_zcrx_ifq *ifq)
|
||||
struct io_zcrx_area *area = ifq->area;
|
||||
struct net_iov *niov = NULL;
|
||||
|
||||
if (area->mem.is_dmabuf)
|
||||
if (!ifq->kern_readable)
|
||||
return NULL;
|
||||
|
||||
spin_lock_bh(&area->freelist_lock);
|
||||
if (area->free_count)
|
||||
niov = __io_zcrx_get_free_niov(area);
|
||||
spin_unlock_bh(&area->freelist_lock);
|
||||
scoped_guard(spinlock_bh, &area->freelist_lock)
|
||||
niov = zcrx_get_free_niov(area);
|
||||
|
||||
if (niov)
|
||||
page_pool_fragment_netmem(net_iov_to_netmem(niov), 1);
|
||||
|
||||
@@ -8,6 +8,9 @@
|
||||
#include <net/page_pool/types.h>
|
||||
#include <net/net_trackers.h>
|
||||
|
||||
#define ZCRX_SUPPORTED_REG_FLAGS (ZCRX_REG_IMPORT | ZCRX_REG_NODEV)
|
||||
#define ZCRX_FEATURES (ZCRX_FEATURE_RX_PAGE_SIZE)
|
||||
|
||||
struct io_zcrx_mem {
|
||||
unsigned long size;
|
||||
bool is_dmabuf;
|
||||
@@ -38,17 +41,22 @@ struct io_zcrx_area {
|
||||
struct io_zcrx_mem mem;
|
||||
};
|
||||
|
||||
struct zcrx_rq {
|
||||
spinlock_t lock;
|
||||
struct io_uring *ring;
|
||||
struct io_uring_zcrx_rqe *rqes;
|
||||
u32 cached_head;
|
||||
u32 nr_entries;
|
||||
};
|
||||
|
||||
struct io_zcrx_ifq {
|
||||
struct io_zcrx_area *area;
|
||||
unsigned niov_shift;
|
||||
struct user_struct *user;
|
||||
struct mm_struct *mm_account;
|
||||
bool kern_readable;
|
||||
|
||||
spinlock_t rq_lock ____cacheline_aligned_in_smp;
|
||||
struct io_uring *rq_ring;
|
||||
struct io_uring_zcrx_rqe *rqes;
|
||||
u32 cached_rq_head;
|
||||
u32 rq_entries;
|
||||
struct zcrx_rq rq ____cacheline_aligned_in_smp;
|
||||
|
||||
u32 if_rxq;
|
||||
struct device *dev;
|
||||
@@ -63,26 +71,30 @@ struct io_zcrx_ifq {
|
||||
* net stack.
|
||||
*/
|
||||
struct mutex pp_lock;
|
||||
struct io_mapped_region region;
|
||||
struct io_mapped_region rq_region;
|
||||
};
|
||||
|
||||
#if defined(CONFIG_IO_URING_ZCRX)
|
||||
int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg);
|
||||
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
|
||||
int io_register_zcrx(struct io_ring_ctx *ctx,
|
||||
struct io_uring_zcrx_ifq_reg __user *arg);
|
||||
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
|
||||
void io_unregister_zcrx(struct io_ring_ctx *ctx);
|
||||
void io_terminate_zcrx(struct io_ring_ctx *ctx);
|
||||
int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
|
||||
struct socket *sock, unsigned int flags,
|
||||
unsigned issue_flags, unsigned int *len);
|
||||
struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
|
||||
unsigned int id);
|
||||
#else
|
||||
static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
|
||||
struct io_uring_zcrx_ifq_reg __user *arg)
|
||||
static inline int io_register_zcrx(struct io_ring_ctx *ctx,
|
||||
struct io_uring_zcrx_ifq_reg __user *arg)
|
||||
{
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
|
||||
static inline void io_unregister_zcrx(struct io_ring_ctx *ctx)
|
||||
{
|
||||
}
|
||||
static inline void io_terminate_zcrx(struct io_ring_ctx *ctx)
|
||||
{
|
||||
}
|
||||
static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
|
||||
|
||||
Reference in New Issue
Block a user