Files
linux/tools/testing/selftests/ublk/kublk.h
Linus Torvalds 0c00ed308d Merge tag 'for-7.0/block-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull block updates from Jens Axboe:

 - Support for batch request processing for ublk, improving the
   efficiency of the kernel/ublk server communication. This can yield
   nice 7-12% performance improvements

 - Support for integrity data for ublk

 - Various other ublk improvements and additions, including a ton of
   selftests additions and updates

 - Move the handling of blk-crypto software fallback from below the
   block layer to above it. This reduces the complexity of dealing with
   bio splitting

 - Series fixing a number of potential deadlocks in blk-mq related to
   the queue usage counter and writeback throttling and rq-qos debugfs
   handling

 - Add an async_depth queue attribute, to resolve a performance
   regression that's been around for a while related to the scheduler
   depth handling

 - Only use task_work for IOPOLL completions on NVMe, if it is necessary
   to do so. An earlier fix for an issue resulted in all these
   completions being punted to task_work, to guarantee that completions
   were only run for a given io_uring ring when it was local to that
   ring. With the new changes, we can detect if it's necessary to use
   task_work or not, and avoid it if possible.

 - rnbd fixes:
      - Fix refcount underflow in device unmap path
      - Handle PREFLUSH and NOUNMAP flags properly in protocol
      - Fix server-side bi_size for special IOs
      - Zero response buffer before use
      - Fix trace format for flags
      - Add .release to rnbd_dev_ktype

 - MD pull requests via Yu Kuai
      - Fix raid5_run() to return error when log_init() fails
      - Fix IO hang with degraded array with llbitmap
      - Fix percpu_ref not resurrected on suspend timeout in llbitmap
      - Fix GPF in write_page caused by resize race
      - Fix NULL pointer dereference in process_metadata_update
      - Fix hang when stopping arrays with metadata through dm-raid
      - Fix any_working flag handling in raid10_sync_request
      - Refactor sync/recovery code path, improve error handling for
        badblocks, and remove unused recovery_disabled field
      - Consolidate mddev boolean fields into mddev_flags
      - Use mempool to allocate stripe_request_ctx and make sure
        max_sectors is not less than io_opt in raid5
      - Fix return value of mddev_trylock
      - Fix memory leak in raid1_run()
      - Add Li Nan as mdraid reviewer

 - Move phys_vec definitions to the kernel types, mostly in preparation
   for some VFIO and RDMA changes

 - Improve the speed for secure erase for some devices

 - Various little rust updates

 - Various other minor fixes, improvements, and cleanups

* tag 'for-7.0/block-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (162 commits)
  blk-mq: ABI/sysfs-block: fix docs build warnings
  selftests: ublk: organize test directories by test ID
  block: decouple secure erase size limit from discard size limit
  block: remove redundant kill_bdev() call in set_blocksize()
  blk-mq: add documentation for new queue attribute async_dpeth
  block, bfq: convert to use request_queue->async_depth
  mq-deadline: covert to use request_queue->async_depth
  kyber: covert to use request_queue->async_depth
  blk-mq: add a new queue sysfs attribute async_depth
  blk-mq: factor out a helper blk_mq_limit_depth()
  blk-mq-sched: unify elevators checking for async requests
  block: convert nr_requests to unsigned int
  block: don't use strcpy to copy blockdev name
  blk-mq-debugfs: warn about possible deadlock
  blk-mq-debugfs: add missing debugfs_mutex in blk_mq_debugfs_register_hctxs()
  blk-mq-debugfs: remove blk_mq_debugfs_unregister_rqos()
  blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static
  blk-rq-qos: fix possible debugfs_mutex deadlock
  blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos
  blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter
  ...
2026-02-09 17:57:21 -08:00

611 lines
15 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef KUBLK_INTERNAL_H
#define KUBLK_INTERNAL_H
#include <unistd.h>
#include <stdlib.h>
#include <assert.h>
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
#include <pthread.h>
#include <getopt.h>
#include <limits.h>
#include <poll.h>
#include <fcntl.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/inotify.h>
#include <sys/wait.h>
#include <sys/eventfd.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <linux/io_uring.h>
#include <liburing.h>
#include <semaphore.h>
/* allow ublk_dep.h to override ublk_cmd.h */
#include "ublk_dep.h"
#include <linux/ublk_cmd.h>
#include "utils.h"
#define MAX_BACK_FILES 4
/****************** part 1: libublk ********************/
#define CTRL_DEV "/dev/ublk-control"
#define UBLKC_DEV "/dev/ublkc"
#define UBLKB_DEV "/dev/ublkb"
#define UBLK_CTRL_RING_DEPTH 32
#define ERROR_EVTFD_DEVID -2
#define UBLK_IO_MAX_BYTES (1 << 20)
#define UBLK_MAX_QUEUES_SHIFT 5
#define UBLK_MAX_QUEUES (1 << UBLK_MAX_QUEUES_SHIFT)
#define UBLK_MAX_THREADS_SHIFT 5
#define UBLK_MAX_THREADS (1 << UBLK_MAX_THREADS_SHIFT)
#define UBLK_QUEUE_DEPTH 1024
struct ublk_dev;
struct ublk_queue;
struct ublk_thread;
struct stripe_ctx {
/* stripe */
unsigned int chunk_size;
};
struct fault_inject_ctx {
/* fault_inject */
unsigned long delay_us;
};
/*
 * Command-line/runtime configuration for one ublk device instance,
 * built by the option parser and consumed when the device is created.
 */
struct dev_ctx {
	char tgt_type[16];		/* target name, e.g. "null", "loop" */
	unsigned long flags;		/* UBLK_F_* feature flags */
	unsigned nr_hw_queues;
	unsigned short nthreads;	/* number of server I/O threads */
	unsigned queue_depth;
	int dev_id;			/* requested device id; presumably -1 means "any" — TODO confirm */

	int nr_files;			/* number of entries used in files[] */
	char *files[MAX_BACK_FILES];	/* backing file paths */

	/* boolean behavior switches */
	unsigned int	logging:1;
	unsigned int	all:1;
	unsigned int	fg:1;		/* stay in foreground */
	unsigned int	recovery:1;
	unsigned int	auto_zc_fallback:1;
	unsigned int	per_io_tasks:1;
	unsigned int	no_ublk_fixed_fd:1;
	unsigned int	safe_stop:1;
	unsigned int	no_auto_part_scan:1;

	/* integrity/metadata settings, see ublk_set_integrity_params() */
	__u32 integrity_flags;
	__u8 metadata_size;		/* per-sector metadata bytes; 0 disables integrity */
	__u8 pi_offset;
	__u8 csum_type;
	__u8 tag_size;

	int _evtfd;			/* eventfd for parent/daemon handshake — TODO confirm */
	int _shmid;

	/* built from shmem, only for ublk_dump_dev() */
	struct ublk_dev *shadow_dev;

	/* for 'update_size' command */
	unsigned long long size;

	/* per-target options; which member is valid depends on tgt_type */
	union {
		struct stripe_ctx stripe;
		struct fault_inject_ctx fault_inject;
	};
};
/* Description of one control command sent to /dev/ublk-control */
struct ublk_ctrl_cmd_data {
	__u32 cmd_op;			/* UBLK_U_CMD_* opcode */
/* which of the fields below are valid for this command */
#define CTRL_CMD_HAS_DATA	1
#define CTRL_CMD_HAS_BUF	2
	__u32 flags;

	__u64 data[2];			/* inline payload (CTRL_CMD_HAS_DATA) */
	__u64 addr;			/* out-of-line buffer (CTRL_CMD_HAS_BUF) */
	__u32 len;			/* length of the buffer at addr */
};
/* Per-tag I/O state tracked by the ublk server */
struct ublk_io {
	char *buf_addr;			/* data buffer for this tag */
	void *integrity_buf;		/* integrity metadata buffer, if enabled */

/* state machine flags, see ublk_mark_io_done()/ublk_queue_io_cmd() */
#define UBLKS_IO_NEED_FETCH_RQ		(1UL << 0)
#define UBLKS_IO_NEED_COMMIT_RQ_COMP	(1UL << 1)
#define UBLKS_IO_FREE			(1UL << 2)
#define UBLKS_IO_NEED_GET_DATA		(1UL << 3)
#define UBLKS_IO_NEED_REG_BUF		(1UL << 4)
	unsigned short flags;
	unsigned short refs;		/* used by target code only */

	int tag;			/* index of this io in ublk_queue.ios[] */
	int result;			/* completion result to commit */

	unsigned short buf_index;	/* registered buffer index (non-batch mode) */
	unsigned short tgt_ios;		/* outstanding backing I/Os for this request */
	void *private_data;
};
/* Target implementation hooks; one instance per target type */
struct ublk_tgt_ops {
	const char *name;
	/* set up target state/params before the device starts */
	int (*init_tgt)(const struct dev_ctx *ctx, struct ublk_dev *);
	void (*deinit_tgt)(struct ublk_dev *);

	/* queue one incoming I/O identified by tag; returns queued count or error */
	int (*queue_io)(struct ublk_thread *, struct ublk_queue *, int tag);
	/* called when a target-issued SQE completes */
	void (*tgt_io_done)(struct ublk_thread *, struct ublk_queue *,
			    const struct io_uring_cqe *);

	/*
	 * Target specific command line handling
	 *
	 * each option requires argument for target command line
	 */
	void (*parse_cmd_line)(struct dev_ctx *ctx, int argc, char *argv[]);
	void (*usage)(const struct ublk_tgt_ops *ops);

	/* return buffer index for UBLK_F_AUTO_BUF_REG */
	unsigned short (*buf_index)(const struct ublk_thread *t,
				    const struct ublk_queue *, int tag);
};
/* Per-device target state filled in by init_tgt() */
struct ublk_tgt {
	unsigned long dev_size;		/* exported device size in bytes */
	unsigned int  sq_depth;		/* io_uring SQ depth for target I/O */
	unsigned int  cq_depth;		/* io_uring CQ depth for target I/O */
	const struct ublk_tgt_ops *ops;
	struct ublk_params params;	/* parameters sent to the kernel */

	int nr_backing_files;
	unsigned long backing_file_size[MAX_BACK_FILES];
	char backing_file[MAX_BACK_FILES][PATH_MAX];
};
/* One hardware queue of a ublk device */
struct ublk_queue {
	int q_id;
	int q_depth;			/* number of tags in this queue */
	struct ublk_dev *dev;
	const struct ublk_tgt_ops *tgt_ops;
	struct ublksrv_io_desc *io_cmd_buf;	/* mmap'ed I/O descriptor array, indexed by tag */

/* borrow three bit of ublk uapi flags, which may never be used */
#define UBLKS_Q_AUTO_BUF_REG_FALLBACK	(1ULL << 63)
#define UBLKS_Q_NO_UBLK_FIXED_FD	(1ULL << 62)
#define UBLKS_Q_PREPARED		(1ULL << 61)
	__u64 flags;			/* UBLK_F_* plus the UBLKS_Q_* bits above */
	int ublk_fd;			/* cached ublk char device fd */
	__u8 metadata_size;		/* per-sector integrity bytes, 0 if disabled */
	struct ublk_io ios[UBLK_QUEUE_DEPTH];

	/* used for prep io commands */
	pthread_spinlock_t lock;
};
/* align with `ublk_elem_header` (kernel uapi layout; do not reorder) */
struct ublk_batch_elem {
	__u16 tag;
	__u16 buf_index;
	__s32 result;
	__u64 buf_addr;
};

/* State of the commit buffer currently being filled for one queue */
struct batch_commit_buf {
	unsigned short q_id;
	unsigned short buf_idx;		/* UBLKS_T_COMMIT_BUF_INV_IDX when none prepared */
	void *elem;			/* next element slot to fill — TODO confirm */
	unsigned short done;		/* elements already written */
	unsigned short count;		/* element capacity of the buffer */
};

/* Provided-buffer ring used for multishot UBLK_U_IO_FETCH_IO_CMDS */
struct batch_fetch_buf {
	struct io_uring_buf_ring *br;
	void *fetch_buf;
	unsigned int fetch_buf_size;
	unsigned int fetch_buf_off;
};
/* One server I/O thread; owns its own io_uring and may serve several queues */
struct ublk_thread {
	/* Thread-local copy of queue-to-thread mapping for this thread */
	unsigned char q_map[UBLK_MAX_QUEUES];
	struct ublk_dev *dev;
	unsigned short idx;		/* this thread's index within the device */
	unsigned short nr_queues;	/* queues served by this thread */

#define UBLKS_T_STOPPING	(1U << 0)
#define UBLKS_T_IDLE		(1U << 1)
#define UBLKS_T_BATCH_IO	(1U << 31)	/* readonly */
	unsigned state;

	unsigned int cmd_inflight;	/* outstanding ublk uring_cmds */
	unsigned int io_inflight;	/* outstanding target I/Os */
	unsigned short nr_bufs;

	/* followings are for BATCH_IO */
	unsigned short commit_buf_start;	/* first registered buffer index for commit bufs */
	unsigned char commit_buf_elem_size;	/* bytes per ublk_batch_elem entry */
	/*
	 * We just support single device, so pre-calculate commit/prep flags
	 */
	unsigned short cmd_flags;
	unsigned int nr_commit_buf;
	unsigned int commit_buf_size;
	void *commit_buf;
#define UBLKS_T_COMMIT_BUF_INV_IDX	((unsigned short)-1)
	struct allocator commit_buf_alloc;	/* allocator over the commit buffers */
	struct batch_commit_buf *commit;	/* per-queue commit state — TODO confirm indexing */

	/* FETCH_IO_CMDS buffer */
	unsigned short nr_fetch_bufs;
	struct batch_fetch_buf *fetch;

	struct io_uring ring;		/* this thread's io_uring instance */
};
/* One ublk device: target state, kernel device info and all queues */
struct ublk_dev {
	struct ublk_tgt tgt;
	struct ublksrv_ctrl_dev_info dev_info;	/* mirrors the kernel's view */
	struct ublk_queue q[UBLK_MAX_QUEUES];
	unsigned nthreads;		/* number of I/O threads */
	unsigned per_io_tasks;

	int fds[MAX_BACK_FILES + 1];	/* fds[0] points to /dev/ublkcN */
	int nr_fds;			/* number of valid entries in fds[] */
	int ctrl_fd;			/* /dev/ublk-control fd */
	struct io_uring ring;		/* ring used for control commands */

	void *private_data;
};
extern int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io);
/*
 * Return 1 if UBLK_F_BATCH_IO is set in @flags, 0 otherwise.
 *
 * Normalize with !! instead of returning the raw masked value: the
 * mask result is __u64 and truncating it to int would silently yield
 * 0 if the flag bit ever sits above bit 31.  This also matches the
 * !!() convention used by the other flag helpers in this header.
 */
static inline int __ublk_use_batch_io(__u64 flags)
{
	return !!(flags & UBLK_F_BATCH_IO);
}
static inline int ublk_queue_batch_io(const struct ublk_queue *q)
{
return __ublk_use_batch_io(q->flags);
}
static inline int ublk_dev_batch_io(const struct ublk_dev *dev)
{
return __ublk_use_batch_io(dev->dev_info.flags);
}
/*
 * Does this thread run in batch I/O mode?  Only valid when a single
 * device is handled in this pthread context.
 *
 * UBLKS_T_BATCH_IO is 1U << 31; converting the raw masked value of
 * the unsigned state word to the int return type is implementation-
 * defined (the result would be negative on common ABIs).  Normalize
 * with !! so callers always see 0 or 1.
 */
static inline int ublk_thread_batch_io(const struct ublk_thread *t)
{
	return !!(t->state & UBLKS_T_BATCH_IO);
}
/*
 * Fill @params->integrity from the command-line context; no-op unless
 * a metadata size was requested.
 */
static inline void ublk_set_integrity_params(const struct dev_ctx *ctx,
					     struct ublk_params *params)
{
	if (!ctx->metadata_size)
		return;

	params->types |= UBLK_PARAM_TYPE_INTEGRITY;
	/* compound literal zero-fills every field not listed explicitly */
	params->integrity = (struct ublk_param_integrity) {
		.flags = ctx->integrity_flags,
		/* one integrity interval per logical block */
		.interval_exp = params->basic.logical_bs_shift,
		.metadata_size = ctx->metadata_size,
		.pi_offset = ctx->pi_offset,
		.csum_type = ctx->csum_type,
		.tag_size = ctx->tag_size,
	};
}
/* Integrity bytes needed for @len data bytes on queue @q */
static inline size_t ublk_integrity_len(const struct ublk_queue *q, size_t len)
{
	/* All targets currently use interval_exp = logical_bs_shift = 9 */
	size_t nr_intervals = len >> 9;

	return nr_intervals * q->metadata_size;
}
/* Inverse of ublk_integrity_len(): data bytes covered by @integrity_len */
static inline size_t
ublk_integrity_data_len(const struct ublk_queue *q, size_t integrity_len)
{
	size_t nr_intervals = integrity_len / q->metadata_size;

	return nr_intervals << 9;
}
/* Did the kernel request the auto buffer-register fallback for this io? */
static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod)
{
	return (iod->op_flags & UBLK_IO_F_NEED_REG_BUF) != 0;
}
static inline __u64 ublk_user_copy_offset(unsigned q_id, unsigned tag)
{
return UBLKSRV_IO_BUF_OFFSET +
((__u64)q_id << UBLK_QID_OFF | (__u64)tag << UBLK_TAG_OFF);
}
/* Bit 63 of user_data marks SQEs issued by the target, not by libublk */
static inline int is_target_io(__u64 user_data)
{
	return !!(user_data >> 63);
}
/*
 * Pack (tag, op, tgt_data, q_id, target-io bit) into the 64-bit
 * io_uring user_data.  Layout: tag[15:0], op[23:16], tgt_data[39:24],
 * q_id[62:56], is_target_io[63].
 */
static inline __u64 build_user_data(unsigned tag, unsigned op,
		unsigned tgt_data, unsigned q_id, unsigned is_target_io)
{
	__u64 data;

	/* we only have 7 bits to encode q_id */
	_Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7, "UBLK_MAX_QUEUES_SHIFT must be <= 7");
	ublk_assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7));

	data = (__u64)tag;
	data |= (__u64)op << 16;
	data |= (__u64)tgt_data << 24;
	data |= (__u64)q_id << 56;
	data |= (__u64)is_target_io << 63;
	return data;
}
/* Extract tag (bits 15:0) from packed user_data */
static inline unsigned int user_data_to_tag(__u64 user_data)
{
	return (unsigned int)(user_data & 0xffffULL);
}
/* Extract op (bits 23:16) from packed user_data */
static inline unsigned int user_data_to_op(__u64 user_data)
{
	__u64 shifted = user_data >> 16;

	return (unsigned int)(shifted & 0xff);
}
/* Extract tgt_data (bits 39:24) from packed user_data */
static inline unsigned int user_data_to_tgt_data(__u64 user_data)
{
	__u64 shifted = user_data >> 24;

	return (unsigned int)(shifted & 0xffff);
}
/* Extract q_id (bits 62:56) from packed user_data */
static inline unsigned int user_data_to_q_id(__u64 user_data)
{
	__u64 shifted = user_data >> 56;

	return (unsigned int)(shifted & 0x7f);
}
/* Command-number field of an ioctl-style ublk opcode */
static inline unsigned short ublk_cmd_op_nr(unsigned int op)
{
	return (unsigned short)_IOC_NR(op);
}
/*
 * Map an io back to its owning queue.  Valid because ios[] is embedded
 * in struct ublk_queue and io->tag is the io's own index within it.
 */
static inline struct ublk_queue *ublk_io_to_queue(const struct ublk_io *io)
{
	return container_of(io, struct ublk_queue, ios[io->tag]);
}
/*
 * Grab @nr_sqes SQEs from the thread's ring, flushing pending SQEs
 * first if the SQ ring lacks space.  Returns how many were obtained,
 * which may be fewer than requested.
 */
static inline int ublk_io_alloc_sqes(struct ublk_thread *t,
		struct io_uring_sqe *sqes[], int nr_sqes)
{
	struct io_uring *ring = &t->ring;
	int got = 0;

	if (io_uring_sq_space_left(ring) < (unsigned)nr_sqes)
		io_uring_submit(ring);

	while (got < nr_sqes) {
		struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

		if (!sqe)
			break;
		sqes[got++] = sqe;
	}
	return got;
}
/*
 * Translate a logical fd index into what the SQE should carry,
 * depending on whether the ublk char dev fd is io_uring-registered.
 */
static inline int ublk_get_registered_fd(struct ublk_queue *q, int fd_index)
{
	if (!(q->flags & UBLKS_Q_NO_UBLK_FIXED_FD))
		return fd_index;

	/* index 0 maps to the raw ublk char device fd */
	if (fd_index == 0)
		return q->ublk_fd;

	/* backing files shift down by one when the ublk fd isn't registered */
	return fd_index - 1;
}
/*
 * Common setup for UBLK_U_IO_(UN)REGISTER_IO_BUF uring_cmd SQEs:
 * prep a dummy read on the ublk char dev, convert it to URING_CMD and
 * fill the ublksrv_io_cmd payload.  Caller sets sqe->cmd_op afterwards.
 */
static inline void __io_uring_prep_buf_reg_unreg(struct io_uring_sqe *sqe,
		struct ublk_queue *q, int tag, int q_id, __u64 index)
{
	struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd;
	int dev_fd = ublk_get_registered_fd(q, 0);

	io_uring_prep_read(sqe, dev_fd, 0, 0, 0);
	sqe->opcode = IORING_OP_URING_CMD;
	/* fixed-file flag only applies when the ublk fd is ring-registered */
	if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD)
		sqe->flags &= ~IOSQE_FIXED_FILE;
	else
		sqe->flags |= IOSQE_FIXED_FILE;

	cmd->tag = tag;
	cmd->addr = index;	/* kernel buffer index to (un)register */
	cmd->q_id = q_id;
}
/* Prepare a UBLK_U_IO_REGISTER_IO_BUF uring_cmd for (q_id, tag) -> index */
static inline void io_uring_prep_buf_register(struct io_uring_sqe *sqe,
		struct ublk_queue *q, int tag, int q_id, __u64 index)
{
	__io_uring_prep_buf_reg_unreg(sqe, q, tag, q_id, index);
	sqe->cmd_op = UBLK_U_IO_REGISTER_IO_BUF;
}
/* Prepare a UBLK_U_IO_UNREGISTER_IO_BUF uring_cmd for (q_id, tag) -> index */
static inline void io_uring_prep_buf_unregister(struct io_uring_sqe *sqe,
		struct ublk_queue *q, int tag, int q_id, __u64 index)
{
	__io_uring_prep_buf_reg_unreg(sqe, q, tag, q_id, index);
	sqe->cmd_op = UBLK_U_IO_UNREGISTER_IO_BUF;
}
/* Pointer to the uring_cmd payload area inside @sqe (casts away const) */
static inline void *ublk_get_sqe_cmd(const struct io_uring_sqe *sqe)
{
	return (void *)&sqe->cmd;
}
static inline void ublk_set_io_res(struct ublk_queue *q, int tag, int res)
{
q->ios[tag].result = res;
}
static inline int ublk_get_io_res(const struct ublk_queue *q, unsigned tag)
{
return q->ios[tag].result;
}
/*
 * Mark an io completed: stash the result and flag it as needing a
 * COMMIT_AND_FETCH command, with the slot free for reuse.
 */
static inline void ublk_mark_io_done(struct ublk_io *io, int res)
{
	io->result = res;
	io->flags |= UBLKS_IO_NEED_COMMIT_RQ_COMP;
	io->flags |= UBLKS_IO_FREE;
}
/* I/O descriptor (shared with the kernel) for @tag on queue @q */
static inline const struct ublksrv_io_desc *ublk_get_iod(const struct ublk_queue *q, int tag)
{
	return q->io_cmd_buf + tag;
}
/*
 * Store the 32-bit uring_cmd opcode into the SQE.  cmd_op overlays the
 * low half of sqe->off in the io_uring_sqe union; the second word is
 * zeroed.  NOTE(review): depends on the liburing sqe layout — confirm
 * against struct io_uring_sqe if liburing is updated.
 */
static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op)
{
	__u32 *addr = (__u32 *)&sqe->off;

	addr[0] = cmd_op;
	addr[1] = 0;
}
/* defined later in this header; needed by ublk_io_buf_idx() below */
static inline unsigned short ublk_batch_io_buf_idx(
		const struct ublk_thread *t, const struct ublk_queue *q,
		unsigned tag);

/*
 * Buffer index for an io: computed per-thread in batch mode, otherwise
 * the per-io buf_index assigned at setup time.
 */
static inline unsigned short ublk_io_buf_idx(const struct ublk_thread *t,
					     const struct ublk_queue *q,
					     unsigned tag)
{
	if (ublk_queue_batch_io(q))
		return ublk_batch_io_buf_idx(t, q, tag);
	return q->ios[tag].buf_index;
}
/* The io slot for @tag on queue @q */
static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag)
{
	return q->ios + tag;
}
/*
 * Account one finished backing I/O for @tag; returns true when it was
 * the last outstanding one, i.e. the ublk request can now complete.
 */
static inline int ublk_completed_tgt_io(struct ublk_thread *t,
		struct ublk_queue *q, unsigned tag)
{
	struct ublk_io *io = ublk_get_io(q, tag);

	t->io_inflight -= 1;
	io->tgt_ios -= 1;
	return io->tgt_ios == 0;
}
/* Is explicit zero-copy (UBLK_F_SUPPORT_ZERO_COPY) enabled? */
static inline bool ublk_queue_use_zc(const struct ublk_queue *q)
{
	return (q->flags & UBLK_F_SUPPORT_ZERO_COPY) != 0;
}
/* Is automatic buffer registration (UBLK_F_AUTO_BUF_REG) enabled? */
static inline bool ublk_queue_use_auto_zc(const struct ublk_queue *q)
{
	return (q->flags & UBLK_F_AUTO_BUF_REG) != 0;
}
/* Is the auto buffer-register fallback path enabled for this queue? */
static inline bool ublk_queue_auto_zc_fallback(const struct ublk_queue *q)
{
	return (q->flags & UBLKS_Q_AUTO_BUF_REG_FALLBACK) != 0;
}
/* Is user-copy mode (UBLK_F_USER_COPY) enabled? */
static inline bool ublk_queue_use_user_copy(const struct ublk_queue *q)
{
	return (q->flags & UBLK_F_USER_COPY) != 0;
}
/* True when the server allocates no data buffers (any zero-copy mode) */
static inline int ublk_queue_no_buf(const struct ublk_queue *q)
{
	if (ublk_queue_use_zc(q))
		return 1;
	return ublk_queue_use_auto_zc(q);
}
/* Has a commit buffer been prepared (buf_idx holds a valid index)? */
static inline int ublk_batch_commit_prepared(struct batch_commit_buf *cb)
{
	return !(cb->buf_idx == UBLKS_T_COMMIT_BUF_INV_IDX);
}
/*
 * Position of queue @q among the queues served by thread @t.
 * q_map entries are stored 1-based; 0 means "not mapped to this thread".
 */
static inline unsigned ublk_queue_idx_in_thread(const struct ublk_thread *t,
						const struct ublk_queue *q)
{
	unsigned char slot = t->q_map[q->q_id];

	ublk_assert(slot != 0);
	return (unsigned)(slot - 1);
}
/*
 * Each IO's buffer index has to be calculated by this helper for
 * UBLKS_T_BATCH_IO: one contiguous range of q_depth buffers per queue
 * served by the thread, indexed by tag within the range.
 */
static inline unsigned short ublk_batch_io_buf_idx(
		const struct ublk_thread *t, const struct ublk_queue *q,
		unsigned tag)
{
	unsigned base = ublk_queue_idx_in_thread(t, q) * q->q_depth;

	return base + tag;
}
/* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */
int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q);
/* Start fetching I/O commands using multishot UBLK_U_IO_FETCH_IO_CMDS */
void ublk_batch_start_fetch(struct ublk_thread *t);
/* Handle completion of batch I/O commands (prep/commit) */
void ublk_batch_compl_cmd(struct ublk_thread *t,
const struct io_uring_cqe *cqe);
/* Initialize batch I/O state and calculate buffer parameters */
void ublk_batch_prepare(struct ublk_thread *t);
/* Allocate and register commit buffers for batch operations */
int ublk_batch_alloc_buf(struct ublk_thread *t);
/* Free commit buffers and cleanup batch allocator */
void ublk_batch_free_buf(struct ublk_thread *t);
/* Prepare a new commit buffer for batching completed I/O operations */
void ublk_batch_prep_commit(struct ublk_thread *t);
/* Submit UBLK_U_IO_COMMIT_IO_CMDS with batched completed I/O operations */
void ublk_batch_commit_io_cmds(struct ublk_thread *t);
/* Add a completed I/O operation to the current batch commit buffer */
void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q,
unsigned tag, int res);
void ublk_batch_setup_map(unsigned char (*q_thread_map)[UBLK_MAX_QUEUES],
int nthreads, int queues);
/*
 * Complete the io for @tag with result @res: queue it into the batch
 * commit buffer in batch mode, otherwise issue COMMIT_AND_FETCH via
 * ublk_queue_io_cmd().  Returns 0 in batch mode, else the queueing result.
 */
static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q,
				   unsigned tag, int res)
{
	struct ublk_io *io;

	if (ublk_queue_batch_io(q)) {
		ublk_batch_complete_io(t, q, tag, res);
		return 0;
	}

	io = &q->ios[tag];
	ublk_mark_io_done(io, res);
	return ublk_queue_io_cmd(t, io);
}
/*
 * Record the outcome of the target's queue_io() for @tag: a negative
 * @queued completes the io with that error, otherwise it is the number
 * of backing I/Os now in flight for this request.
 */
static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q,
				      unsigned tag, int queued)
{
	struct ublk_io *io;

	if (queued < 0) {
		ublk_complete_io(t, q, tag, queued);
		return;
	}

	io = ublk_get_io(q, tag);
	t->io_inflight += queued;
	io->tgt_ios = queued;
	io->result = 0;
}
extern const struct ublk_tgt_ops null_tgt_ops;
extern const struct ublk_tgt_ops loop_tgt_ops;
extern const struct ublk_tgt_ops stripe_tgt_ops;
extern const struct ublk_tgt_ops fault_inject_tgt_ops;
void backing_file_tgt_deinit(struct ublk_dev *dev);
int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct);
#endif