diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 09a9d4aca0fd..900b3fc4c72d 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -886,6 +886,21 @@ Description: zone commands, they will be treated as regular block devices and zoned will report "none". +What: /sys/block//queue/zoned_qd1_writes +Date: January 2026 +Contact: Damien Le Moal +Description: + [RW] zoned_qd1_writes indicates if write operations to a zoned + block device are being handled using a single issuer context (a + kernel thread) operating at a maximum queue depth of 1. This + attribute is visible only for zoned block devices. The default + value for zoned block devices that are not rotational devices + (e.g. ZNS SSDs or zoned UFS devices) is 0. For rotational zoned + block devices (e.g. SMR HDDs) the default value is 1. Since + this default may not be appropriate for some devices, e.g. + remotely connected devices over high latency networks, the user + can disable this feature by setting this attribute to 0. + What: /sys/block//hidden Date: March 2023 diff --git a/Documentation/ABI/testing/sysfs-nvme b/Documentation/ABI/testing/sysfs-nvme new file mode 100644 index 000000000000..499d5f843cd4 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-nvme @@ -0,0 +1,13 @@ +What: /sys/devices/virtual/nvme-fabrics/ctl/.../tls_configured_key +Date: November 2025 +KernelVersion: 6.19 +Contact: Linux NVMe mailing list +Description: + The file is available when using a secure concatenation + connection to an NVMe target. Reading the file will return + the serial of the currently negotiated key. + + Writing 0 to the file will trigger a PSK reauthentication + (REPLACETLSPSK) with the target. After a reauthentication + the value returned by tls_configured_key will be the new + serial. 
diff --git a/Documentation/admin-guide/blockdev/zoned_loop.rst b/Documentation/admin-guide/blockdev/zoned_loop.rst index 6aa865424ac3..f4f1f3121bf9 100644 --- a/Documentation/admin-guide/blockdev/zoned_loop.rst +++ b/Documentation/admin-guide/blockdev/zoned_loop.rst @@ -62,7 +62,7 @@ The options available for the add command can be listed by reading the /dev/zloop-control device:: $ cat /dev/zloop-control - add id=%d,capacity_mb=%u,zone_size_mb=%u,zone_capacity_mb=%u,conv_zones=%u,base_dir=%s,nr_queues=%u,queue_depth=%u,buffered_io + add id=%d,capacity_mb=%u,zone_size_mb=%u,zone_capacity_mb=%u,conv_zones=%u,max_open_zones=%u,base_dir=%s,nr_queues=%u,queue_depth=%u,buffered_io,zone_append=%u,ordered_zone_append,discard_write_cache remove id=%d In more details, the options that can be used with the "add" command are as @@ -80,6 +80,9 @@ zone_capacity_mb Device zone capacity (must always be equal to or lower conv_zones Total number of conventioanl zones starting from sector 0 Default: 8 +max_open_zones Maximum number of open sequential write required zones + (0 for no limit). + Default: 0 base_dir Path to the base directory where to create the directory containing the zone files of the device. Default=/var/local/zloop. @@ -104,6 +107,11 @@ ordered_zone_append Enable zloop mitigation of zone append reordering. (extents), as when enabled, this can significantly reduce the number of data extents needed to for a file data mapping. +discard_write_cache Discard all data that was not explicitly persisted using a + flush operation when the device is removed by truncating + each zone file to the size recorded during the last flush + operation. This simulates power fail events where + uncommitted data is lost. 
=================== ========================================================= 3) Deleting a Zoned Device diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst index 7e0703a12dfb..cae23949a626 100644 --- a/Documentation/block/inline-encryption.rst +++ b/Documentation/block/inline-encryption.rst @@ -153,7 +153,7 @@ blk-crypto-fallback completes the original bio. If the original bio is too large, multiple bounce bios may be required; see the code for details. For decryption, blk-crypto-fallback "wraps" the bio's completion callback -(``bi_complete``) and private data (``bi_private``) with its own, unsets the +(``bi_end_io``) and private data (``bi_private``) with its own, unsets the bio's encryption context, then submits the bio. If the read completes successfully, blk-crypto-fallback restores the bio's original completion callback and private data, then decrypts the bio's data in-place using the diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index 6ad28039663d..0413dcd9ef69 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -485,6 +485,125 @@ Limitations in case that too many ublk devices are handled by this single io_ring_ctx and each one has very large queue depth +Shared Memory Zero Copy (UBLK_F_SHMEM_ZC) +------------------------------------------ + +The ``UBLK_F_SHMEM_ZC`` feature provides an alternative zero-copy path +that works by sharing physical memory pages between the client application +and the ublk server. Unlike the io_uring fixed buffer approach above, +shared memory zero copy does not require io_uring buffer registration +per I/O — instead, it relies on the kernel matching physical pages +at I/O time. This allows the ublk server to access the shared +buffer directly, which is unlikely for the io_uring fixed buffer +approach. 
+ +Motivation +~~~~~~~~~~ + +Shared memory zero copy takes a different approach: if the client +application and the ublk server both map the same physical memory, there is +nothing to copy. The kernel detects the shared pages automatically and +tells the server where the data already lives. + +``UBLK_F_SHMEM_ZC`` can be thought of as a supplement for optimized client +applications — when the client is willing to allocate I/O buffers from +shared memory, the entire data path becomes zero-copy. + +Use Cases +~~~~~~~~~ + +This feature is useful when the client application can be configured to +use a specific shared memory region for its I/O buffers: + +- **Custom storage clients** that allocate I/O buffers from shared memory + (memfd, hugetlbfs) and issue direct I/O to the ublk device +- **Database engines** that use pre-allocated buffer pools with O_DIRECT + +How It Works +~~~~~~~~~~~~ + +1. The ublk server and client both ``mmap()`` the same file (memfd or + hugetlbfs) with ``MAP_SHARED``. This gives both processes access to the + same physical pages. + +2. The ublk server registers its mapping with the kernel:: + + struct ublk_shmem_buf_reg buf = { .addr = mmap_va, .len = size }; + ublk_ctrl_cmd(UBLK_U_CMD_REG_BUF, .addr = &buf); + + The kernel pins the pages and builds a PFN lookup tree. + +3. When the client issues direct I/O (``O_DIRECT``) to ``/dev/ublkb*``, + the kernel checks whether the I/O buffer pages match any registered + pages by comparing PFNs. + +4. On a match, the kernel sets ``UBLK_IO_F_SHMEM_ZC`` in the I/O + descriptor and encodes the buffer index and offset in ``addr``:: + + if (iod->op_flags & UBLK_IO_F_SHMEM_ZC) { + /* Data is already in our shared mapping — zero copy */ + index = ublk_shmem_zc_index(iod->addr); + offset = ublk_shmem_zc_offset(iod->addr); + buf = shmem_table[index].mmap_base + offset; + } + +5. If pages do not match (e.g., the client used a non-shared buffer), + the I/O falls back to the normal copy path silently. 
+ +The shared memory can be set up via two methods: + +- **Socket-based**: the client sends a memfd to the ublk server via + ``SCM_RIGHTS`` on a unix socket. The server mmaps and registers it. +- **Hugetlbfs-based**: both processes ``mmap(MAP_SHARED)`` the same + hugetlbfs file. No IPC needed — same file gives same physical pages. + +Advantages +~~~~~~~~~~ + +- **Simple**: no per-I/O buffer registration or unregistration commands. + Once the shared buffer is registered, all matching I/O is zero-copy + automatically. +- **Direct buffer access**: the ublk server can read and write the shared + buffer directly via its own mmap, without going through io_uring fixed + buffer operations. This is more friendly for server implementations. +- **Fast**: PFN matching is a single maple tree lookup per bvec. No + io_uring command round-trips for buffer management. +- **Compatible**: non-matching I/O silently falls back to the copy path. + The device works normally for any client, with zero-copy as an + optimization when shared memory is available. + +Limitations +~~~~~~~~~~~ + +- **Requires client cooperation**: the client must allocate its I/O + buffers from the shared memory region. This requires a custom or + configured client — standard applications using their own buffers + will not benefit. +- **Direct I/O only**: buffered I/O (without ``O_DIRECT``) goes through + the page cache, which allocates its own pages. These kernel-allocated + pages will never match the registered shared buffer. Only ``O_DIRECT`` + puts the client's buffer pages directly into the block I/O. +- **Contiguous data only**: each I/O request's data must be contiguous + within a single registered buffer. Scatter/gather I/O that spans + multiple non-adjacent registered buffers cannot use the zero-copy path. + +Control Commands +~~~~~~~~~~~~~~~~ + +- ``UBLK_U_CMD_REG_BUF`` + + Register a shared memory buffer. 
``ctrl_cmd.addr`` points to a + ``struct ublk_shmem_buf_reg`` containing the buffer virtual address and size. + Returns the assigned buffer index (>= 0) on success. The kernel pins + pages and builds the PFN lookup tree. Queue freeze is handled + internally. + +- ``UBLK_U_CMD_UNREG_BUF`` + + Unregister a previously registered buffer. ``ctrl_cmd.data[0]`` is the + buffer index. Unpins pages and removes PFN entries from the lookup + tree. + References ========== diff --git a/MAINTAINERS b/MAINTAINERS index cff10617372e..047c9ba52651 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -27015,7 +27015,7 @@ F: Documentation/filesystems/ubifs.rst F: fs/ubifs/ UBLK USERSPACE BLOCK DRIVER -M: Ming Lei +M: Ming Lei L: linux-block@vger.kernel.org S: Maintained F: Documentation/block/ublk.rst diff --git a/block/bio.c b/block/bio.c index 784d2a66d3ae..641ef0928d73 100644 --- a/block/bio.c +++ b/block/bio.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "blk.h" @@ -34,6 +35,8 @@ struct bio_alloc_cache { unsigned int nr_irq; }; +#define BIO_INLINE_VECS 4 + static struct biovec_slab { int nr_vecs; char *name; @@ -114,6 +117,11 @@ static inline unsigned int bs_bio_slab_size(struct bio_set *bs) return bs->front_pad + sizeof(struct bio) + bs->back_pad; } +static inline void *bio_slab_addr(struct bio *bio) +{ + return (void *)bio - bio->bi_pool->front_pad; +} + static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs) { unsigned int size = bs_bio_slab_size(bs); @@ -159,57 +167,16 @@ out: mutex_unlock(&bio_slab_lock); } -void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) -{ - BUG_ON(nr_vecs > BIO_MAX_VECS); - - if (nr_vecs == BIO_MAX_VECS) - mempool_free(bv, pool); - else if (nr_vecs > BIO_INLINE_VECS) - kmem_cache_free(biovec_slab(nr_vecs)->slab, bv); -} - /* * Make the first allocation restricted and don't dump info on allocation * failures, since we'll fall back to the mempool in case of failure. 
*/ -static inline gfp_t bvec_alloc_gfp(gfp_t gfp) +static inline gfp_t try_alloc_gfp(gfp_t gfp) { return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; } -struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, - gfp_t gfp_mask) -{ - struct biovec_slab *bvs = biovec_slab(*nr_vecs); - - if (WARN_ON_ONCE(!bvs)) - return NULL; - - /* - * Upgrade the nr_vecs request to take full advantage of the allocation. - * We also rely on this in the bvec_free path. - */ - *nr_vecs = bvs->nr_vecs; - - /* - * Try a slab allocation first for all smaller allocations. If that - * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. - * The mempool is sized to handle up to BIO_MAX_VECS entries. - */ - if (*nr_vecs < BIO_MAX_VECS) { - struct bio_vec *bvl; - - bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); - if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) - return bvl; - *nr_vecs = BIO_MAX_VECS; - } - - return mempool_alloc(pool, gfp_mask); -} - void bio_uninit(struct bio *bio) { #ifdef CONFIG_BLK_CGROUP @@ -231,9 +198,14 @@ static void bio_free(struct bio *bio) void *p = bio; WARN_ON_ONCE(!bs); + WARN_ON_ONCE(bio->bi_max_vecs > BIO_MAX_VECS); bio_uninit(bio); - bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); + if (bio->bi_max_vecs == BIO_MAX_VECS) + mempool_free(bio->bi_io_vec, &bs->bvec_pool); + else if (bio->bi_max_vecs > BIO_INLINE_VECS) + kmem_cache_free(biovec_slab(bio->bi_max_vecs)->slab, + bio->bi_io_vec); mempool_free(p - bs->front_pad, &bs->bio_pool); } @@ -430,13 +402,31 @@ static void bio_alloc_rescue(struct work_struct *work) } } +/* + * submit_bio_noacct() converts recursion to iteration; this means if we're + * running beneath it, any bios we allocate and submit will not be submitted + * (and thus freed) until after we return. 
+ * + * This exposes us to a potential deadlock if we allocate multiple bios from the + * same bio_set while running underneath submit_bio_noacct(). If we were to + * allocate multiple bios (say a stacking block driver that was splitting bios), + * we would deadlock if we exhausted the mempool's reserve. + * + * We solve this, and guarantee forward progress by punting the bios on + * current->bio_list to a per bio_set rescuer workqueue before blocking to wait + * for elements being returned to the mempool. + */ static void punt_bios_to_rescuer(struct bio_set *bs) { struct bio_list punt, nopunt; struct bio *bio; - if (WARN_ON_ONCE(!bs->rescue_workqueue)) + if (!current->bio_list || !bs->rescue_workqueue) return; + if (bio_list_empty(¤t->bio_list[0]) && + bio_list_empty(¤t->bio_list[1])) + return; + /* * In order to guarantee forward progress we must punt only bios that * were allocated from this bio_set; otherwise, if there was a bio on @@ -483,9 +473,7 @@ static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache) local_irq_restore(flags); } -static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, - unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp, - struct bio_set *bs) +static struct bio *bio_alloc_percpu_cache(struct bio_set *bs) { struct bio_alloc_cache *cache; struct bio *bio; @@ -503,12 +491,10 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, cache->free_list = bio->bi_next; cache->nr--; put_cpu(); - - if (nr_vecs) - bio_init_inline(bio, bdev, nr_vecs, opf); - else - bio_init(bio, bdev, NULL, nr_vecs, opf); bio->bi_pool = bs; + + kmemleak_alloc(bio_slab_addr(bio), + kmem_cache_size(bs->bio_slab), 1, GFP_NOIO); return bio; } @@ -517,7 +503,7 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, * @bdev: block device to allocate the bio for (can be %NULL) * @nr_vecs: number of bvecs to pre-allocate * @opf: operation and flags for bio - * @gfp_mask: the GFP_* mask given to the slab allocator + * 
@gfp: the GFP_* mask given to the slab allocator * @bs: the bio_set to allocate from. * * Allocate a bio from the mempools in @bs. @@ -547,91 +533,77 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, - blk_opf_t opf, gfp_t gfp_mask, - struct bio_set *bs) + blk_opf_t opf, gfp_t gfp, struct bio_set *bs) { - gfp_t saved_gfp = gfp_mask; - struct bio *bio; + struct bio_vec *bvecs = NULL; + struct bio *bio = NULL; + gfp_t saved_gfp = gfp; void *p; /* should not use nobvec bioset for nr_vecs > 0 */ if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0)) return NULL; + gfp = try_alloc_gfp(gfp); if (bs->cache && nr_vecs <= BIO_INLINE_VECS) { - opf |= REQ_ALLOC_CACHE; - bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf, - gfp_mask, bs); - if (bio) - return bio; /* - * No cached bio available, bio returned below marked with - * REQ_ALLOC_CACHE to participate in per-cpu alloc cache. + * Set REQ_ALLOC_CACHE even if no cached bio is available to + * return the allocated bio to the percpu cache when done. */ - } else - opf &= ~REQ_ALLOC_CACHE; - - /* - * submit_bio_noacct() converts recursion to iteration; this means if - * we're running beneath it, any bios we allocate and submit will not be - * submitted (and thus freed) until after we return. - * - * This exposes us to a potential deadlock if we allocate multiple bios - * from the same bio_set() while running underneath submit_bio_noacct(). - * If we were to allocate multiple bios (say a stacking block driver - * that was splitting bios), we would deadlock if we exhausted the - * mempool's reserve. - * - * We solve this, and guarantee forward progress, with a rescuer - * workqueue per bio_set. 
If we go to allocate and there are bios on - * current->bio_list, we first try the allocation without - * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be - * blocking to the rescuer workqueue before we retry with the original - * gfp_flags. - */ - if (current->bio_list && - (!bio_list_empty(¤t->bio_list[0]) || - !bio_list_empty(¤t->bio_list[1])) && - bs->rescue_workqueue) - gfp_mask &= ~__GFP_DIRECT_RECLAIM; - - p = mempool_alloc(&bs->bio_pool, gfp_mask); - if (!p && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - p = mempool_alloc(&bs->bio_pool, gfp_mask); - } - if (unlikely(!p)) - return NULL; - if (!mempool_is_saturated(&bs->bio_pool)) - opf &= ~REQ_ALLOC_CACHE; - - bio = p + bs->front_pad; - if (nr_vecs > BIO_INLINE_VECS) { - struct bio_vec *bvl = NULL; - - bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); - if (!bvl && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); - } - if (unlikely(!bvl)) - goto err_free; - - bio_init(bio, bdev, bvl, nr_vecs, opf); - } else if (nr_vecs) { - bio_init_inline(bio, bdev, BIO_INLINE_VECS, opf); + opf |= REQ_ALLOC_CACHE; + bio = bio_alloc_percpu_cache(bs); } else { - bio_init(bio, bdev, NULL, 0, opf); + opf &= ~REQ_ALLOC_CACHE; + p = kmem_cache_alloc(bs->bio_slab, gfp); + if (p) + bio = p + bs->front_pad; } + if (bio && nr_vecs > BIO_INLINE_VECS) { + struct biovec_slab *bvs = biovec_slab(nr_vecs); + + /* + * Upgrade nr_vecs to take full advantage of the allocation. + * We also rely on this in bio_free(). + */ + nr_vecs = bvs->nr_vecs; + bvecs = kmem_cache_alloc(bvs->slab, gfp); + if (unlikely(!bvecs)) { + kmem_cache_free(bs->bio_slab, p); + bio = NULL; + } + } + + if (unlikely(!bio)) { + /* + * Give up if we are not allow to sleep as non-blocking mempool + * allocations just go back to the slab allocation. 
+ */ + if (!(saved_gfp & __GFP_DIRECT_RECLAIM)) + return NULL; + + punt_bios_to_rescuer(bs); + + /* + * Don't rob the mempools by returning to the per-CPU cache if + * we're tight on memory. + */ + opf &= ~REQ_ALLOC_CACHE; + + p = mempool_alloc(&bs->bio_pool, saved_gfp); + bio = p + bs->front_pad; + if (nr_vecs > BIO_INLINE_VECS) { + nr_vecs = BIO_MAX_VECS; + bvecs = mempool_alloc(&bs->bvec_pool, saved_gfp); + } + } + + if (nr_vecs && nr_vecs <= BIO_INLINE_VECS) + bio_init_inline(bio, bdev, nr_vecs, opf); + else + bio_init(bio, bdev, bvecs, nr_vecs, opf); bio->bi_pool = bs; return bio; - -err_free: - mempool_free(p, &bs->bio_pool); - return NULL; } EXPORT_SYMBOL(bio_alloc_bioset); @@ -765,6 +737,9 @@ static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache, while ((bio = cache->free_list) != NULL) { cache->free_list = bio->bi_next; cache->nr--; + kmemleak_alloc(bio_slab_addr(bio), + kmem_cache_size(bio->bi_pool->bio_slab), + 1, GFP_KERNEL); bio_free(bio); if (++i == nr) break; @@ -828,6 +803,7 @@ static inline void bio_put_percpu_cache(struct bio *bio) bio->bi_bdev = NULL; cache->free_list = bio; cache->nr++; + kmemleak_free(bio_slab_addr(bio)); } else if (in_hardirq()) { lockdep_assert_irqs_disabled(); @@ -835,6 +811,7 @@ static inline void bio_put_percpu_cache(struct bio *bio) bio->bi_next = cache->free_list_irq; cache->free_list_irq = bio; cache->nr_irq++; + kmemleak_free(bio_slab_addr(bio)); } else { goto out_free; } @@ -897,10 +874,11 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) * @gfp: allocation priority * @bs: bio_set to allocate from * - * Allocate a new bio that is a clone of @bio_src. The caller owns the returned - * bio, but not the actual data it points to. - * - * The caller must ensure that the return bio is not freed before @bio_src. + * Allocate a new bio that is a clone of @bio_src. 
This reuses the bio_vecs + * pointed to by @bio_src->bi_io_vec, and clones the iterator pointing to + * the current position in it. The caller owns the returned bio, but not + * the bio_vecs, and must ensure the bio is freed before the memory + * pointed to by @bio_Src->bi_io_vecs. */ struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, gfp_t gfp, struct bio_set *bs) @@ -929,9 +907,7 @@ EXPORT_SYMBOL(bio_alloc_clone); * @gfp: allocation priority * * Initialize a new bio in caller provided memory that is a clone of @bio_src. - * The caller owns the returned bio, but not the actual data it points to. - * - * The caller must ensure that @bio_src is not freed before @bio. + * The same bio_vecs reuse and bio lifetime rules as bio_alloc_clone() apply. */ int bio_init_clone(struct block_device *bdev, struct bio *bio, struct bio *bio_src, gfp_t gfp) @@ -1064,6 +1040,8 @@ int bio_add_page(struct bio *bio, struct page *page, { if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; + if (WARN_ON_ONCE(len == 0)) + return 0; if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len) return 0; @@ -1484,11 +1462,41 @@ void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty) bio_iov_iter_unbounce_read(bio, is_error, mark_dirty); } -static void submit_bio_wait_endio(struct bio *bio) +static void bio_wait_end_io(struct bio *bio) { complete(bio->bi_private); } +/** + * bio_await - call a function on a bio, and wait until it completes + * @bio: the bio which describes the I/O + * @submit: function called to submit the bio + * @priv: private data passed to @submit + * + * Wait for the bio as well as any bio chained off it after executing the + * passed in callback @submit. The wait for the bio is set up before calling + * @submit to ensure that the completion is captured. If @submit is %NULL, + * submit_bio() is used instead to submit the bio. + * + * Note: this overrides the bi_private and bi_end_io fields in the bio. 
+ */ +void bio_await(struct bio *bio, void *priv, + void (*submit)(struct bio *bio, void *priv)) +{ + DECLARE_COMPLETION_ONSTACK_MAP(done, + bio->bi_bdev->bd_disk->lockdep_map); + + bio->bi_private = &done; + bio->bi_end_io = bio_wait_end_io; + bio->bi_opf |= REQ_SYNC; + if (submit) + submit(bio, priv); + else + submit_bio(bio); + blk_wait_io(&done); +} +EXPORT_SYMBOL_GPL(bio_await); + /** * submit_bio_wait - submit a bio, and wait until it completes * @bio: The &struct bio which describes the I/O @@ -1502,19 +1510,30 @@ static void submit_bio_wait_endio(struct bio *bio) */ int submit_bio_wait(struct bio *bio) { - DECLARE_COMPLETION_ONSTACK_MAP(done, - bio->bi_bdev->bd_disk->lockdep_map); - - bio->bi_private = &done; - bio->bi_end_io = submit_bio_wait_endio; - bio->bi_opf |= REQ_SYNC; - submit_bio(bio); - blk_wait_io(&done); - + bio_await(bio, NULL, NULL); return blk_status_to_errno(bio->bi_status); } EXPORT_SYMBOL(submit_bio_wait); +static void bio_endio_cb(struct bio *bio, void *priv) +{ + bio_endio(bio); +} + +/* + * Submit @bio synchronously, or call bio_endio on it if the current process + * is being killed. 
+ */ +int bio_submit_or_kill(struct bio *bio, unsigned int flags) +{ + if ((flags & BLKDEV_ZERO_KILLABLE) && fatal_signal_pending(current)) { + bio_await(bio, NULL, bio_endio_cb); + return -EINTR; + } + + return submit_bio_wait(bio); +} + /** * bdev_rw_virt - synchronously read into / write from kernel mapping * @bdev: block device to access @@ -1545,26 +1564,6 @@ int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, } EXPORT_SYMBOL_GPL(bdev_rw_virt); -static void bio_wait_end_io(struct bio *bio) -{ - complete(bio->bi_private); - bio_put(bio); -} - -/* - * bio_await_chain - ends @bio and waits for every chained bio to complete - */ -void bio_await_chain(struct bio *bio) -{ - DECLARE_COMPLETION_ONSTACK_MAP(done, - bio->bi_bdev->bd_disk->lockdep_map); - - bio->bi_private = &done; - bio->bi_end_io = bio_wait_end_io; - bio_endio(bio); - blk_wait_io(&done); -} - void __bio_advance(struct bio *bio, unsigned bytes) { if (bio_integrity(bio)) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b70096497d38..554c87bb4a86 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -611,6 +612,8 @@ restart: q->root_blkg = NULL; spin_unlock_irq(&q->queue_lock); + + wake_up_var(&q->root_blkg); } static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) @@ -1498,6 +1501,18 @@ int blkcg_init_disk(struct gendisk *disk) struct blkcg_gq *new_blkg, *blkg; bool preloaded; + /* + * If the queue is shared across disk rebind (e.g., SCSI), the + * previous disk's blkcg state is cleaned up asynchronously via + * disk_release() -> blkcg_exit_disk(). Wait for that cleanup to + * finish (indicated by root_blkg becoming NULL) before setting up + * new blkcg state. 
Otherwise, we may overwrite q->root_blkg while + * the old one is still alive, and radix_tree_insert() in + * blkg_create() will fail with -EEXIST because the old entries + * still occupy the same queue id slot in blkcg->blkg_tree. + */ + wait_var_event(&q->root_blkg, !READ_ONCE(q->root_blkg)); + new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) return -ENOMEM; @@ -2022,6 +2037,7 @@ void blkcg_maybe_throttle_current(void) return; out: rcu_read_unlock(); + put_disk(disk); } /** diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c index ea7a0b85a46f..b069c418b6cc 100644 --- a/block/blk-crypto-sysfs.c +++ b/block/blk-crypto-sysfs.c @@ -18,7 +18,7 @@ struct blk_crypto_kobj { struct blk_crypto_attr { struct attribute attr; ssize_t (*show)(struct blk_crypto_profile *profile, - struct blk_crypto_attr *attr, char *page); + const struct blk_crypto_attr *attr, char *page); }; static struct blk_crypto_profile *kobj_to_crypto_profile(struct kobject *kobj) @@ -26,39 +26,39 @@ static struct blk_crypto_profile *kobj_to_crypto_profile(struct kobject *kobj) return container_of(kobj, struct blk_crypto_kobj, kobj)->profile; } -static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr) +static const struct blk_crypto_attr *attr_to_crypto_attr(const struct attribute *attr) { - return container_of(attr, struct blk_crypto_attr, attr); + return container_of_const(attr, struct blk_crypto_attr, attr); } static ssize_t hw_wrapped_keys_show(struct blk_crypto_profile *profile, - struct blk_crypto_attr *attr, char *page) + const struct blk_crypto_attr *attr, char *page) { /* Always show supported, since the file doesn't exist otherwise. 
*/ return sysfs_emit(page, "supported\n"); } static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile, - struct blk_crypto_attr *attr, char *page) + const struct blk_crypto_attr *attr, char *page) { return sysfs_emit(page, "%u\n", 8 * profile->max_dun_bytes_supported); } static ssize_t num_keyslots_show(struct blk_crypto_profile *profile, - struct blk_crypto_attr *attr, char *page) + const struct blk_crypto_attr *attr, char *page) { return sysfs_emit(page, "%u\n", profile->num_slots); } static ssize_t raw_keys_show(struct blk_crypto_profile *profile, - struct blk_crypto_attr *attr, char *page) + const struct blk_crypto_attr *attr, char *page) { /* Always show supported, since the file doesn't exist otherwise. */ return sysfs_emit(page, "supported\n"); } #define BLK_CRYPTO_RO_ATTR(_name) \ - static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name) + static const struct blk_crypto_attr _name##_attr = __ATTR_RO(_name) BLK_CRYPTO_RO_ATTR(hw_wrapped_keys); BLK_CRYPTO_RO_ATTR(max_dun_bits); @@ -66,10 +66,10 @@ BLK_CRYPTO_RO_ATTR(num_keyslots); BLK_CRYPTO_RO_ATTR(raw_keys); static umode_t blk_crypto_is_visible(struct kobject *kobj, - struct attribute *attr, int n) + const struct attribute *attr, int n) { struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); - struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + const struct blk_crypto_attr *a = attr_to_crypto_attr(attr); if (a == &hw_wrapped_keys_attr && !(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) @@ -81,7 +81,7 @@ static umode_t blk_crypto_is_visible(struct kobject *kobj, return 0444; } -static struct attribute *blk_crypto_attrs[] = { +static const struct attribute *const blk_crypto_attrs[] = { &hw_wrapped_keys_attr.attr, &max_dun_bits_attr.attr, &num_keyslots_attr.attr, @@ -90,8 +90,8 @@ static struct attribute *blk_crypto_attrs[] = { }; static const struct attribute_group blk_crypto_attr_group = { - .attrs = blk_crypto_attrs, - .is_visible = blk_crypto_is_visible, 
+ .attrs_const = blk_crypto_attrs, + .is_visible_const = blk_crypto_is_visible, }; /* @@ -99,13 +99,13 @@ static const struct attribute_group blk_crypto_attr_group = { * modes, these are initialized at boot time by blk_crypto_sysfs_init(). */ static struct blk_crypto_attr __blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX]; -static struct attribute *blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX + 1]; +static const struct attribute *blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX + 1]; static umode_t blk_crypto_mode_is_visible(struct kobject *kobj, - struct attribute *attr, int n) + const struct attribute *attr, int n) { struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); - struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + const struct blk_crypto_attr *a = attr_to_crypto_attr(attr); int mode_num = a - __blk_crypto_mode_attrs; if (profile->modes_supported[mode_num]) @@ -114,7 +114,7 @@ static umode_t blk_crypto_mode_is_visible(struct kobject *kobj, } static ssize_t blk_crypto_mode_show(struct blk_crypto_profile *profile, - struct blk_crypto_attr *attr, char *page) + const struct blk_crypto_attr *attr, char *page) { int mode_num = attr - __blk_crypto_mode_attrs; @@ -123,8 +123,8 @@ static ssize_t blk_crypto_mode_show(struct blk_crypto_profile *profile, static const struct attribute_group blk_crypto_modes_attr_group = { .name = "modes", - .attrs = blk_crypto_mode_attrs, - .is_visible = blk_crypto_mode_is_visible, + .attrs_const = blk_crypto_mode_attrs, + .is_visible_const = blk_crypto_mode_is_visible, }; static const struct attribute_group *blk_crypto_attr_groups[] = { @@ -137,7 +137,7 @@ static ssize_t blk_crypto_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); - struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + const struct blk_crypto_attr *a = attr_to_crypto_attr(attr); return a->show(profile, a, page); } diff --git a/block/blk-ia-ranges.c 
b/block/blk-ia-ranges.c index d479f5481b66..7be8b58893c9 100644 --- a/block/blk-ia-ranges.c +++ b/block/blk-ia-ranges.c @@ -30,17 +30,17 @@ struct blk_ia_range_sysfs_entry { ssize_t (*show)(struct blk_independent_access_range *iar, char *buf); }; -static struct blk_ia_range_sysfs_entry blk_ia_range_sector_entry = { +static const struct blk_ia_range_sysfs_entry blk_ia_range_sector_entry = { .attr = { .name = "sector", .mode = 0444 }, .show = blk_ia_range_sector_show, }; -static struct blk_ia_range_sysfs_entry blk_ia_range_nr_sectors_entry = { +static const struct blk_ia_range_sysfs_entry blk_ia_range_nr_sectors_entry = { .attr = { .name = "nr_sectors", .mode = 0444 }, .show = blk_ia_range_nr_sectors_show, }; -static struct attribute *blk_ia_range_attrs[] = { +static const struct attribute *const blk_ia_range_attrs[] = { &blk_ia_range_sector_entry.attr, &blk_ia_range_nr_sectors_entry.attr, NULL, diff --git a/block/blk-iocost.c b/block/blk-iocost.c index d145db61e5c3..0cca88a366dc 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1596,7 +1596,8 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p) +static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p, + u32 *nr_done) { u32 nr_met[2] = { }; u32 nr_missed[2] = { }; @@ -1633,6 +1634,8 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p *rq_wait_pct_p = div64_u64(rq_wait_ns * 100, ioc->period_us * NSEC_PER_USEC); + + *nr_done = nr_met[READ] + nr_met[WRITE] + nr_missed[READ] + nr_missed[WRITE]; } /* was iocg idle this period? */ @@ -2250,12 +2253,12 @@ static void ioc_timer_fn(struct timer_list *timer) u64 usage_us_sum = 0; u32 ppm_rthr; u32 ppm_wthr; - u32 missed_ppm[2], rq_wait_pct; + u32 missed_ppm[2], rq_wait_pct, nr_done; u64 period_vtime; int prev_busy_level; /* how were the latencies during the period? 
*/ - ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); + ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct, &nr_done); /* take care of active iocgs */ spin_lock_irq(&ioc->lock); @@ -2397,9 +2400,17 @@ static void ioc_timer_fn(struct timer_list *timer) * and should increase vtime rate. */ prev_busy_level = ioc->busy_level; - if (rq_wait_pct > RQ_WAIT_BUSY_PCT || - missed_ppm[READ] > ppm_rthr || - missed_ppm[WRITE] > ppm_wthr) { + if (!nr_done && nr_lagging) { + /* + * When there are lagging IOs but no completions, we don't + * know if the IO latency will meet the QoS targets. The + * disk might be saturated or not. We should not reset + * busy_level to 0 (which would prevent vrate from scaling + * up or down), but rather to keep it unchanged. + */ + } else if (rq_wait_pct > RQ_WAIT_BUSY_PCT || + missed_ppm[READ] > ppm_rthr || + missed_ppm[WRITE] > ppm_wthr) { /* clearly missing QoS targets, slow down vrate */ ioc->busy_level = max(ioc->busy_level, 0); ioc->busy_level++; diff --git a/block/blk-lib.c b/block/blk-lib.c index 3213afc7f0d5..688bc67cbf73 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -155,13 +155,7 @@ static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp, &bio, flags, limit); if (bio) { - if ((flags & BLKDEV_ZERO_KILLABLE) && - fatal_signal_pending(current)) { - bio_await_chain(bio); - blk_finish_plug(&plug); - return -EINTR; - } - ret = submit_bio_wait(bio); + ret = bio_submit_or_kill(bio, flags); bio_put(bio); } blk_finish_plug(&plug); @@ -236,13 +230,7 @@ static int blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector, blk_start_plug(&plug); __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp, &bio, flags); if (bio) { - if ((flags & BLKDEV_ZERO_KILLABLE) && - fatal_signal_pending(current)) { - bio_await_chain(bio); - blk_finish_plug(&plug); - return -EINTR; - } - ret = submit_bio_wait(bio); + ret = bio_submit_or_kill(bio, flags); bio_put(bio); } 
blk_finish_plug(&plug); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 28167c9baa55..047ec887456b 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -97,6 +97,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(NO_ELV_SWITCH), QUEUE_FLAG_NAME(QOS_ENABLED), QUEUE_FLAG_NAME(BIO_ISSUE_TIME), + QUEUE_FLAG_NAME(ZONED_QD1_WRITES), }; #undef QUEUE_FLAG_NAME diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 58ec293373c6..895397831ecc 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -53,7 +53,7 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, struct request_queue *q; ssize_t res; - entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); + entry = container_of_const(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); q = hctx->queue; @@ -101,20 +101,20 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) return pos + ret; } -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = { +static const struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = { .attr = {.name = "nr_tags", .mode = 0444 }, .show = blk_mq_hw_sysfs_nr_tags_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = { +static const struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = { .attr = {.name = "nr_reserved_tags", .mode = 0444 }, .show = blk_mq_hw_sysfs_nr_reserved_tags_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { +static const struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .attr = {.name = "cpu_list", .mode = 0444 }, .show = blk_mq_hw_sysfs_cpus_show, }; -static struct attribute *default_hw_ctx_attrs[] = { +static const struct attribute *const default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_nr_tags.attr, &blk_mq_hw_sysfs_nr_reserved_tags.attr, &blk_mq_hw_sysfs_cpus.attr, diff --git a/block/blk-mq.c b/block/blk-mq.c index 
a047faf3b0ec..4c5c16cce4f8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3424,6 +3424,25 @@ EXPORT_SYMBOL_GPL(blk_rq_prep_clone); */ void blk_steal_bios(struct bio_list *list, struct request *rq) { + struct bio *bio; + + for (bio = rq->bio; bio; bio = bio->bi_next) { + if (bio->bi_opf & REQ_POLLED) { + bio->bi_opf &= ~REQ_POLLED; + bio->bi_cookie = BLK_QC_T_NONE; + } + /* + * The alternate request queue that we may end up submitting + * the bio to may be frozen temporarily, in this case REQ_NOWAIT + * will fail the I/O immediately with EAGAIN to the issuer. + * We are not in the issuer context which cannot block. Clear + * the flag to avoid spurious EAGAIN I/O failures. + */ + bio->bi_opf &= ~REQ_NOWAIT; + bio_clear_flag(bio, BIO_QOS_THROTTLED); + bio_clear_flag(bio, BIO_QOS_MERGED); + } + if (rq->bio) { if (list->tail) list->tail->bi_next = rq->bio; diff --git a/block/blk-settings.c b/block/blk-settings.c index dabfab97fbab..78c83817b9d3 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -189,11 +189,11 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) } /* - * The PI generation / validation helpers do not expect intervals to - * straddle multiple bio_vecs. Enforce alignment so that those are + * Some IO controllers can not handle data intervals straddling + * multiple bio_vecs. For those, enforce alignment so that those are * never generated, and that each buffer is aligned as expected. 
*/ - if (bi->csum_type) { + if (!(bi->flags & BLK_SPLIT_INTERVAL_CAPABLE) && bi->csum_type) { lim->dma_alignment = max(lim->dma_alignment, (1U << bi->interval_exp) - 1); } @@ -992,10 +992,14 @@ bool queue_limits_stack_integrity(struct queue_limits *t, if ((ti->flags & BLK_INTEGRITY_REF_TAG) != (bi->flags & BLK_INTEGRITY_REF_TAG)) goto incompatible; + if ((ti->flags & BLK_SPLIT_INTERVAL_CAPABLE) && + !(bi->flags & BLK_SPLIT_INTERVAL_CAPABLE)) + ti->flags &= ~BLK_SPLIT_INTERVAL_CAPABLE; } else { ti->flags = BLK_INTEGRITY_STACKED; ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) | - (bi->flags & BLK_INTEGRITY_REF_TAG); + (bi->flags & BLK_INTEGRITY_REF_TAG) | + (bi->flags & BLK_SPLIT_INTERVAL_CAPABLE); ti->csum_type = bi->csum_type; ti->pi_tuple_size = bi->pi_tuple_size; ti->metadata_size = bi->metadata_size; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 55a1bbfef7d4..f22c1f253eb3 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -390,6 +390,36 @@ static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page) return queue_var_show(disk_nr_zones(disk), page); } +static ssize_t queue_zoned_qd1_writes_show(struct gendisk *disk, char *page) +{ + return queue_var_show(!!blk_queue_zoned_qd1_writes(disk->queue), + page); +} + +static ssize_t queue_zoned_qd1_writes_store(struct gendisk *disk, + const char *page, size_t count) +{ + struct request_queue *q = disk->queue; + unsigned long qd1_writes; + unsigned int memflags; + ssize_t ret; + + ret = queue_var_store(&qd1_writes, page, count); + if (ret < 0) + return ret; + + memflags = blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + if (qd1_writes) + blk_queue_flag_set(QUEUE_FLAG_ZONED_QD1_WRITES, q); + else + blk_queue_flag_clear(QUEUE_FLAG_ZONED_QD1_WRITES, q); + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q, memflags); + + return count; +} + static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page) { return queue_var_show(!!blk_queue_passthrough_stat(disk->queue), 
page); @@ -551,27 +581,27 @@ static int queue_wc_store(struct gendisk *disk, const char *page, return 0; } -#define QUEUE_RO_ENTRY(_prefix, _name) \ -static struct queue_sysfs_entry _prefix##_entry = { \ - .attr = { .name = _name, .mode = 0444 }, \ - .show = _prefix##_show, \ +#define QUEUE_RO_ENTRY(_prefix, _name) \ +static const struct queue_sysfs_entry _prefix##_entry = { \ + .attr = { .name = _name, .mode = 0444 }, \ + .show = _prefix##_show, \ }; -#define QUEUE_RW_ENTRY(_prefix, _name) \ -static struct queue_sysfs_entry _prefix##_entry = { \ - .attr = { .name = _name, .mode = 0644 }, \ - .show = _prefix##_show, \ - .store = _prefix##_store, \ +#define QUEUE_RW_ENTRY(_prefix, _name) \ +static const struct queue_sysfs_entry _prefix##_entry = { \ + .attr = { .name = _name, .mode = 0644 }, \ + .show = _prefix##_show, \ + .store = _prefix##_store, \ }; #define QUEUE_LIM_RO_ENTRY(_prefix, _name) \ -static struct queue_sysfs_entry _prefix##_entry = { \ +static const struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0444 }, \ .show_limit = _prefix##_show, \ } #define QUEUE_LIM_RW_ENTRY(_prefix, _name) \ -static struct queue_sysfs_entry _prefix##_entry = { \ +static const struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0644 }, \ .show_limit = _prefix##_show, \ .store_limit = _prefix##_store, \ @@ -617,6 +647,7 @@ QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); QUEUE_LIM_RO_ENTRY(queue_zoned, "zoned"); +QUEUE_RW_ENTRY(queue_zoned_qd1_writes, "zoned_qd1_writes"); QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones"); QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones"); @@ -634,7 +665,7 @@ QUEUE_LIM_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); QUEUE_LIM_RO_ENTRY(queue_dma_alignment, "dma_alignment"); /* legacy alias for 
logical_block_size: */ -static struct queue_sysfs_entry queue_hw_sector_size_entry = { +static const struct queue_sysfs_entry queue_hw_sector_size_entry = { .attr = {.name = "hw_sector_size", .mode = 0444 }, .show_limit = queue_logical_block_size_show, }; @@ -700,7 +731,7 @@ QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); #endif /* Common attributes for bio-based and request-based queues. */ -static struct attribute *queue_attrs[] = { +static const struct attribute *const queue_attrs[] = { /* * Attributes which are protected with q->limits_lock. */ @@ -754,12 +785,13 @@ static struct attribute *queue_attrs[] = { &queue_nomerges_entry.attr, &queue_poll_entry.attr, &queue_poll_delay_entry.attr, + &queue_zoned_qd1_writes_entry.attr, NULL, }; /* Request-based queue attributes that are not relevant for bio-based queues. */ -static struct attribute *blk_mq_queue_attrs[] = { +static const struct attribute *const blk_mq_queue_attrs[] = { /* * Attributes which require some form of locking other than * q->sysfs_lock. 
@@ -779,14 +811,15 @@ static struct attribute *blk_mq_queue_attrs[] = { NULL, }; -static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, +static umode_t queue_attr_visible(struct kobject *kobj, const struct attribute *attr, int n) { struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; if ((attr == &queue_max_open_zones_entry.attr || - attr == &queue_max_active_zones_entry.attr) && + attr == &queue_max_active_zones_entry.attr || + attr == &queue_zoned_qd1_writes_entry.attr) && !blk_queue_is_zoned(q)) return 0; @@ -794,7 +827,7 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, } static umode_t blk_mq_queue_attr_visible(struct kobject *kobj, - struct attribute *attr, int n) + const struct attribute *attr, int n) { struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; @@ -808,17 +841,17 @@ static umode_t blk_mq_queue_attr_visible(struct kobject *kobj, return attr->mode; } -static struct attribute_group queue_attr_group = { - .attrs = queue_attrs, - .is_visible = queue_attr_visible, +static const struct attribute_group queue_attr_group = { + .attrs_const = queue_attrs, + .is_visible_const = queue_attr_visible, }; -static struct attribute_group blk_mq_queue_attr_group = { - .attrs = blk_mq_queue_attrs, - .is_visible = blk_mq_queue_attr_visible, +static const struct attribute_group blk_mq_queue_attr_group = { + .attrs_const = blk_mq_queue_attrs, + .is_visible_const = blk_mq_queue_attr_visible, }; -#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) +#define to_queue(atr) container_of_const((atr), struct queue_sysfs_entry, attr) static ssize_t queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) @@ -934,6 +967,14 @@ int blk_register_queue(struct gendisk *disk) blk_mq_debugfs_register(q); blk_debugfs_unlock(q, memflags); + /* + * For blk-mq rotational zoned 
devices, default to using QD=1 + * writes. For non-mq rotational zoned devices, the device driver can + * set an appropriate default. + */ + if (queue_is_mq(q) && blk_queue_rot(q) && blk_queue_is_zoned(q)) + blk_queue_flag_set(QUEUE_FLAG_ZONED_QD1_WRITES, q); + ret = disk_register_independent_access_ranges(disk); if (ret) goto out_debugfs_remove; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 33006edfccd4..dcc2438ca16d 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -782,10 +782,11 @@ void wbt_init_enable_default(struct gendisk *disk) return; rwb = wbt_alloc(); - if (WARN_ON_ONCE(!rwb)) + if (!rwb) return; - if (WARN_ON_ONCE(wbt_init(disk, rwb))) { + if (wbt_init(disk, rwb)) { + pr_warn("%s: failed to enable wbt\n", disk->disk_name); wbt_free(rwb); return; } diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 9d1dd6ccfad7..30cad2bb9291 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include @@ -40,6 +42,8 @@ static const char *const zone_cond_name[] = { /* * Per-zone write plug. * @node: hlist_node structure for managing the plug using a hash table. + * @entry: list_head structure for listing the plug in the disk list of active + * zone write plugs. * @bio_list: The list of BIOs that are currently plugged. * @bio_work: Work struct to handle issuing of plugged BIOs * @rcu_head: RCU head to free zone write plugs with an RCU grace period. @@ -62,6 +66,7 @@ static const char *const zone_cond_name[] = { */ struct blk_zone_wplug { struct hlist_node node; + struct list_head entry; struct bio_list bio_list; struct work_struct bio_work; struct rcu_head rcu_head; @@ -99,17 +104,17 @@ static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) * being executed or the zone write plug bio list is not empty. * - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone * write pointer offset and need to update it. 
- * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed - * from the disk hash table and that the initial reference to the zone - * write plug set when the plug was first added to the hash table has been - * dropped. This flag is set when a zone is reset, finished or become full, - * to prevent new references to the zone write plug to be taken for - * newly incoming BIOs. A zone write plug flagged with this flag will be - * freed once all remaining references from BIOs or functions are dropped. + * - BLK_ZONE_WPLUG_DEAD: Indicates that the zone write plug will be + * removed from the disk hash table of zone write plugs when the last + * reference on the zone write plug is dropped. If set, this flag also + * indicates that the initial extra reference on the zone write plug was + * dropped, meaning that the reference count indicates the current number of + * active users (code context or BIOs and requests in flight). This flag is + * set when a zone is reset, finished or becomes full. 
*/ #define BLK_ZONE_WPLUG_PLUGGED (1U << 0) #define BLK_ZONE_WPLUG_NEED_WP_UPDATE (1U << 1) -#define BLK_ZONE_WPLUG_UNHASHED (1U << 2) +#define BLK_ZONE_WPLUG_DEAD (1U << 2) /** * blk_zone_cond_str - Return a zone condition name string @@ -412,20 +417,32 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, return 0; } -static int blkdev_truncate_zone_range(struct block_device *bdev, - blk_mode_t mode, const struct blk_zone_range *zrange) +static int blkdev_reset_zone(struct block_device *bdev, blk_mode_t mode, + struct blk_zone_range *zrange) { loff_t start, end; + int ret = -EINVAL; + inode_lock(bdev->bd_mapping->host); + filemap_invalidate_lock(bdev->bd_mapping); if (zrange->sector + zrange->nr_sectors <= zrange->sector || zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk)) /* Out of range */ - return -EINVAL; + goto out_unlock; start = zrange->sector << SECTOR_SHIFT; end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1; - return truncate_bdev_range(bdev, mode, start, end); + ret = truncate_bdev_range(bdev, mode, start, end); + if (ret) + goto out_unlock; + + ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zrange->sector, + zrange->nr_sectors); +out_unlock: + filemap_invalidate_unlock(bdev->bd_mapping); + inode_unlock(bdev->bd_mapping->host); + return ret; } /* @@ -438,7 +455,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, void __user *argp = (void __user *)arg; struct blk_zone_range zrange; enum req_op op; - int ret; if (!argp) return -EINVAL; @@ -454,15 +470,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, switch (cmd) { case BLKRESETZONE: - op = REQ_OP_ZONE_RESET; - - /* Invalidate the page cache, including dirty pages. 
*/ - inode_lock(bdev->bd_mapping->host); - filemap_invalidate_lock(bdev->bd_mapping); - ret = blkdev_truncate_zone_range(bdev, mode, &zrange); - if (ret) - goto fail; - break; + return blkdev_reset_zone(bdev, mode, &zrange); case BLKOPENZONE: op = REQ_OP_ZONE_OPEN; break; @@ -476,15 +484,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, return -ENOTTY; } - ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors); - -fail: - if (cmd == BLKRESETZONE) { - filemap_invalidate_unlock(bdev->bd_mapping); - inode_unlock(bdev->bd_mapping->host); - } - - return ret; + return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors); } static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone) @@ -492,18 +492,12 @@ static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone) return zone->start + zone->len >= get_capacity(disk); } -static bool disk_zone_is_full(struct gendisk *disk, - unsigned int zno, unsigned int offset_in_zone) -{ - if (zno < disk->nr_zones - 1) - return offset_in_zone >= disk->zone_capacity; - return offset_in_zone >= disk->last_zone_capacity; -} - static bool disk_zone_wplug_is_full(struct gendisk *disk, struct blk_zone_wplug *zwplug) { - return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset); + if (zwplug->zone_no < disk->nr_zones - 1) + return zwplug->wp_offset >= disk->zone_capacity; + return zwplug->wp_offset >= disk->last_zone_capacity; } static bool disk_insert_zone_wplug(struct gendisk *disk, @@ -520,10 +514,11 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, * are racing with other submission context, so we may already have a * zone write plug for the same zone. 
*/ - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) { if (zwplg->zone_no == zwplug->zone_no) { - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, + flags); return false; } } @@ -535,7 +530,7 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, * necessarilly in the active condition. */ zones_cond = rcu_dereference_check(disk->zones_cond, - lockdep_is_held(&disk->zone_wplugs_lock)); + lockdep_is_held(&disk->zone_wplugs_hash_lock)); if (zones_cond) zwplug->cond = zones_cond[zwplug->zone_no]; else @@ -543,7 +538,7 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); atomic_inc(&disk->nr_zone_wplugs); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); return true; } @@ -587,105 +582,76 @@ static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head) mempool_free(zwplug, zwplug->disk->zone_wplugs_pool); } -static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) -{ - if (refcount_dec_and_test(&zwplug->ref)) { - WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); - WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); - WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)); - - call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); - } -} - -static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, - struct blk_zone_wplug *zwplug) -{ - lockdep_assert_held(&zwplug->lock); - - /* If the zone write plug was already removed, we are done. */ - if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) - return false; - - /* If the zone write plug is still plugged, it cannot be removed. 
*/ - if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) - return false; - - /* - * Completions of BIOs with blk_zone_write_plug_bio_endio() may - * happen after handling a request completion with - * blk_zone_write_plug_finish_request() (e.g. with split BIOs - * that are chained). In such case, disk_zone_wplug_unplug_bio() - * should not attempt to remove the zone write plug until all BIO - * completions are seen. Check by looking at the zone write plug - * reference count, which is 2 when the plug is unused (one reference - * taken when the plug was allocated and another reference taken by the - * caller context). - */ - if (refcount_read(&zwplug->ref) > 2) - return false; - - /* We can remove zone write plugs for zones that are empty or full. */ - return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug); -} - -static void disk_remove_zone_wplug(struct gendisk *disk, - struct blk_zone_wplug *zwplug) +static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug) { + struct gendisk *disk = zwplug->disk; unsigned long flags; - /* If the zone write plug was already removed, we have nothing to do. */ - if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) - return; + WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_DEAD)); + WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); + WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); - /* - * Mark the zone write plug as unhashed and drop the extra reference we - * took when the plug was inserted in the hash table. Also update the - * disk zone condition array with the current condition of the zone - * write plug. 
- */ - zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); blk_zone_set_cond(rcu_dereference_check(disk->zones_cond, - lockdep_is_held(&disk->zone_wplugs_lock)), + lockdep_is_held(&disk->zone_wplugs_hash_lock)), zwplug->zone_no, zwplug->cond); hlist_del_init_rcu(&zwplug->node); atomic_dec(&disk->nr_zone_wplugs); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); + + call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); +} + +static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) +{ + if (refcount_dec_and_test(&zwplug->ref)) + disk_free_zone_wplug(zwplug); +} + +/* + * Flag the zone write plug as dead and drop the initial reference we got when + * the zone write plug was added to the hash table. The zone write plug will be + * unhashed when its last reference is dropped. + */ +static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug) +{ + lockdep_assert_held(&zwplug->lock); + + if (!(zwplug->flags & BLK_ZONE_WPLUG_DEAD)) { + zwplug->flags |= BLK_ZONE_WPLUG_DEAD; + disk_put_zone_wplug(zwplug); + } +} + +static bool disk_zone_wplug_submit_bio(struct gendisk *disk, + struct blk_zone_wplug *zwplug); + +static void blk_zone_wplug_bio_work(struct work_struct *work) +{ + struct blk_zone_wplug *zwplug = + container_of(work, struct blk_zone_wplug, bio_work); + + disk_zone_wplug_submit_bio(zwplug->disk, zwplug); + + /* Drop the reference we took in disk_zone_wplug_schedule_work(). */ disk_put_zone_wplug(zwplug); } -static void blk_zone_wplug_bio_work(struct work_struct *work); - /* - * Get a reference on the write plug for the zone containing @sector. - * If the plug does not exist, it is allocated and hashed. - * Return a pointer to the zone write plug with the plug spinlock held. + * Get a zone write plug for the zone containing @sector. 
+ * If the plug does not exist, it is allocated and inserted in the disk hash + * table. */ -static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk, - sector_t sector, gfp_t gfp_mask, - unsigned long *flags) +static struct blk_zone_wplug *disk_get_or_alloc_zone_wplug(struct gendisk *disk, + sector_t sector, gfp_t gfp_mask) { unsigned int zno = disk_zone_no(disk, sector); struct blk_zone_wplug *zwplug; again: zwplug = disk_get_zone_wplug(disk, sector); - if (zwplug) { - /* - * Check that a BIO completion or a zone reset or finish - * operation has not already removed the zone write plug from - * the hash table and dropped its reference count. In such case, - * we need to get a new plug so start over from the beginning. - */ - spin_lock_irqsave(&zwplug->lock, *flags); - if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { - spin_unlock_irqrestore(&zwplug->lock, *flags); - disk_put_zone_wplug(zwplug); - goto again; - } + if (zwplug) return zwplug; - } /* * Allocate and initialize a zone write plug with an extra reference @@ -704,17 +670,15 @@ again: zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector); bio_list_init(&zwplug->bio_list); INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work); + INIT_LIST_HEAD(&zwplug->entry); zwplug->disk = disk; - spin_lock_irqsave(&zwplug->lock, *flags); - /* * Insert the new zone write plug in the hash table. This can fail only * if another context already inserted a plug. Retry from the beginning * in such case. 
*/ if (!disk_insert_zone_wplug(disk, zwplug)) { - spin_unlock_irqrestore(&zwplug->lock, *flags); mempool_free(zwplug, disk->zone_wplugs_pool); goto again; } @@ -739,6 +703,7 @@ static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug, */ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) { + struct gendisk *disk = zwplug->disk; struct bio *bio; lockdep_assert_held(&zwplug->lock); @@ -752,6 +717,20 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) blk_zone_wplug_bio_io_error(zwplug, bio); zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + + /* + * If we are using the per disk zone write plugs worker thread, remove + * the zone write plug from the work list and drop the reference we + * took when the zone write plug was added to that list. + */ + if (blk_queue_zoned_qd1_writes(disk->queue)) { + spin_lock(&disk->zone_wplugs_list_lock); + if (!list_empty(&zwplug->entry)) { + list_del_init(&zwplug->entry); + disk_put_zone_wplug(zwplug); + } + spin_unlock(&disk->zone_wplugs_list_lock); + } } /* @@ -788,14 +767,8 @@ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, disk_zone_wplug_update_cond(disk, zwplug); disk_zone_wplug_abort(zwplug); - - /* - * The zone write plug now has no BIO plugged: remove it from the - * hash table so that it cannot be seen. The plug will be freed - * when the last reference is dropped. 
- */ - if (disk_should_remove_zone_wplug(disk, zwplug)) - disk_remove_zone_wplug(disk, zwplug); + if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug)) + disk_mark_zone_wplug_dead(zwplug); } static unsigned int blk_zone_wp_offset(struct blk_zone *zone) @@ -1192,19 +1165,24 @@ void blk_zone_mgmt_bio_endio(struct bio *bio) } } -static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, - struct blk_zone_wplug *zwplug) +static void disk_zone_wplug_schedule_work(struct gendisk *disk, + struct blk_zone_wplug *zwplug) { lockdep_assert_held(&zwplug->lock); /* - * Take a reference on the zone write plug and schedule the submission - * of the next plugged BIO. blk_zone_wplug_bio_work() will release the - * reference we take here. + * Schedule the submission of the next plugged BIO. Taking a reference + * to the zone write plug is required as the bio_work belongs to the + * plug, and thus we must ensure that the write plug does not go away + * while the work is being scheduled but has not run yet. + * blk_zone_wplug_bio_work() will release the reference we take here, + * and we also drop this reference if the work is already scheduled. */ WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); + WARN_ON_ONCE(blk_queue_zoned_qd1_writes(disk->queue)); refcount_inc(&zwplug->ref); - queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); + if (!queue_work(disk->zone_wplugs_wq, &zwplug->bio_work)) + disk_put_zone_wplug(zwplug); } static inline void disk_zone_wplug_add_bio(struct gendisk *disk, @@ -1241,6 +1219,22 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk, bio_list_add(&zwplug->bio_list, bio); trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no, bio->bi_iter.bi_sector, bio_sectors(bio)); + + /* + * If we are using the disk zone write plugs worker instead of the per + * zone write plug BIO work, add the zone write plug to the work list + * if it is not already there. 
Make sure to also get an extra reference + * on the zone write plug so that it does not go away until it is + * removed from the work list. + */ + if (blk_queue_zoned_qd1_writes(disk->queue)) { + spin_lock(&disk->zone_wplugs_list_lock); + if (list_empty(&zwplug->entry)) { + list_add_tail(&zwplug->entry, &disk->zone_wplugs_list); + refcount_inc(&zwplug->ref); + } + spin_unlock(&disk->zone_wplugs_list_lock); + } } /* @@ -1438,7 +1432,7 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) if (bio->bi_opf & REQ_NOWAIT) gfp_mask = GFP_NOWAIT; - zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags); + zwplug = disk_get_or_alloc_zone_wplug(disk, sector, gfp_mask); if (!zwplug) { if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); @@ -1447,6 +1441,21 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) return true; } + spin_lock_irqsave(&zwplug->lock, flags); + + /* + * If we got a zone write plug marked as dead, then the user is issuing + * writes to a full zone, or without synchronizing with zone reset or + * zone finish operations. In such case, fail the BIO to signal this + * invalid usage. + */ + if (zwplug->flags & BLK_ZONE_WPLUG_DEAD) { + spin_unlock_irqrestore(&zwplug->lock, flags); + disk_put_zone_wplug(zwplug); + bio_io_error(bio); + return true; + } + /* Indicate that this BIO is being handled using zone write plugging. */ bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); @@ -1459,6 +1468,13 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) goto queue_bio; } + /* + * For rotational devices, we will use the gendisk zone write plugs + * work instead of the per zone write plug BIO work, so queue the BIO. + */ + if (blk_queue_zoned_qd1_writes(disk->queue)) + goto queue_bio; + /* If the zone is already plugged, add the BIO to the BIO plug list. 
*/ if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) goto queue_bio; @@ -1481,7 +1497,10 @@ queue_bio: if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) { zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; - disk_zone_wplug_schedule_bio_work(disk, zwplug); + if (blk_queue_zoned_qd1_writes(disk->queue)) + wake_up_process(disk->zone_wplugs_worker); + else + disk_zone_wplug_schedule_work(disk, zwplug); } spin_unlock_irqrestore(&zwplug->lock, flags); @@ -1527,7 +1546,7 @@ static void blk_zone_wplug_handle_native_zone_append(struct bio *bio) disk->disk_name, zwplug->zone_no); disk_zone_wplug_abort(zwplug); } - disk_remove_zone_wplug(disk, zwplug); + disk_mark_zone_wplug_dead(zwplug); spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); @@ -1622,21 +1641,21 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk, spin_lock_irqsave(&zwplug->lock, flags); - /* Schedule submission of the next plugged BIO if we have one. */ - if (!bio_list_empty(&zwplug->bio_list)) { - disk_zone_wplug_schedule_bio_work(disk, zwplug); - spin_unlock_irqrestore(&zwplug->lock, flags); - return; - } - - zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; - /* - * If the zone is full (it was fully written or finished, or empty - * (it was reset), remove its zone write plug from the hash table. + * For rotational devices, signal the BIO completion to the zone write + * plug work. Otherwise, schedule submission of the next plugged BIO + * if we have one. 
*/ - if (disk_should_remove_zone_wplug(disk, zwplug)) - disk_remove_zone_wplug(disk, zwplug); + if (bio_list_empty(&zwplug->bio_list)) + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + + if (blk_queue_zoned_qd1_writes(disk->queue)) + complete(&disk->zone_wplugs_worker_bio_done); + else if (!bio_list_empty(&zwplug->bio_list)) + disk_zone_wplug_schedule_work(disk, zwplug); + + if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug)) + disk_mark_zone_wplug_dead(zwplug); spin_unlock_irqrestore(&zwplug->lock, flags); } @@ -1727,10 +1746,9 @@ void blk_zone_write_plug_finish_request(struct request *req) disk_put_zone_wplug(zwplug); } -static void blk_zone_wplug_bio_work(struct work_struct *work) +static bool disk_zone_wplug_submit_bio(struct gendisk *disk, + struct blk_zone_wplug *zwplug) { - struct blk_zone_wplug *zwplug = - container_of(work, struct blk_zone_wplug, bio_work); struct block_device *bdev; unsigned long flags; struct bio *bio; @@ -1746,7 +1764,7 @@ again: if (!bio) { zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; spin_unlock_irqrestore(&zwplug->lock, flags); - goto put_zwplug; + return false; } trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no, @@ -1760,14 +1778,15 @@ again: goto again; } - bdev = bio->bi_bdev; - /* * blk-mq devices will reuse the extra reference on the request queue * usage counter we took when the BIO was plugged, but the submission * path for BIO-based devices will not do that. So drop this extra * reference here. */ + if (blk_queue_zoned_qd1_writes(disk->queue)) + reinit_completion(&disk->zone_wplugs_worker_bio_done); + bdev = bio->bi_bdev; if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) { bdev->bd_disk->fops->submit_bio(bio); blk_queue_exit(bdev->bd_disk->queue); @@ -1775,14 +1794,78 @@ again: blk_mq_submit_bio(bio); } -put_zwplug: - /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). 
*/ - disk_put_zone_wplug(zwplug); + return true; +} + +static struct blk_zone_wplug *disk_get_zone_wplugs_work(struct gendisk *disk) +{ + struct blk_zone_wplug *zwplug; + + spin_lock_irq(&disk->zone_wplugs_list_lock); + zwplug = list_first_entry_or_null(&disk->zone_wplugs_list, + struct blk_zone_wplug, entry); + if (zwplug) + list_del_init(&zwplug->entry); + spin_unlock_irq(&disk->zone_wplugs_list_lock); + + return zwplug; +} + +static int disk_zone_wplugs_worker(void *data) +{ + struct gendisk *disk = data; + struct blk_zone_wplug *zwplug; + unsigned int noio_flag; + + noio_flag = memalloc_noio_save(); + set_user_nice(current, MIN_NICE); + set_freezable(); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); + + zwplug = disk_get_zone_wplugs_work(disk); + if (zwplug) { + /* + * Process all BIOs of this zone write plug and then + * drop the reference we took when adding the zone write + * plug to the active list. + */ + set_current_state(TASK_RUNNING); + while (disk_zone_wplug_submit_bio(disk, zwplug)) + blk_wait_io(&disk->zone_wplugs_worker_bio_done); + disk_put_zone_wplug(zwplug); + continue; + } + + /* + * Only sleep if nothing sets the state to running. Else check + * for zone write plugs work again as a newly submitted BIO + * might have added a zone write plug to the work list. 
+ */ + if (get_current_state() == TASK_RUNNING) { + try_to_freeze(); + } else { + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); + break; + } + schedule(); + } + } + + WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list)); + memalloc_noio_restore(noio_flag); + + return 0; } void disk_init_zone_resources(struct gendisk *disk) { - spin_lock_init(&disk->zone_wplugs_lock); + spin_lock_init(&disk->zone_wplugs_hash_lock); + spin_lock_init(&disk->zone_wplugs_list_lock); + INIT_LIST_HEAD(&disk->zone_wplugs_list); + init_completion(&disk->zone_wplugs_worker_bio_done); } /* @@ -1798,6 +1881,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk, unsigned int pool_size) { unsigned int i; + int ret = -ENOMEM; atomic_set(&disk->nr_zone_wplugs, 0); disk->zone_wplugs_hash_bits = @@ -1823,8 +1907,21 @@ static int disk_alloc_zone_resources(struct gendisk *disk, if (!disk->zone_wplugs_wq) goto destroy_pool; + disk->zone_wplugs_worker = + kthread_create(disk_zone_wplugs_worker, disk, + "%s_zwplugs_worker", disk->disk_name); + if (IS_ERR(disk->zone_wplugs_worker)) { + ret = PTR_ERR(disk->zone_wplugs_worker); + disk->zone_wplugs_worker = NULL; + goto destroy_wq; + } + wake_up_process(disk->zone_wplugs_worker); + return 0; +destroy_wq: + destroy_workqueue(disk->zone_wplugs_wq); + disk->zone_wplugs_wq = NULL; destroy_pool: mempool_destroy(disk->zone_wplugs_pool); disk->zone_wplugs_pool = NULL; @@ -1832,7 +1929,7 @@ free_hash: kfree(disk->zone_wplugs_hash); disk->zone_wplugs_hash = NULL; disk->zone_wplugs_hash_bits = 0; - return -ENOMEM; + return ret; } static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) @@ -1848,9 +1945,9 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) while (!hlist_empty(&disk->zone_wplugs_hash[i])) { zwplug = hlist_entry(disk->zone_wplugs_hash[i].first, struct blk_zone_wplug, node); - refcount_inc(&zwplug->ref); - disk_remove_zone_wplug(disk, zwplug); - disk_put_zone_wplug(zwplug); + 
spin_lock_irq(&zwplug->lock); + disk_mark_zone_wplug_dead(zwplug); + spin_unlock_irq(&zwplug->lock); } } @@ -1872,16 +1969,20 @@ static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond) { unsigned long flags; - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond, - lockdep_is_held(&disk->zone_wplugs_lock)); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + lockdep_is_held(&disk->zone_wplugs_hash_lock)); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); kfree_rcu_mightsleep(zones_cond); } void disk_free_zone_resources(struct gendisk *disk) { + if (disk->zone_wplugs_worker) + kthread_stop(disk->zone_wplugs_worker); + WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list)); + if (disk->zone_wplugs_wq) { destroy_workqueue(disk->zone_wplugs_wq); disk->zone_wplugs_wq = NULL; @@ -1910,6 +2011,7 @@ static int disk_revalidate_zone_resources(struct gendisk *disk, { struct queue_limits *lim = &disk->queue->limits; unsigned int pool_size; + int ret = 0; args->disk = disk; args->nr_zones = @@ -1932,10 +2034,13 @@ static int disk_revalidate_zone_resources(struct gendisk *disk, pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones); - if (!disk->zone_wplugs_hash) - return disk_alloc_zone_resources(disk, pool_size); + if (!disk->zone_wplugs_hash) { + ret = disk_alloc_zone_resources(disk, pool_size); + if (ret) + kfree(args->zones_cond); + } - return 0; + return ret; } /* @@ -1967,6 +2072,7 @@ static int disk_update_zone_resources(struct gendisk *disk, disk->zone_capacity = args->zone_capacity; disk->last_zone_capacity = args->last_zone_capacity; disk_set_zones_cond_array(disk, args->zones_cond); + args->zones_cond = NULL; /* * Some devices can advertise zone resource limits that are larger than @@ -2078,7 +2184,6 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, struct gendisk *disk = 
args->disk; struct blk_zone_wplug *zwplug; unsigned int wp_offset; - unsigned long flags; /* * Remember the capacity of the first sequential zone and check @@ -2108,10 +2213,9 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, if (!wp_offset || wp_offset >= zone->capacity) return 0; - zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags); + zwplug = disk_get_or_alloc_zone_wplug(disk, zone->wp, GFP_NOIO); if (!zwplug) return -ENOMEM; - spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); return 0; @@ -2249,21 +2353,30 @@ int blk_revalidate_disk_zones(struct gendisk *disk) } memalloc_noio_restore(noio_flag); + if (ret <= 0) + goto free_resources; + /* * If zones where reported, make sure that the entire disk capacity * has been checked. */ - if (ret > 0 && args.sector != capacity) { + if (args.sector != capacity) { pr_warn("%s: Missing zones from sector %llu\n", disk->disk_name, args.sector); ret = -ENODEV; + goto free_resources; } - if (ret > 0) - return disk_update_zone_resources(disk, &args); + ret = disk_update_zone_resources(disk, &args); + if (ret) + goto free_resources; + return 0; + +free_resources: pr_warn("%s: failed to revalidate zones\n", disk->disk_name); + kfree(args.zones_cond); memflags = blk_mq_freeze_queue(q); disk_free_zone_resources(disk); blk_mq_unfreeze_queue(q, memflags); diff --git a/block/blk.h b/block/blk.h index c5b2115b9ea4..ec4674cdf2ea 100644 --- a/block/blk.h +++ b/block/blk.h @@ -55,7 +55,7 @@ bool __blk_freeze_queue_start(struct request_queue *q, struct task_struct *owner); int __bio_queue_enter(struct request_queue *q, struct bio *bio); void submit_bio_noacct_nocheck(struct bio *bio, bool split); -void bio_await_chain(struct bio *bio); +int bio_submit_or_kill(struct bio *bio, unsigned int flags); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) { @@ -108,11 +108,6 @@ static inline void blk_wait_io(struct completion *done) struct block_device 
*blkdev_get_no_open(dev_t dev, bool autoload); void blkdev_put_no_open(struct block_device *bdev); -#define BIO_INLINE_VECS 4 -struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, - gfp_t gfp_mask); -void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); - bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, struct page *page, unsigned len, unsigned offset); diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 20cd0ef3c394..fdb4b290ca68 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -393,7 +393,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); - bset->bd = bsg_register_queue(q, dev, name, bsg_transport_sg_io_fn); + bset->bd = bsg_register_queue(q, dev, name, bsg_transport_sg_io_fn, NULL); if (IS_ERR(bset->bd)) { ret = PTR_ERR(bset->bd); goto out_cleanup_queue; diff --git a/block/bsg.c b/block/bsg.c index e0af6206ed28..82aaf3cee582 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -28,6 +29,7 @@ struct bsg_device { unsigned int timeout; unsigned int reserved_size; bsg_sg_io_fn *sg_io_fn; + bsg_uring_cmd_fn *uring_cmd_fn; }; static inline struct bsg_device *to_bsg_device(struct inode *inode) @@ -158,11 +160,38 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } } +static int bsg_check_uring_features(unsigned int issue_flags) +{ + /* BSG passthrough requires big SQE/CQE support */ + if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) != + (IO_URING_F_SQE128|IO_URING_F_CQE32)) + return -EOPNOTSUPP; + return 0; +} + +static int bsg_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) +{ + struct bsg_device *bd = to_bsg_device(file_inode(ioucmd->file)); + bool open_for_write = ioucmd->file->f_mode & FMODE_WRITE; + struct request_queue *q = bd->queue; + int ret; + + ret = bsg_check_uring_features(issue_flags); + if 
(ret) + return ret; + + if (!bd->uring_cmd_fn) + return -EOPNOTSUPP; + + return bd->uring_cmd_fn(q, ioucmd, issue_flags, open_for_write); +} + static const struct file_operations bsg_fops = { .open = bsg_open, .release = bsg_release, .unlocked_ioctl = bsg_ioctl, .compat_ioctl = compat_ptr_ioctl, + .uring_cmd = bsg_uring_cmd, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -187,7 +216,8 @@ void bsg_unregister_queue(struct bsg_device *bd) EXPORT_SYMBOL_GPL(bsg_unregister_queue); struct bsg_device *bsg_register_queue(struct request_queue *q, - struct device *parent, const char *name, bsg_sg_io_fn *sg_io_fn) + struct device *parent, const char *name, bsg_sg_io_fn *sg_io_fn, + bsg_uring_cmd_fn *uring_cmd_fn) { struct bsg_device *bd; int ret; @@ -199,6 +229,7 @@ struct bsg_device *bsg_register_queue(struct request_queue *q, bd->reserved_size = INT_MAX; bd->queue = q; bd->sg_io_fn = sg_io_fn; + bd->uring_cmd_fn = uring_cmd_fn; ret = ida_alloc_max(&bsg_minor_ida, BSG_MAX_DEVS - 1, GFP_KERNEL); if (ret < 0) { diff --git a/block/disk-events.c b/block/disk-events.c index 9f9f9f8a2d6b..074731ecc3d2 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -290,13 +290,14 @@ EXPORT_SYMBOL(disk_check_media_change); * Should be called when the media changes for @disk. Generates a uevent * and attempts to free all dentries and inodes and invalidates all block * device page cache entries in that case. + * + * Callers that need a partition re-scan should arrange for one explicitly. 
*/ void disk_force_media_change(struct gendisk *disk) { disk_event_uevent(disk, DISK_EVENT_MEDIA_CHANGE); inc_diskseq(disk); bdev_mark_dead(disk->part0, true); - set_bit(GD_NEED_PART_SCAN, &disk->state); } EXPORT_SYMBOL_GPL(disk_force_media_change); diff --git a/block/ioctl.c b/block/ioctl.c index 0b04661ac809..fc3be0549aa7 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -153,13 +153,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, nr_sects = len >> SECTOR_SHIFT; blk_start_plug(&plug); - while (1) { - if (fatal_signal_pending(current)) { - if (prev) - bio_await_chain(prev); - err = -EINTR; - goto out_unplug; - } + while (!fatal_signal_pending(current)) { bio = blk_alloc_discard_bio(bdev, §or, &nr_sects, GFP_KERNEL); if (!bio) @@ -167,12 +161,11 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, prev = bio_chain_and_submit(prev, bio); } if (prev) { - err = submit_bio_wait(prev); + err = bio_submit_or_kill(prev, BLKDEV_ZERO_KILLABLE); if (err == -EOPNOTSUPP) err = 0; bio_put(prev); } -out_unplug: blk_finish_plug(&plug); fail: filemap_invalidate_unlock(bdev->bd_mapping); diff --git a/block/opal_proto.h b/block/opal_proto.h index d247a457bf6e..7c24247aa186 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -19,6 +19,7 @@ enum { TCG_SECP_00 = 0, TCG_SECP_01, + TCG_SECP_02, }; /* @@ -125,6 +126,7 @@ enum opal_uid { OPAL_LOCKING_INFO_TABLE, OPAL_ENTERPRISE_LOCKING_INFO_TABLE, OPAL_DATASTORE, + OPAL_LOCKING_TABLE, /* C_PIN_TABLE object ID's */ OPAL_C_PIN_MSID, OPAL_C_PIN_SID, @@ -154,6 +156,7 @@ enum opal_method { OPAL_AUTHENTICATE, OPAL_RANDOM, OPAL_ERASE, + OPAL_REACTIVATE, }; enum opal_token { @@ -224,6 +227,8 @@ enum opal_lockingstate { enum opal_parameter { OPAL_SUM_SET_LIST = 0x060000, + OPAL_SUM_RANGE_POLICY = 0x060001, + OPAL_SUM_ADMIN1_PIN = 0x060002, }; enum opal_revertlsp { @@ -269,6 +274,25 @@ struct opal_header { struct opal_data_subpacket subpkt; }; +/* + * 
TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Section: 3.3.4.7.5 STACK_RESET + */ +#define OPAL_STACK_RESET 0x0002 + +struct opal_stack_reset { + u8 extendedComID[4]; + __be32 request_code; +}; + +struct opal_stack_reset_response { + u8 extendedComID[4]; + __be32 request_code; + u8 reserved0[2]; + __be16 data_length; + __be32 response; +}; + #define FC_TPER 0x0001 #define FC_LOCKING 0x0002 #define FC_GEOMETRY 0x0003 diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c index d2fc122d7426..9f7389f174d0 100644 --- a/block/partitions/acorn.c +++ b/block/partitions/acorn.c @@ -40,9 +40,7 @@ adfs_partition(struct parsed_partitions *state, char *name, char *data, (le32_to_cpu(dr->disc_size) >> 9); if (name) { - strlcat(state->pp_buf, " [", PAGE_SIZE); - strlcat(state->pp_buf, name, PAGE_SIZE); - strlcat(state->pp_buf, "]", PAGE_SIZE); + seq_buf_printf(&state->pp_buf, " [%s]", name); } put_partition(state, slot, first_sector, nr_sects); return dr; @@ -78,14 +76,14 @@ static int riscix_partition(struct parsed_partitions *state, if (!rr) return -1; - strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " [RISCiX]"); if (rr->magic == RISCIX_MAGIC) { unsigned long size = nr_sects > 2 ? 
2 : nr_sects; int part; - strlcat(state->pp_buf, " <", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " <"); put_partition(state, slot++, first_sect, size); for (part = 0; part < 8; part++) { @@ -94,13 +92,11 @@ static int riscix_partition(struct parsed_partitions *state, put_partition(state, slot++, le32_to_cpu(rr->part[part].start), le32_to_cpu(rr->part[part].length)); - strlcat(state->pp_buf, "(", PAGE_SIZE); - strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE); - strlcat(state->pp_buf, ")", PAGE_SIZE); + seq_buf_printf(&state->pp_buf, "(%s)", rr->part[part].name); } } - strlcat(state->pp_buf, " >\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " >\n"); } else { put_partition(state, slot++, first_sect, nr_sects); } @@ -130,7 +126,7 @@ static int linux_partition(struct parsed_partitions *state, struct linux_part *linuxp; unsigned long size = nr_sects > 2 ? 2 : nr_sects; - strlcat(state->pp_buf, " [Linux]", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " [Linux]"); put_partition(state, slot++, first_sect, size); @@ -138,7 +134,7 @@ static int linux_partition(struct parsed_partitions *state, if (!linuxp) return -1; - strlcat(state->pp_buf, " <", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " <"); while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) || linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) { if (slot == state->limit) @@ -148,7 +144,7 @@ static int linux_partition(struct parsed_partitions *state, le32_to_cpu(linuxp->nr_sects)); linuxp ++; } - strlcat(state->pp_buf, " >", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " >"); put_dev_sector(sect); return slot; @@ -293,7 +289,7 @@ int adfspart_check_ADFS(struct parsed_partitions *state) break; } } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } #endif @@ -366,7 +362,7 @@ int adfspart_check_ICS(struct parsed_partitions *state) return 0; } - strlcat(state->pp_buf, " [ICS]", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " [ICS]"); for (slot = 1, p = (const struct ics_part 
*)data; p->size; p++) { u32 start = le32_to_cpu(p->start); @@ -400,7 +396,7 @@ int adfspart_check_ICS(struct parsed_partitions *state) } put_dev_sector(sect); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } #endif @@ -460,7 +456,7 @@ int adfspart_check_POWERTEC(struct parsed_partitions *state) return 0; } - strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " [POWERTEC]"); for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) { u32 start = le32_to_cpu(p->start); @@ -471,7 +467,7 @@ int adfspart_check_POWERTEC(struct parsed_partitions *state) } put_dev_sector(sect); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } #endif @@ -542,7 +538,7 @@ int adfspart_check_EESOX(struct parsed_partitions *state) size = get_capacity(state->disk); put_partition(state, slot++, start, size - start); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); } return i ? 
1 : 0; diff --git a/block/partitions/aix.c b/block/partitions/aix.c index a886cefbefbb..29b8f4cebb63 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c @@ -173,24 +173,22 @@ int aix_partition(struct parsed_partitions *state) if (d) { struct lvm_rec *p = (struct lvm_rec *)d; u16 lvm_version = be16_to_cpu(p->version); - char tmp[64]; if (lvm_version == 1) { int pp_size_log2 = be16_to_cpu(p->pp_size); pp_bytes_size = 1 << pp_size_log2; pp_blocks_size = pp_bytes_size / 512; - snprintf(tmp, sizeof(tmp), - " AIX LVM header version %u found\n", - lvm_version); + seq_buf_printf(&state->pp_buf, + " AIX LVM header version %u found\n", + lvm_version); vgda_len = be32_to_cpu(p->vgda_len); vgda_sector = be32_to_cpu(p->vgda_psn[0]); } else { - snprintf(tmp, sizeof(tmp), - " unsupported AIX LVM version %d found\n", - lvm_version); + seq_buf_printf(&state->pp_buf, + " unsupported AIX LVM version %d found\n", + lvm_version); } - strlcat(state->pp_buf, tmp, PAGE_SIZE); put_dev_sector(sect); } if (vgda_sector && (d = read_part_sector(state, vgda_sector, §))) { @@ -251,14 +249,11 @@ int aix_partition(struct parsed_partitions *state) continue; } if (lp_ix == lvip[lv_ix].pps_per_lv) { - char tmp[70]; - put_partition(state, lv_ix + 1, (i + 1 - lp_ix) * pp_blocks_size + psn_part1, lvip[lv_ix].pps_per_lv * pp_blocks_size); - snprintf(tmp, sizeof(tmp), " <%s>\n", - n[lv_ix].name); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, " <%s>\n", + n[lv_ix].name); lvip[lv_ix].lv_is_contiguous = 1; ret = 1; next_lp_ix = 1; diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c index 506921095412..8325046a14eb 100644 --- a/block/partitions/amiga.c +++ b/block/partitions/amiga.c @@ -81,13 +81,8 @@ int amiga_partition(struct parsed_partitions *state) /* blksize is blocks per 512 byte standard block */ blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512; - { - char tmp[7 + 10 + 1 + 1]; - - /* Be more informative */ - snprintf(tmp, sizeof(tmp), " RDSK 
(%d)", blksize * 512); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } + /* Be more informative */ + seq_buf_printf(&state->pp_buf, " RDSK (%d)", blksize * 512); blk = be32_to_cpu(rdb->rdb_PartitionList); put_dev_sector(sect); for (part = 1; (s32) blk>0 && part<=16; part++, put_dev_sector(sect)) { @@ -179,27 +174,27 @@ int amiga_partition(struct parsed_partitions *state) { /* Be even more informative to aid mounting */ char dostype[4]; - char tmp[42]; __be32 *dt = (__be32 *)dostype; *dt = pb->pb_Environment[16]; if (dostype[3] < ' ') - snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)", - dostype[0], dostype[1], - dostype[2], dostype[3] + '@' ); + seq_buf_printf(&state->pp_buf, + " (%c%c%c^%c)", + dostype[0], dostype[1], + dostype[2], + dostype[3] + '@'); else - snprintf(tmp, sizeof(tmp), " (%c%c%c%c)", - dostype[0], dostype[1], - dostype[2], dostype[3]); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - snprintf(tmp, sizeof(tmp), "(res %d spb %d)", - be32_to_cpu(pb->pb_Environment[6]), - be32_to_cpu(pb->pb_Environment[4])); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, + " (%c%c%c%c)", + dostype[0], dostype[1], + dostype[2], dostype[3]); + seq_buf_printf(&state->pp_buf, "(res %d spb %d)", + be32_to_cpu(pb->pb_Environment[6]), + be32_to_cpu(pb->pb_Environment[4])); } res = 1; } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); rdb_done: return res; diff --git a/block/partitions/atari.c b/block/partitions/atari.c index 9655c728262a..2438d1448f38 100644 --- a/block/partitions/atari.c +++ b/block/partitions/atari.c @@ -70,7 +70,7 @@ int atari_partition(struct parsed_partitions *state) } pi = &rs->part[0]; - strlcat(state->pp_buf, " AHDI", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " AHDI"); for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) { struct rootsector *xrs; Sector sect2; @@ -89,7 +89,7 @@ int atari_partition(struct parsed_partitions *state) #ifdef ICD_PARTS part_fmt = 1; #endif - 
strlcat(state->pp_buf, " XGM<", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " XGM<"); partsect = extensect = be32_to_cpu(pi->st); while (1) { xrs = read_part_sector(state, partsect, §2); @@ -128,14 +128,14 @@ int atari_partition(struct parsed_partitions *state) break; } } - strlcat(state->pp_buf, " >", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " >"); } #ifdef ICD_PARTS if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */ pi = &rs->icdpart[0]; /* sanity check: no ICD format if first partition invalid */ if (OK_id(pi->id)) { - strlcat(state->pp_buf, " ICD<", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " ICD<"); for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) { /* accept only GEM,BGM,RAW,LNX,SWP partitions */ if (!((pi->flg & 1) && OK_id(pi->id))) @@ -144,13 +144,13 @@ int atari_partition(struct parsed_partitions *state) be32_to_cpu(pi->st), be32_to_cpu(pi->siz)); } - strlcat(state->pp_buf, " >", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " >"); } } #endif put_dev_sector(sect); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } diff --git a/block/partitions/check.h b/block/partitions/check.h index e5c1c61eb353..b0997467b61a 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include +#include #include "../blk.h" /* @@ -20,7 +21,7 @@ struct parsed_partitions { int next; int limit; bool access_beyond_eod; - char *pp_buf; + struct seq_buf pp_buf; }; typedef struct { @@ -37,12 +38,9 @@ static inline void put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) { if (n < p->limit) { - char tmp[1 + BDEVNAME_SIZE + 10 + 1]; - p->parts[n].from = from; p->parts[n].size = size; - snprintf(tmp, sizeof(tmp), " %s%d", p->name, n); - strlcat(p->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&p->pp_buf, " %s%d", p->name, n); } } diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index 
a2b1870c3fd4..4fd52ed154b4 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -229,7 +229,6 @@ static int add_part(int slot, struct cmdline_subpart *subpart, struct parsed_partitions *state) { struct partition_meta_info *info; - char tmp[sizeof(info->volname) + 4]; if (slot >= state->limit) return 1; @@ -244,8 +243,7 @@ static int add_part(int slot, struct cmdline_subpart *subpart, strscpy(info->volname, subpart->name, sizeof(info->volname)); - snprintf(tmp, sizeof(tmp), "(%s)", info->volname); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, "(%s)", info->volname); state->parts[slot].has_info = true; @@ -379,7 +377,7 @@ int cmdline_partition(struct parsed_partitions *state) cmdline_parts_set(parts, disk_size, state); cmdline_parts_verifier(1, state); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } diff --git a/block/partitions/core.c b/block/partitions/core.c index 740228750aaf..5d5332ce586b 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -123,16 +124,16 @@ static struct parsed_partitions *check_partition(struct gendisk *hd) state = allocate_partitions(hd); if (!state) return NULL; - state->pp_buf = (char *)__get_free_page(GFP_KERNEL); - if (!state->pp_buf) { + state->pp_buf.buffer = (char *)__get_free_page(GFP_KERNEL); + if (!state->pp_buf.buffer) { free_partitions(state); return NULL; } - state->pp_buf[0] = '\0'; + seq_buf_init(&state->pp_buf, state->pp_buf.buffer, PAGE_SIZE); state->disk = hd; strscpy(state->name, hd->disk_name); - snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); + seq_buf_printf(&state->pp_buf, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) sprintf(state->name, "p"); @@ -151,9 +152,9 @@ static struct parsed_partitions *check_partition(struct gendisk *hd) } if (res > 0) { - printk(KERN_INFO "%s", state->pp_buf); + 
printk(KERN_INFO "%s", seq_buf_str(&state->pp_buf)); - free_page((unsigned long)state->pp_buf); + free_page((unsigned long)state->pp_buf.buffer); return state; } if (state->access_beyond_eod) @@ -164,12 +165,12 @@ static struct parsed_partitions *check_partition(struct gendisk *hd) if (err) res = err; if (res) { - strlcat(state->pp_buf, - " unable to read partition table\n", PAGE_SIZE); - printk(KERN_INFO "%s", state->pp_buf); + seq_buf_puts(&state->pp_buf, + " unable to read partition table\n"); + printk(KERN_INFO "%s", seq_buf_str(&state->pp_buf)); } - free_page((unsigned long)state->pp_buf); + free_page((unsigned long)state->pp_buf.buffer); free_partitions(state); return ERR_PTR(res); } @@ -177,31 +178,31 @@ static struct parsed_partitions *check_partition(struct gendisk *hd) static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", bdev_partno(dev_to_bdev(dev))); + return sysfs_emit(buf, "%d\n", bdev_partno(dev_to_bdev(dev))); } static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect); + return sysfs_emit(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect); } static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", bdev_read_only(dev_to_bdev(dev))); + return sysfs_emit(buf, "%d\n", bdev_read_only(dev_to_bdev(dev))); } static ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", bdev_alignment_offset(dev_to_bdev(dev))); + return sysfs_emit(buf, "%u\n", bdev_alignment_offset(dev_to_bdev(dev))); } static ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev))); + return sysfs_emit(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev))); } static 
DEVICE_ATTR(partition, 0444, part_partition_show, NULL); diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 75474fb3848e..9865d59093fa 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -751,6 +751,6 @@ int efi_partition(struct parsed_partitions *state) } kfree(ptes); kfree(gpt); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 9311ad5fb95d..54047e722a9d 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -173,15 +173,13 @@ static int find_vol1_partitions(struct parsed_partitions *state, { sector_t blk; int counter; - char tmp[64]; Sector sect; unsigned char *data; loff_t offset, size; struct vtoc_format1_label f1; int secperblk; - snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, "VOL1/%8s:", name); /* * get start of VTOC from the disk label and then search for format1 * and format8 labels @@ -219,7 +217,7 @@ static int find_vol1_partitions(struct parsed_partitions *state, blk++; data = read_part_sector(state, blk * secperblk, §); } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); if (!data) return -1; @@ -237,11 +235,9 @@ static int find_lnx1_partitions(struct parsed_partitions *state, dasd_information2_t *info) { loff_t offset, geo_size, size; - char tmp[64]; int secperblk; - snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, "LNX1/%8s:", name); secperblk = blocksize >> 9; if (label->lnx.ldl_version == 0xf2) { size = label->lnx.formatted_blocks * secperblk; @@ -258,7 +254,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state, size = nr_sectors; if (size != geo_size) { if (!info) { - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } if (!strcmp(info->type, "ECKD")) @@ -270,7 
+266,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state, /* first and only partition starts in the first block after the label */ offset = labelsect + secperblk; put_partition(state, 1, offset, size - offset); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } @@ -282,7 +278,6 @@ static int find_cms1_partitions(struct parsed_partitions *state, sector_t labelsect) { loff_t offset, size; - char tmp[64]; int secperblk; /* @@ -291,14 +286,12 @@ static int find_cms1_partitions(struct parsed_partitions *state, blocksize = label->cms.block_size; secperblk = blocksize >> 9; if (label->cms.disk_offset != 0) { - snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, "CMS1/%8s(MDSK):", name); /* disk is reserved minidisk */ offset = label->cms.disk_offset * secperblk; size = (label->cms.block_count - 1) * secperblk; } else { - snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, "CMS1/%8s:", name); /* * Special case for FBA devices: * If an FBA device is CMS formatted with blocksize > 512 byte @@ -314,7 +307,7 @@ static int find_cms1_partitions(struct parsed_partitions *state, } put_partition(state, 1, offset, size-offset); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } @@ -391,11 +384,11 @@ int ibm_partition(struct parsed_partitions *state) */ res = 1; if (info->format == DASD_FORMAT_LDL) { - strlcat(state->pp_buf, "(nonl)", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "(nonl)"); size = nr_sectors; offset = (info->label_block + 1) * (blocksize >> 9); put_partition(state, 1, offset, size-offset); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); } } else res = 0; diff --git a/block/partitions/karma.c b/block/partitions/karma.c index 4d93512f4bd4..a4e3c5050177 100644 --- a/block/partitions/karma.c +++ 
b/block/partitions/karma.c @@ -53,7 +53,7 @@ int karma_partition(struct parsed_partitions *state) } slot++; } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); put_dev_sector(sect); return 1; } diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index 776b4ad95091..c0bdcae58a3e 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -582,7 +582,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp, return false; } - strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE); + seq_buf_puts(&pp->pp_buf, " [LDM]"); /* Create the data partitions */ list_for_each (item, &ldb->v_part) { @@ -597,7 +597,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp, part_num++; } - strlcat(pp->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&pp->pp_buf, "\n"); return true; } diff --git a/block/partitions/mac.c b/block/partitions/mac.c index b02530d98629..df03ca428e15 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -86,7 +86,7 @@ int mac_partition(struct parsed_partitions *state) if (blocks_in_map >= state->limit) blocks_in_map = state->limit - 1; - strlcat(state->pp_buf, " [mac]", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " [mac]"); for (slot = 1; slot <= blocks_in_map; ++slot) { int pos = slot * secsize; put_dev_sector(sect); @@ -152,6 +152,6 @@ int mac_partition(struct parsed_partitions *state) #endif put_dev_sector(sect); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 073be78ba0b0..200ea53ea6a2 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -263,18 +263,11 @@ static void parse_solaris_x86(struct parsed_partitions *state, put_dev_sector(sect); return; } - { - char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1]; - - snprintf(tmp, sizeof(tmp), " %s%d: name, origin); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } + seq_buf_printf(&state->pp_buf, " %s%d: name, 
origin); if (le32_to_cpu(v->v_version) != 1) { - char tmp[64]; - - snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n", - le32_to_cpu(v->v_version)); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, + " cannot handle version %d vtoc>\n", + le32_to_cpu(v->v_version)); put_dev_sector(sect); return; } @@ -282,12 +275,10 @@ static void parse_solaris_x86(struct parsed_partitions *state, max_nparts = le16_to_cpu(v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; for (i = 0; i < max_nparts && state->next < state->limit; i++) { struct solaris_x86_slice *s = &v->v_slice[i]; - char tmp[3 + 10 + 1 + 1]; if (s->s_size == 0) continue; - snprintf(tmp, sizeof(tmp), " [s%d]", i); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, " [s%d]", i); /* solaris partitions are relative to current MS-DOS * one; must add the offset of the current partition */ put_partition(state, state->next++, @@ -295,7 +286,7 @@ static void parse_solaris_x86(struct parsed_partitions *state, le32_to_cpu(s->s_size)); } put_dev_sector(sect); - strlcat(state->pp_buf, " >\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " >\n"); #endif } @@ -359,7 +350,6 @@ static void parse_bsd(struct parsed_partitions *state, Sector sect; struct bsd_disklabel *l; struct bsd_partition *p; - char tmp[64]; l = read_part_sector(state, offset + 1, §); if (!l) @@ -369,8 +359,7 @@ static void parse_bsd(struct parsed_partitions *state, return; } - snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, " %s%d: <%s:", state->name, origin, flavour); if (le16_to_cpu(l->d_npartitions) < max_partitions) max_partitions = le16_to_cpu(l->d_npartitions); @@ -391,18 +380,16 @@ static void parse_bsd(struct parsed_partitions *state, /* full parent partition, we have it already */ continue; if (offset > bsd_start || offset+size < bsd_start+bsd_size) { - strlcat(state->pp_buf, "bad subpartition - 
ignored\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "bad subpartition - ignored\n"); continue; } put_partition(state, state->next++, bsd_start, bsd_size); } put_dev_sector(sect); - if (le16_to_cpu(l->d_npartitions) > max_partitions) { - snprintf(tmp, sizeof(tmp), " (ignored %d more)", - le16_to_cpu(l->d_npartitions) - max_partitions); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } - strlcat(state->pp_buf, " >\n", PAGE_SIZE); + if (le16_to_cpu(l->d_npartitions) > max_partitions) + seq_buf_printf(&state->pp_buf, " (ignored %d more)", + le16_to_cpu(l->d_npartitions) - max_partitions); + seq_buf_puts(&state->pp_buf, " >\n"); } #endif @@ -496,12 +483,7 @@ static void parse_unixware(struct parsed_partitions *state, put_dev_sector(sect); return; } - { - char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1]; - - snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } + seq_buf_printf(&state->pp_buf, " %s%d: <unixware:", state->name, origin); p = &l->vtoc.v_slice[1]; /* I omit the 0th slice as it is the same as whole disk. */ while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) { @@ -515,7 +497,7 @@ static void parse_unixware(struct parsed_partitions *state, p++; } put_dev_sector(sect); - strlcat(state->pp_buf, " >\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " >\n"); #endif } @@ -546,10 +528,7 @@ static void parse_minix(struct parsed_partitions *state, * the normal boot sector. 
*/ if (msdos_magic_present(data + 510) && p->sys_ind == MINIX_PARTITION) { /* subpartition table present */ - char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1]; - - snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, " %s%d: <minix:", state->name, origin); for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) { if (state->next == state->limit) break; @@ -558,7 +537,7 @@ static void parse_minix(struct parsed_partitions *state, put_partition(state, state->next++, start_sect(p), nr_sects(p)); } - strlcat(state->pp_buf, " >\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " >\n"); } put_dev_sector(sect); #endif /* CONFIG_MINIX_SUBPARTITION */ @@ -602,7 +581,7 @@ int msdos_partition(struct parsed_partitions *state) #ifdef CONFIG_AIX_PARTITION return aix_partition(state); #else - strlcat(state->pp_buf, " [AIX]", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " [AIX]"); return 0; #endif } @@ -629,7 +608,7 @@ int msdos_partition(struct parsed_partitions *state) fb = (struct fat_boot_sector *) data; if (slot == 1 && fb->reserved && fb->fats && fat_valid_media(fb->media)) { - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); put_dev_sector(sect); return 1; } else { @@ -678,9 +657,9 @@ int msdos_partition(struct parsed_partitions *state) n = min(size, max(sector_size, n)); put_partition(state, slot, start, n); - strlcat(state->pp_buf, " <", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " <"); parse_extended(state, start, size, disksig); - strlcat(state->pp_buf, " >", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " >"); continue; } put_partition(state, slot, start, size); @@ -688,12 +667,12 @@ int msdos_partition(struct parsed_partitions *state) if (p->sys_ind == LINUX_RAID_PARTITION) state->parts[slot].flags = ADDPART_FLAG_RAID; if (p->sys_ind == DM6_PARTITION) - strlcat(state->pp_buf, "[DM]", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "[EZD]", 
PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "[EZD]"); } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); /* second pass - output for each on a separate line */ p = (struct msdos_partition *) (0x1be + data); diff --git a/block/partitions/of.c b/block/partitions/of.c index 4e760fdffb3f..c22b60661098 100644 --- a/block/partitions/of.c +++ b/block/partitions/of.c @@ -36,7 +36,6 @@ static void add_of_partition(struct parsed_partitions *state, int slot, struct device_node *np) { struct partition_meta_info *info; - char tmp[sizeof(info->volname) + 4]; const char *partname; int len; @@ -63,8 +62,7 @@ static void add_of_partition(struct parsed_partitions *state, int slot, partname = of_get_property(np, "name", &len); strscpy(info->volname, partname, sizeof(info->volname)); - snprintf(tmp, sizeof(tmp), "(%s)", info->volname); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, "(%s)", info->volname); } int of_partition(struct parsed_partitions *state) @@ -104,7 +102,7 @@ int of_partition(struct parsed_partitions *state) slot++; } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } diff --git a/block/partitions/osf.c b/block/partitions/osf.c index 84560d0765ed..2a692584dba9 100644 --- a/block/partitions/osf.c +++ b/block/partitions/osf.c @@ -81,7 +81,7 @@ int osf_partition(struct parsed_partitions *state) le32_to_cpu(partition->p_size)); slot++; } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); put_dev_sector(sect); return 1; } diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c index b5ecddd5181a..2383ca63cd66 100644 --- a/block/partitions/sgi.c +++ b/block/partitions/sgi.c @@ -79,7 +79,7 @@ int sgi_partition(struct parsed_partitions *state) } slot++; } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); put_dev_sector(sect); return 1; } diff --git a/block/partitions/sun.c b/block/partitions/sun.c index 
2419af76120f..92c645fcd2e0 100644 --- a/block/partitions/sun.c +++ b/block/partitions/sun.c @@ -121,7 +121,7 @@ int sun_partition(struct parsed_partitions *state) } slot++; } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); put_dev_sector(sect); return 1; } diff --git a/block/partitions/sysv68.c b/block/partitions/sysv68.c index 6f6257fd4eb4..470e0f9de7be 100644 --- a/block/partitions/sysv68.c +++ b/block/partitions/sysv68.c @@ -54,7 +54,6 @@ int sysv68_partition(struct parsed_partitions *state) unsigned char *data; struct dkblk0 *b; struct slice *slice; - char tmp[64]; data = read_part_sector(state, 0, §); if (!data) @@ -74,8 +73,7 @@ int sysv68_partition(struct parsed_partitions *state) return -1; slices -= 1; /* last slice is the whole disk */ - snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, "sysV68: %s(s%u)", state->name, slices); slice = (struct slice *)data; for (i = 0; i < slices; i++, slice++) { if (slot == state->limit) @@ -84,12 +82,11 @@ int sysv68_partition(struct parsed_partitions *state) put_partition(state, slot, be32_to_cpu(slice->blkoff), be32_to_cpu(slice->nblocks)); - snprintf(tmp, sizeof(tmp), "(s%u)", i); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, "(s%u)", i); } slot++; } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); put_dev_sector(sect); return 1; } diff --git a/block/partitions/ultrix.c b/block/partitions/ultrix.c index 4aaa81043ca0..b4b9ddc57a5d 100644 --- a/block/partitions/ultrix.c +++ b/block/partitions/ultrix.c @@ -39,7 +39,7 @@ int ultrix_partition(struct parsed_partitions *state) label->pt_part[i].pi_blkoff, label->pt_part[i].pi_nblocks); put_dev_sector(sect); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } else { put_dev_sector(sect); diff --git a/block/sed-opal.c b/block/sed-opal.c index 
3ded1ca723ca..79b290d9458a 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -160,6 +160,8 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = { { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 }, [OPAL_DATASTORE] = { 0x00, 0x00, 0x10, 0x01, 0x00, 0x00, 0x00, 0x00 }, + [OPAL_LOCKING_TABLE] = + { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x00 }, /* C_PIN_TABLE object ID's */ [OPAL_C_PIN_MSID] = @@ -218,6 +220,8 @@ static const u8 opalmethod[][OPAL_METHOD_LENGTH] = { { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x06, 0x01 }, [OPAL_ERASE] = { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 }, + [OPAL_REACTIVATE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x01 }, }; static int end_opal_session_error(struct opal_dev *dev); @@ -1514,7 +1518,7 @@ static inline int enable_global_lr(struct opal_dev *dev, u8 *uid, return err; } -static int setup_locking_range(struct opal_dev *dev, void *data) +static int setup_enable_range(struct opal_dev *dev, void *data) { u8 uid[OPAL_UID_LENGTH]; struct opal_user_lr_setup *setup = data; @@ -1528,38 +1532,47 @@ static int setup_locking_range(struct opal_dev *dev, void *data) if (lr == 0) err = enable_global_lr(dev, uid, setup); - else { - err = cmd_start(dev, uid, opalmethod[OPAL_SET]); - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, OPAL_VALUES); - add_token_u8(&err, dev, OPAL_STARTLIST); - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, OPAL_RANGESTART); - add_token_u64(&err, dev, setup->range_start); - add_token_u8(&err, dev, OPAL_ENDNAME); - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, OPAL_RANGELENGTH); - add_token_u64(&err, dev, setup->range_length); - add_token_u8(&err, dev, OPAL_ENDNAME); - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, OPAL_READLOCKENABLED); - add_token_u64(&err, dev, !!setup->RLE); - add_token_u8(&err, dev, OPAL_ENDNAME); - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, OPAL_WRITELOCKENABLED); - 
add_token_u64(&err, dev, !!setup->WLE); - add_token_u8(&err, dev, OPAL_ENDNAME); - - add_token_u8(&err, dev, OPAL_ENDLIST); - add_token_u8(&err, dev, OPAL_ENDNAME); - } + else + err = generic_lr_enable_disable(dev, uid, !!setup->RLE, !!setup->WLE, 0, 0); if (err) { - pr_debug("Error building Setup Locking range command.\n"); + pr_debug("Failed to create enable lr command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int setup_locking_range_start_length(struct opal_dev *dev, void *data) +{ + int err; + u8 uid[OPAL_UID_LENGTH]; + struct opal_user_lr_setup *setup = data; + + err = build_locking_range(uid, sizeof(uid), setup->session.opal_key.lr); + if (err) + return err; + + err = cmd_start(dev, uid, opalmethod[OPAL_SET]); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_RANGESTART); + add_token_u64(&err, dev, setup->range_start); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_RANGELENGTH); + add_token_u64(&err, dev, setup->range_length); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + + if (err) { + pr_debug("Error building Setup Locking RangeStartLength command.\n"); return err; } @@ -1568,7 +1581,7 @@ static int setup_locking_range(struct opal_dev *dev, void *data) static int response_get_column(const struct parsed_resp *resp, int *iter, - u8 column, + u64 column, u64 *value) { const struct opal_resp_tok *tok; @@ -1586,7 +1599,7 @@ static int response_get_column(const struct parsed_resp *resp, n++; if (response_get_u64(resp, n) != column) { - pr_debug("Token %d does not match expected column %u.\n", + pr_debug("Token %d does not match expected column %llu.\n", n, column); return OPAL_INVAL_PARAM; } @@ -1744,6 +1757,12 
@@ static int start_anybodyASP_opal_session(struct opal_dev *dev, void *data) OPAL_ADMINSP_UID, NULL, 0); } +static int start_anybodyLSP_opal_session(struct opal_dev *dev, void *data) +{ + return start_generic_opal_session(dev, OPAL_ANYBODY_UID, + OPAL_LOCKINGSP_UID, NULL, 0); +} + static int start_SIDASP_opal_session(struct opal_dev *dev, void *data) { int ret; @@ -2285,6 +2304,74 @@ static int activate_lsp(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } +static int reactivate_lsp(struct opal_dev *dev, void *data) +{ + struct opal_lr_react *opal_react = data; + u8 user_lr[OPAL_UID_LENGTH]; + int err, i; + + err = cmd_start(dev, opaluid[OPAL_THISSP_UID], + opalmethod[OPAL_REACTIVATE]); + + if (err) { + pr_debug("Error building Reactivate LockingSP command.\n"); + return err; + } + + /* + * If neither 'entire_table' nor 'num_lrs' is set, the device + * gets reactivated with SUM disabled. Only Admin1PIN will change + * if set. + */ + if (opal_react->entire_table) { + /* Entire Locking table (all locking ranges) will be put in SUM. 
*/ + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_SUM_SET_LIST); + add_token_bytestring(&err, dev, opaluid[OPAL_LOCKING_TABLE], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + } else if (opal_react->num_lrs) { + /* Subset of Locking table (selected locking range(s)) to be put in SUM */ + err = build_locking_range(user_lr, sizeof(user_lr), + opal_react->lr[0]); + if (err) + return err; + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_SUM_SET_LIST); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); + for (i = 1; i < opal_react->num_lrs; i++) { + user_lr[7] = opal_react->lr[i]; + add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); + } + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + } + + /* Skipping the range policy parameter is the same as setting its value to zero */ + if (opal_react->range_policy && (opal_react->num_lrs || opal_react->entire_table)) { + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_SUM_RANGE_POLICY); + add_token_u8(&err, dev, 1); + add_token_u8(&err, dev, OPAL_ENDNAME); + } + + /* + * Optional parameter. If set, it changes the Admin1 PIN even when SUM + * is being disabled. 
+ */ + if (opal_react->new_admin_key.key_len) { + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_SUM_ADMIN1_PIN); + add_token_bytestring(&err, dev, opal_react->new_admin_key.key, + opal_react->new_admin_key.key_len); + add_token_u8(&err, dev, OPAL_ENDNAME); + } + + return finalize_and_send(dev, parse_and_check_status); +} + /* Determine if we're in the Manufactured Inactive or Active state */ static int get_lsp_lifecycle(struct opal_dev *dev, void *data) { @@ -2955,12 +3042,92 @@ static int opal_activate_lsp(struct opal_dev *dev, return ret; } +static int opal_reactivate_lsp(struct opal_dev *dev, + struct opal_lr_react *opal_lr_react) +{ + const struct opal_step active_steps[] = { + { start_admin1LSP_opal_session, &opal_lr_react->key }, + { reactivate_lsp, opal_lr_react }, + /* No end_opal_session. The controller terminates the session */ + }; + int ret; + + /* use either 'entire_table' parameter or set of locking ranges */ + if (opal_lr_react->num_lrs > OPAL_MAX_LRS || + (opal_lr_react->num_lrs && opal_lr_react->entire_table)) + return -EINVAL; + + ret = opal_get_key(dev, &opal_lr_react->key); + if (ret) + return ret; + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + static int opal_setup_locking_range(struct opal_dev *dev, struct opal_user_lr_setup *opal_lrs) { const struct opal_step lr_steps[] = { { start_auth_opal_session, &opal_lrs->session }, - { setup_locking_range, opal_lrs }, + { setup_locking_range_start_length, opal_lrs }, + { setup_enable_range, opal_lrs }, + { end_opal_session, } + }, lr_global_steps[] = { + { start_auth_opal_session, &opal_lrs->session }, + { setup_enable_range, opal_lrs }, + { end_opal_session, } + }; + int ret; + + ret = opal_get_key(dev, &opal_lrs->session.opal_key); + if (ret) + return ret; + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + if (opal_lrs->session.opal_key.lr == 0) 
+ ret = execute_steps(dev, lr_global_steps, ARRAY_SIZE(lr_global_steps)); + else + ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + +static int opal_setup_locking_range_start_length(struct opal_dev *dev, + struct opal_user_lr_setup *opal_lrs) +{ + const struct opal_step lr_steps[] = { + { start_auth_opal_session, &opal_lrs->session }, + { setup_locking_range_start_length, opal_lrs }, + { end_opal_session, } + }; + int ret; + + /* we can not set global locking range offset or length */ + if (opal_lrs->session.opal_key.lr == 0) + return -EINVAL; + + ret = opal_get_key(dev, &opal_lrs->session.opal_key); + if (ret) + return ret; + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + +static int opal_enable_disable_range(struct opal_dev *dev, + struct opal_user_lr_setup *opal_lrs) +{ + const struct opal_step lr_steps[] = { + { start_auth_opal_session, &opal_lrs->session }, + { setup_enable_range, opal_lrs }, { end_opal_session, } }; int ret; @@ -3228,6 +3395,200 @@ static int opal_get_geometry(struct opal_dev *dev, void __user *data) return 0; } +static int get_sum_ranges(struct opal_dev *dev, void *data) +{ + const char *lr_uid; + size_t lr_uid_len; + u64 val; + const struct opal_resp_tok *tok; + int err, tok_n = 2; + struct opal_sum_ranges *sranges = data; + const __u8 lr_all[OPAL_MAX_LRS] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 }; + + err = generic_get_columns(dev, opaluid[OPAL_LOCKING_INFO_TABLE], OPAL_SUM_SET_LIST, + OPAL_SUM_RANGE_POLICY); + if (err) { + pr_debug("Couldn't get locking info table columns %d to %d.\n", + OPAL_SUM_SET_LIST, OPAL_SUM_RANGE_POLICY); + return err; + } + + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + if (!response_token_matches(tok, OPAL_STARTNAME)) { + pr_debug("Unexpected response token type %d.\n", tok_n); + return 
OPAL_INVAL_PARAM; + } + tok_n++; + + if (response_get_u64(&dev->parsed, tok_n) != OPAL_SUM_SET_LIST) { + pr_debug("Token %d does not match expected column %u.\n", + tok_n, OPAL_SUM_SET_LIST); + return OPAL_INVAL_PARAM; + } + tok_n++; + + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + /* + * The OPAL_SUM_SET_LIST response contains two distinct values: + * + * - the list of individual locking ranges (UIDs) put in SUM. The list + * may also be empty signaling the SUM is disabled. + * + * - the Locking table UID if the entire Locking table is put in SUM. + */ + if (response_token_matches(tok, OPAL_STARTLIST)) { + sranges->num_lrs = 0; + + tok_n++; + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + while (!response_token_matches(tok, OPAL_ENDLIST)) { + lr_uid_len = response_get_string(&dev->parsed, tok_n, &lr_uid); + if (lr_uid_len != OPAL_UID_LENGTH) { + pr_debug("Unexpected response token type %d.\n", tok_n); + return OPAL_INVAL_PARAM; + } + + if (memcmp(lr_uid, opaluid[OPAL_LOCKINGRANGE_GLOBAL], OPAL_UID_LENGTH)) { + if (lr_uid[5] != LOCKING_RANGE_NON_GLOBAL) { + pr_debug("Unexpected byte %d at LR UUID position 5.\n", + lr_uid[5]); + return OPAL_INVAL_PARAM; + } + sranges->lr[sranges->num_lrs++] = lr_uid[7]; + } else + sranges->lr[sranges->num_lrs++] = 0; + + tok_n++; + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + } + } else { + /* Only OPAL_LOCKING_TABLE UID is an alternative to OPAL_STARTLIST here. 
*/ + lr_uid_len = response_get_string(&dev->parsed, tok_n, &lr_uid); + if (lr_uid_len != OPAL_UID_LENGTH) { + pr_debug("Unexpected response token type %d.\n", tok_n); + return OPAL_INVAL_PARAM; + } + + if (memcmp(lr_uid, opaluid[OPAL_LOCKING_TABLE], OPAL_UID_LENGTH)) { + pr_debug("Unexpected response UID.\n"); + return OPAL_INVAL_PARAM; + } + + /* sed-opal kernel API already provides following limit in Activate command */ + sranges->num_lrs = OPAL_MAX_LRS; + memcpy(sranges->lr, lr_all, OPAL_MAX_LRS); + } + tok_n++; + + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + if (!response_token_matches(tok, OPAL_ENDNAME)) { + pr_debug("Unexpected response token type %d.\n", tok_n); + return OPAL_INVAL_PARAM; + } + tok_n++; + + err = response_get_column(&dev->parsed, &tok_n, OPAL_SUM_RANGE_POLICY, &val); + if (err) + return err; + + sranges->range_policy = val ? 1 : 0; + + return 0; +} + +static int opal_get_sum_ranges(struct opal_dev *dev, struct opal_sum_ranges *opal_sum_rngs, + void __user *data) +{ + const struct opal_step admin_steps[] = { + { start_admin1LSP_opal_session, &opal_sum_rngs->key }, + { get_sum_ranges, opal_sum_rngs }, + { end_opal_session, } + }, anybody_steps[] = { + { start_anybodyLSP_opal_session, NULL }, + { get_sum_ranges, opal_sum_rngs }, + { end_opal_session, } + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + if (opal_sum_rngs->key.key_len) + /* Use Admin1 session (authenticated by PIN) to retrieve LockingInfo columns */ + ret = execute_steps(dev, admin_steps, ARRAY_SIZE(admin_steps)); + else + /* Use Anybody session (no key) to retrieve LockingInfo columns */ + ret = execute_steps(dev, anybody_steps, ARRAY_SIZE(anybody_steps)); + mutex_unlock(&dev->dev_lock); + + /* skip session info when copying back to uspace */ + if (!ret && copy_to_user(data + offsetof(struct opal_sum_ranges, num_lrs), + (void *)opal_sum_rngs + offsetof(struct opal_sum_ranges, num_lrs), + 
sizeof(*opal_sum_rngs) - offsetof(struct opal_sum_ranges, num_lrs))) { + pr_debug("Error copying SUM ranges info to userspace\n"); + return -EFAULT; + } + + return ret; +} + +static int opal_stack_reset(struct opal_dev *dev) +{ + struct opal_stack_reset *req; + struct opal_stack_reset_response *resp; + int ret; + + mutex_lock(&dev->dev_lock); + + memset(dev->cmd, 0, IO_BUFFER_LENGTH); + req = (struct opal_stack_reset *)dev->cmd; + req->extendedComID[0] = dev->comid >> 8; + req->extendedComID[1] = dev->comid & 0xFF; + req->request_code = cpu_to_be32(OPAL_STACK_RESET); + + ret = dev->send_recv(dev->data, dev->comid, TCG_SECP_02, + dev->cmd, IO_BUFFER_LENGTH, true); + if (ret) { + pr_debug("Error sending stack reset: %d\n", ret); + goto out; + } + + memset(dev->resp, 0, IO_BUFFER_LENGTH); + ret = dev->send_recv(dev->data, dev->comid, TCG_SECP_02, + dev->resp, IO_BUFFER_LENGTH, false); + if (ret) { + pr_debug("Error receiving stack reset response: %d\n", ret); + goto out; + } + + resp = (struct opal_stack_reset_response *)dev->resp; + if (be16_to_cpu(resp->data_length) != 4) { + pr_debug("Stack reset pending\n"); + ret = -EBUSY; + goto out; + } + if (be32_to_cpu(resp->response) != 0) { + pr_debug("Stack reset failed: %u\n", be32_to_cpu(resp->response)); + ret = -EIO; + } +out: + mutex_unlock(&dev->dev_lock); + return ret; +} + int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) { void *p; @@ -3313,6 +3674,21 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_SET_SID_PW: ret = opal_set_new_sid_pw(dev, p); break; + case IOC_OPAL_REACTIVATE_LSP: + ret = opal_reactivate_lsp(dev, p); + break; + case IOC_OPAL_LR_SET_START_LEN: + ret = opal_setup_locking_range_start_length(dev, p); + break; + case IOC_OPAL_ENABLE_DISABLE_LR: + ret = opal_enable_disable_range(dev, p); + break; + case IOC_OPAL_GET_SUM_STATUS: + ret = opal_get_sum_ranges(dev, p, arg); + break; + case IOC_OPAL_STACK_RESET: + ret = opal_stack_reset(dev); + 
break; default: break; diff --git a/block/t10-pi.c b/block/t10-pi.c index d27be6041fd3..a19b4e102a83 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -12,230 +12,115 @@ #include #include "blk.h" -struct blk_integrity_iter { - void *prot_buf; - void *data_buf; - sector_t seed; - unsigned int data_size; - unsigned short interval; - const char *disk_name; +#define APP_TAG_ESCAPE 0xffff +#define REF_TAG_ESCAPE 0xffffffff + +/* + * This union is used for onstack allocations when the pi field is split across + * segments. blk_validate_integrity_limits() guarantees pi_tuple_size matches + * the sizeof one of these two types. + */ +union pi_tuple { + struct crc64_pi_tuple crc64_pi; + struct t10_pi_tuple t10_pi; }; -static __be16 t10_pi_csum(__be16 csum, void *data, unsigned int len, - unsigned char csum_type) +struct blk_integrity_iter { + struct bio *bio; + struct bio_integrity_payload *bip; + struct blk_integrity *bi; + struct bvec_iter data_iter; + struct bvec_iter prot_iter; + unsigned int interval_remaining; + u64 seed; + u64 csum; +}; + +static void blk_calculate_guard(struct blk_integrity_iter *iter, void *data, + unsigned int len) { - if (csum_type == BLK_INTEGRITY_CSUM_IP) - return (__force __be16)ip_compute_csum(data, len); - return cpu_to_be16(crc_t10dif_update(be16_to_cpu(csum), data, len)); + switch (iter->bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + iter->csum = crc64_nvme(iter->csum, data, len); + break; + case BLK_INTEGRITY_CSUM_CRC: + iter->csum = crc_t10dif_update(iter->csum, data, len); + break; + case BLK_INTEGRITY_CSUM_IP: + iter->csum = (__force u32)csum_partial(data, len, + (__force __wsum)iter->csum); + break; + default: + WARN_ON_ONCE(1); + iter->csum = U64_MAX; + break; + } +} + +static void blk_integrity_csum_finish(struct blk_integrity_iter *iter) +{ + switch (iter->bi->csum_type) { + case BLK_INTEGRITY_CSUM_IP: + iter->csum = (__force u16)csum_fold((__force __wsum)iter->csum); + break; + default: + break; + } } /* - * Type 1 and Type 
2 protection use the same format: 16 bit guard tag, - * 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref - * tag. + * Update the csum for formats that have metadata padding in front of the data + * integrity field */ -static void t10_pi_generate(struct blk_integrity_iter *iter, - struct blk_integrity *bi) +static void blk_integrity_csum_offset(struct blk_integrity_iter *iter) { - u8 offset = bi->pi_offset; - unsigned int i; + unsigned int offset = iter->bi->pi_offset; + struct bio_vec *bvec = iter->bip->bip_vec; - for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct t10_pi_tuple *pi = iter->prot_buf + offset; + while (offset > 0) { + struct bio_vec pbv = bvec_iter_bvec(bvec, iter->prot_iter); + unsigned int len = min(pbv.bv_len, offset); + void *prot_buf = bvec_kmap_local(&pbv); - pi->guard_tag = t10_pi_csum(0, iter->data_buf, iter->interval, - bi->csum_type); - if (offset) - pi->guard_tag = t10_pi_csum(pi->guard_tag, - iter->prot_buf, offset, bi->csum_type); - pi->app_tag = 0; + blk_calculate_guard(iter, prot_buf, len); + kunmap_local(prot_buf); + offset -= len; + bvec_iter_advance_single(bvec, &iter->prot_iter, len); + } + blk_integrity_csum_finish(iter); +} - if (bi->flags & BLK_INTEGRITY_REF_TAG) - pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed)); - else - pi->ref_tag = 0; +static void blk_integrity_copy_from_tuple(struct bio_integrity_payload *bip, + struct bvec_iter *iter, void *tuple, + unsigned int tuple_size) +{ + while (tuple_size) { + struct bio_vec pbv = bvec_iter_bvec(bip->bip_vec, *iter); + unsigned int len = min(tuple_size, pbv.bv_len); + void *prot_buf = bvec_kmap_local(&pbv); - iter->data_buf += iter->interval; - iter->prot_buf += bi->metadata_size; - iter->seed++; + memcpy(prot_buf, tuple, len); + kunmap_local(prot_buf); + bvec_iter_advance_single(bip->bip_vec, iter, len); + tuple_size -= len; + tuple += len; } } -static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, - struct blk_integrity *bi) 
+static void blk_integrity_copy_to_tuple(struct bio_integrity_payload *bip, + struct bvec_iter *iter, void *tuple, + unsigned int tuple_size) { - u8 offset = bi->pi_offset; - unsigned int i; + while (tuple_size) { + struct bio_vec pbv = bvec_iter_bvec(bip->bip_vec, *iter); + unsigned int len = min(tuple_size, pbv.bv_len); + void *prot_buf = bvec_kmap_local(&pbv); - for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct t10_pi_tuple *pi = iter->prot_buf + offset; - __be16 csum; - - if (bi->flags & BLK_INTEGRITY_REF_TAG) { - if (pi->app_tag == T10_PI_APP_ESCAPE) - goto next; - - if (be32_to_cpu(pi->ref_tag) != - lower_32_bits(iter->seed)) { - pr_err("%s: ref tag error at location %llu " \ - "(rcvd %u)\n", iter->disk_name, - (unsigned long long) - iter->seed, be32_to_cpu(pi->ref_tag)); - return BLK_STS_PROTECTION; - } - } else { - if (pi->app_tag == T10_PI_APP_ESCAPE && - pi->ref_tag == T10_PI_REF_ESCAPE) - goto next; - } - - csum = t10_pi_csum(0, iter->data_buf, iter->interval, - bi->csum_type); - if (offset) - csum = t10_pi_csum(csum, iter->prot_buf, offset, - bi->csum_type); - - if (pi->guard_tag != csum) { - pr_err("%s: guard tag error at sector %llu " \ - "(rcvd %04x, want %04x)\n", iter->disk_name, - (unsigned long long)iter->seed, - be16_to_cpu(pi->guard_tag), be16_to_cpu(csum)); - return BLK_STS_PROTECTION; - } - -next: - iter->data_buf += iter->interval; - iter->prot_buf += bi->metadata_size; - iter->seed++; - } - - return BLK_STS_OK; -} - -/** - * t10_pi_type1_prepare - prepare PI prior submitting request to device - * @rq: request with PI that should be prepared - * - * For Type 1/Type 2, the virtual start sector is the one that was - * originally submitted by the block layer for the ref_tag usage. Due to - * partitioning, MD/DM cloning, etc. the actual physical start sector is - * likely to be different. Remap protection information to match the - * physical LBA. 
- */ -static void t10_pi_type1_prepare(struct request *rq) -{ - struct blk_integrity *bi = &rq->q->limits.integrity; - const int tuple_sz = bi->metadata_size; - u32 ref_tag = t10_pi_ref_tag(rq); - u8 offset = bi->pi_offset; - struct bio *bio; - - __rq_for_each_bio(bio, rq) { - struct bio_integrity_payload *bip = bio_integrity(bio); - u32 virt = bip_get_seed(bip) & 0xffffffff; - struct bio_vec iv; - struct bvec_iter iter; - - /* Already remapped? */ - if (bip->bip_flags & BIP_MAPPED_INTEGRITY) - break; - - bip_for_each_vec(iv, bip, iter) { - unsigned int j; - void *p; - - p = bvec_kmap_local(&iv); - for (j = 0; j < iv.bv_len; j += tuple_sz) { - struct t10_pi_tuple *pi = p + offset; - - if (be32_to_cpu(pi->ref_tag) == virt) - pi->ref_tag = cpu_to_be32(ref_tag); - virt++; - ref_tag++; - p += tuple_sz; - } - kunmap_local(p); - } - - bip->bip_flags |= BIP_MAPPED_INTEGRITY; - } -} - -/** - * t10_pi_type1_complete - prepare PI prior returning request to the blk layer - * @rq: request with PI that should be prepared - * @nr_bytes: total bytes to prepare - * - * For Type 1/Type 2, the virtual start sector is the one that was - * originally submitted by the block layer for the ref_tag usage. Due to - * partitioning, MD/DM cloning, etc. the actual physical start sector is - * likely to be different. Since the physical start sector was submitted - * to the device, we should remap it back to virtual values expected by the - * block layer. 
- */ -static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) -{ - struct blk_integrity *bi = &rq->q->limits.integrity; - unsigned intervals = nr_bytes >> bi->interval_exp; - const int tuple_sz = bi->metadata_size; - u32 ref_tag = t10_pi_ref_tag(rq); - u8 offset = bi->pi_offset; - struct bio *bio; - - __rq_for_each_bio(bio, rq) { - struct bio_integrity_payload *bip = bio_integrity(bio); - u32 virt = bip_get_seed(bip) & 0xffffffff; - struct bio_vec iv; - struct bvec_iter iter; - - bip_for_each_vec(iv, bip, iter) { - unsigned int j; - void *p; - - p = bvec_kmap_local(&iv); - for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { - struct t10_pi_tuple *pi = p + offset; - - if (be32_to_cpu(pi->ref_tag) == ref_tag) - pi->ref_tag = cpu_to_be32(virt); - virt++; - ref_tag++; - intervals--; - p += tuple_sz; - } - kunmap_local(p); - } - } -} - -static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len) -{ - return cpu_to_be64(crc64_nvme(crc, data, len)); -} - -static void ext_pi_crc64_generate(struct blk_integrity_iter *iter, - struct blk_integrity *bi) -{ - u8 offset = bi->pi_offset; - unsigned int i; - - for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct crc64_pi_tuple *pi = iter->prot_buf + offset; - - pi->guard_tag = ext_pi_crc64(0, iter->data_buf, iter->interval); - if (offset) - pi->guard_tag = ext_pi_crc64(be64_to_cpu(pi->guard_tag), - iter->prot_buf, offset); - pi->app_tag = 0; - - if (bi->flags & BLK_INTEGRITY_REF_TAG) - put_unaligned_be48(iter->seed, pi->ref_tag); - else - put_unaligned_be48(0ULL, pi->ref_tag); - - iter->data_buf += iter->interval; - iter->prot_buf += bi->metadata_size; - iter->seed++; + memcpy(tuple, prot_buf, len); + kunmap_local(prot_buf); + bvec_iter_advance_single(bip->bip_vec, iter, len); + tuple_size -= len; + tuple += len; } } @@ -246,228 +131,437 @@ static bool ext_pi_ref_escape(const u8 ref_tag[6]) return memcmp(ref_tag, ref_escape, sizeof(ref_escape)) == 0; } -static blk_status_t 
ext_pi_crc64_verify(struct blk_integrity_iter *iter, - struct blk_integrity *bi) +static blk_status_t blk_verify_ext_pi(struct blk_integrity_iter *iter, + struct crc64_pi_tuple *pi) { - u8 offset = bi->pi_offset; - unsigned int i; + u64 seed = lower_48_bits(iter->seed); + u64 guard = get_unaligned_be64(&pi->guard_tag); + u64 ref = get_unaligned_be48(pi->ref_tag); + u16 app = get_unaligned_be16(&pi->app_tag); - for (i = 0; i < iter->data_size; i += iter->interval) { - struct crc64_pi_tuple *pi = iter->prot_buf + offset; - u64 ref, seed; - __be64 csum; - - if (bi->flags & BLK_INTEGRITY_REF_TAG) { - if (pi->app_tag == T10_PI_APP_ESCAPE) - goto next; - - ref = get_unaligned_be48(pi->ref_tag); - seed = lower_48_bits(iter->seed); - if (ref != seed) { - pr_err("%s: ref tag error at location %llu (rcvd %llu)\n", - iter->disk_name, seed, ref); - return BLK_STS_PROTECTION; - } - } else { - if (pi->app_tag == T10_PI_APP_ESCAPE && - ext_pi_ref_escape(pi->ref_tag)) - goto next; - } - - csum = ext_pi_crc64(0, iter->data_buf, iter->interval); - if (offset) - csum = ext_pi_crc64(be64_to_cpu(csum), iter->prot_buf, - offset); - - if (pi->guard_tag != csum) { - pr_err("%s: guard tag error at sector %llu " \ - "(rcvd %016llx, want %016llx)\n", - iter->disk_name, (unsigned long long)iter->seed, - be64_to_cpu(pi->guard_tag), be64_to_cpu(csum)); + if (iter->bi->flags & BLK_INTEGRITY_REF_TAG) { + if (app == APP_TAG_ESCAPE) + return BLK_STS_OK; + if (ref != seed) { + pr_err("%s: ref tag error at location %llu (rcvd %llu)\n", + iter->bio->bi_bdev->bd_disk->disk_name, seed, + ref); return BLK_STS_PROTECTION; } + } else if (app == APP_TAG_ESCAPE && ext_pi_ref_escape(pi->ref_tag)) { + return BLK_STS_OK; + } -next: - iter->data_buf += iter->interval; - iter->prot_buf += bi->metadata_size; - iter->seed++; + if (guard != iter->csum) { + pr_err("%s: guard tag error at sector %llu (rcvd %016llx, want %016llx)\n", + iter->bio->bi_bdev->bd_disk->disk_name, iter->seed, + guard, iter->csum); + return 
BLK_STS_PROTECTION; } return BLK_STS_OK; } -static void ext_pi_type1_prepare(struct request *rq) +static blk_status_t blk_verify_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi, u16 guard) { - struct blk_integrity *bi = &rq->q->limits.integrity; - const int tuple_sz = bi->metadata_size; - u64 ref_tag = ext_pi_ref_tag(rq); - u8 offset = bi->pi_offset; - struct bio *bio; + u32 seed = lower_32_bits(iter->seed); + u32 ref = get_unaligned_be32(&pi->ref_tag); + u16 app = get_unaligned_be16(&pi->app_tag); - __rq_for_each_bio(bio, rq) { - struct bio_integrity_payload *bip = bio_integrity(bio); - u64 virt = lower_48_bits(bip_get_seed(bip)); - struct bio_vec iv; - struct bvec_iter iter; - - /* Already remapped? */ - if (bip->bip_flags & BIP_MAPPED_INTEGRITY) - break; - - bip_for_each_vec(iv, bip, iter) { - unsigned int j; - void *p; - - p = bvec_kmap_local(&iv); - for (j = 0; j < iv.bv_len; j += tuple_sz) { - struct crc64_pi_tuple *pi = p + offset; - u64 ref = get_unaligned_be48(pi->ref_tag); - - if (ref == virt) - put_unaligned_be48(ref_tag, pi->ref_tag); - virt++; - ref_tag++; - p += tuple_sz; - } - kunmap_local(p); + if (iter->bi->flags & BLK_INTEGRITY_REF_TAG) { + if (app == APP_TAG_ESCAPE) + return BLK_STS_OK; + if (ref != seed) { + pr_err("%s: ref tag error at location %u (rcvd %u)\n", + iter->bio->bi_bdev->bd_disk->disk_name, seed, + ref); + return BLK_STS_PROTECTION; } + } else if (app == APP_TAG_ESCAPE && ref == REF_TAG_ESCAPE) { + return BLK_STS_OK; + } - bip->bip_flags |= BIP_MAPPED_INTEGRITY; + if (guard != (u16)iter->csum) { + pr_err("%s: guard tag error at sector %llu (rcvd %04x, want %04x)\n", + iter->bio->bi_bdev->bd_disk->disk_name, iter->seed, + guard, (u16)iter->csum); + return BLK_STS_PROTECTION; + } + + return BLK_STS_OK; +} + +static blk_status_t blk_verify_t10_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi) +{ + u16 guard = get_unaligned_be16(&pi->guard_tag); + + return blk_verify_pi(iter, pi, guard); +} + +static 
blk_status_t blk_verify_ip_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi) +{ + u16 guard = get_unaligned((u16 *)&pi->guard_tag); + + return blk_verify_pi(iter, pi, guard); +} + +static blk_status_t blk_integrity_verify(struct blk_integrity_iter *iter, + union pi_tuple *tuple) +{ + switch (iter->bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + return blk_verify_ext_pi(iter, &tuple->crc64_pi); + case BLK_INTEGRITY_CSUM_CRC: + return blk_verify_t10_pi(iter, &tuple->t10_pi); + case BLK_INTEGRITY_CSUM_IP: + return blk_verify_ip_pi(iter, &tuple->t10_pi); + default: + return BLK_STS_OK; } } -static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) +static void blk_set_ext_pi(struct blk_integrity_iter *iter, + struct crc64_pi_tuple *pi) { - struct blk_integrity *bi = &rq->q->limits.integrity; - unsigned intervals = nr_bytes >> bi->interval_exp; - const int tuple_sz = bi->metadata_size; - u64 ref_tag = ext_pi_ref_tag(rq); - u8 offset = bi->pi_offset; - struct bio *bio; + put_unaligned_be64(iter->csum, &pi->guard_tag); + put_unaligned_be16(0, &pi->app_tag); + put_unaligned_be48(iter->seed, &pi->ref_tag); +} - __rq_for_each_bio(bio, rq) { - struct bio_integrity_payload *bip = bio_integrity(bio); - u64 virt = lower_48_bits(bip_get_seed(bip)); - struct bio_vec iv; - struct bvec_iter iter; +static void blk_set_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi, __be16 csum) +{ + put_unaligned(csum, &pi->guard_tag); + put_unaligned_be16(0, &pi->app_tag); + put_unaligned_be32(iter->seed, &pi->ref_tag); +} - bip_for_each_vec(iv, bip, iter) { - unsigned int j; - void *p; +static void blk_set_t10_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi) +{ + blk_set_pi(iter, pi, cpu_to_be16((u16)iter->csum)); +} - p = bvec_kmap_local(&iv); - for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { - struct crc64_pi_tuple *pi = p + offset; - u64 ref = get_unaligned_be48(pi->ref_tag); +static void blk_set_ip_pi(struct 
blk_integrity_iter *iter, + struct t10_pi_tuple *pi) +{ + blk_set_pi(iter, pi, (__force __be16)(u16)iter->csum); +} - if (ref == ref_tag) - put_unaligned_be48(virt, pi->ref_tag); - virt++; - ref_tag++; - intervals--; - p += tuple_sz; - } - kunmap_local(p); - } +static void blk_integrity_set(struct blk_integrity_iter *iter, + union pi_tuple *tuple) +{ + switch (iter->bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + return blk_set_ext_pi(iter, &tuple->crc64_pi); + case BLK_INTEGRITY_CSUM_CRC: + return blk_set_t10_pi(iter, &tuple->t10_pi); + case BLK_INTEGRITY_CSUM_IP: + return blk_set_ip_pi(iter, &tuple->t10_pi); + default: + WARN_ON_ONCE(1); + return; } } +static blk_status_t blk_integrity_interval(struct blk_integrity_iter *iter, + bool verify) +{ + blk_status_t ret = BLK_STS_OK; + union pi_tuple tuple; + void *ptuple = &tuple; + struct bio_vec pbv; + + blk_integrity_csum_offset(iter); + pbv = bvec_iter_bvec(iter->bip->bip_vec, iter->prot_iter); + if (pbv.bv_len >= iter->bi->pi_tuple_size) { + ptuple = bvec_kmap_local(&pbv); + bvec_iter_advance_single(iter->bip->bip_vec, &iter->prot_iter, + iter->bi->metadata_size - iter->bi->pi_offset); + } else if (verify) { + blk_integrity_copy_to_tuple(iter->bip, &iter->prot_iter, + ptuple, iter->bi->pi_tuple_size); + } + + if (verify) + ret = blk_integrity_verify(iter, ptuple); + else + blk_integrity_set(iter, ptuple); + + if (likely(ptuple != &tuple)) { + kunmap_local(ptuple); + } else if (!verify) { + blk_integrity_copy_from_tuple(iter->bip, &iter->prot_iter, + ptuple, iter->bi->pi_tuple_size); + } + + iter->interval_remaining = 1 << iter->bi->interval_exp; + iter->csum = 0; + iter->seed++; + return ret; +} + +static blk_status_t blk_integrity_iterate(struct bio *bio, + struct bvec_iter *data_iter, + bool verify) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + struct blk_integrity_iter iter = { + .bio = bio, + .bip = bip, + .bi = bi, + 
.data_iter = *data_iter, + .prot_iter = bip->bip_iter, + .interval_remaining = 1 << bi->interval_exp, + .seed = data_iter->bi_sector, + .csum = 0, + }; + blk_status_t ret = BLK_STS_OK; + + while (iter.data_iter.bi_size && ret == BLK_STS_OK) { + struct bio_vec bv = bvec_iter_bvec(iter.bio->bi_io_vec, + iter.data_iter); + void *kaddr = bvec_kmap_local(&bv); + void *data = kaddr; + unsigned int len; + + bvec_iter_advance_single(iter.bio->bi_io_vec, &iter.data_iter, + bv.bv_len); + while (bv.bv_len && ret == BLK_STS_OK) { + len = min(iter.interval_remaining, bv.bv_len); + blk_calculate_guard(&iter, data, len); + bv.bv_len -= len; + data += len; + iter.interval_remaining -= len; + if (!iter.interval_remaining) + ret = blk_integrity_interval(&iter, verify); + } + kunmap_local(kaddr); + } + + return ret; +} + void bio_integrity_generate(struct bio *bio) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity_iter iter; - struct bvec_iter bviter; - struct bio_vec bv; - iter.disk_name = bio->bi_bdev->bd_disk->disk_name; - iter.interval = 1 << bi->interval_exp; - iter.seed = bio->bi_iter.bi_sector; - iter.prot_buf = bvec_virt(bip->bip_vec); - bio_for_each_segment(bv, bio, bviter) { - void *kaddr = bvec_kmap_local(&bv); - - iter.data_buf = kaddr; - iter.data_size = bv.bv_len; - switch (bi->csum_type) { - case BLK_INTEGRITY_CSUM_CRC64: - ext_pi_crc64_generate(&iter, bi); - break; - case BLK_INTEGRITY_CSUM_CRC: - case BLK_INTEGRITY_CSUM_IP: - t10_pi_generate(&iter, bi); - break; - default: - break; - } - kunmap_local(kaddr); + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + blk_integrity_iterate(bio, &bio->bi_iter, false); + break; + default: + break; } } blk_status_t bio_integrity_verify(struct bio *bio, struct bvec_iter *saved_iter) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - 
struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity_iter iter; - struct bvec_iter bviter; - struct bio_vec bv; - /* - * At the moment verify is called bi_iter has been advanced during split - * and completion, so use the copy created during submission here. - */ - iter.disk_name = bio->bi_bdev->bd_disk->disk_name; - iter.interval = 1 << bi->interval_exp; - iter.seed = saved_iter->bi_sector; - iter.prot_buf = bvec_virt(bip->bip_vec); - __bio_for_each_segment(bv, bio, bviter, *saved_iter) { - void *kaddr = bvec_kmap_local(&bv); - blk_status_t ret = BLK_STS_OK; - - iter.data_buf = kaddr; - iter.data_size = bv.bv_len; - switch (bi->csum_type) { - case BLK_INTEGRITY_CSUM_CRC64: - ret = ext_pi_crc64_verify(&iter, bi); - break; - case BLK_INTEGRITY_CSUM_CRC: - case BLK_INTEGRITY_CSUM_IP: - ret = t10_pi_verify(&iter, bi); - break; - default: - break; - } - kunmap_local(kaddr); - - if (ret) - return ret; + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + return blk_integrity_iterate(bio, saved_iter, true); + default: + break; } return BLK_STS_OK; } -void blk_integrity_prepare(struct request *rq) +/* + * Advance @iter past the protection offset for protection formats that + * contain front padding on the metadata region. 
+ */ +static void blk_pi_advance_offset(struct blk_integrity *bi, + struct bio_integrity_payload *bip, + struct bvec_iter *iter) +{ + unsigned int offset = bi->pi_offset; + + while (offset > 0) { + struct bio_vec bv = mp_bvec_iter_bvec(bip->bip_vec, *iter); + unsigned int len = min(bv.bv_len, offset); + + bvec_iter_advance_single(bip->bip_vec, iter, len); + offset -= len; + } +} + +static void *blk_tuple_remap_begin(union pi_tuple *tuple, + struct blk_integrity *bi, + struct bio_integrity_payload *bip, + struct bvec_iter *iter) +{ + struct bvec_iter titer; + struct bio_vec pbv; + + blk_pi_advance_offset(bi, bip, iter); + pbv = bvec_iter_bvec(bip->bip_vec, *iter); + if (likely(pbv.bv_len >= bi->pi_tuple_size)) + return bvec_kmap_local(&pbv); + + /* + * We need to preserve the state of the original iter for the + * copy_from_tuple at the end, so make a temp iter for here. + */ + titer = *iter; + blk_integrity_copy_to_tuple(bip, &titer, tuple, bi->pi_tuple_size); + return tuple; +} + +static void blk_tuple_remap_end(union pi_tuple *tuple, void *ptuple, + struct blk_integrity *bi, + struct bio_integrity_payload *bip, + struct bvec_iter *iter) +{ + unsigned int len = bi->metadata_size - bi->pi_offset; + + if (likely(ptuple != tuple)) { + kunmap_local(ptuple); + } else { + blk_integrity_copy_from_tuple(bip, iter, ptuple, + bi->pi_tuple_size); + len -= bi->pi_tuple_size; + } + + bvec_iter_advance(bip->bip_vec, iter, len); +} + +static void blk_set_ext_unmap_ref(struct crc64_pi_tuple *pi, u64 virt, + u64 ref_tag) +{ + u64 ref = get_unaligned_be48(&pi->ref_tag); + + if (ref == lower_48_bits(ref_tag) && ref != lower_48_bits(virt)) + put_unaligned_be48(virt, pi->ref_tag); +} + +static void blk_set_t10_unmap_ref(struct t10_pi_tuple *pi, u32 virt, + u32 ref_tag) +{ + u32 ref = get_unaligned_be32(&pi->ref_tag); + + if (ref == ref_tag && ref != virt) + put_unaligned_be32(virt, &pi->ref_tag); +} + +static void blk_reftag_remap_complete(struct blk_integrity *bi, + union pi_tuple 
*tuple, u64 virt, u64 ref) +{ + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + blk_set_ext_unmap_ref(&tuple->crc64_pi, virt, ref); + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + blk_set_t10_unmap_ref(&tuple->t10_pi, virt, ref); + break; + default: + WARN_ON_ONCE(1); + break; + } +} + +static void blk_set_ext_map_ref(struct crc64_pi_tuple *pi, u64 virt, + u64 ref_tag) +{ + u64 ref = get_unaligned_be48(&pi->ref_tag); + + if (ref == lower_48_bits(virt) && ref != ref_tag) + put_unaligned_be48(ref_tag, pi->ref_tag); +} + +static void blk_set_t10_map_ref(struct t10_pi_tuple *pi, u32 virt, u32 ref_tag) +{ + u32 ref = get_unaligned_be32(&pi->ref_tag); + + if (ref == virt && ref != ref_tag) + put_unaligned_be32(ref_tag, &pi->ref_tag); +} + +static void blk_reftag_remap_prepare(struct blk_integrity *bi, + union pi_tuple *tuple, + u64 virt, u64 ref) +{ + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + blk_set_ext_map_ref(&tuple->crc64_pi, virt, ref); + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + blk_set_t10_map_ref(&tuple->t10_pi, virt, ref); + break; + default: + WARN_ON_ONCE(1); + break; + } +} + +static void __blk_reftag_remap(struct bio *bio, struct blk_integrity *bi, + unsigned *intervals, u64 *ref, bool prep) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct bvec_iter iter = bip->bip_iter; + u64 virt = bip_get_seed(bip); + union pi_tuple *ptuple; + union pi_tuple tuple; + + if (prep && bip->bip_flags & BIP_MAPPED_INTEGRITY) { + *ref += bio->bi_iter.bi_size >> bi->interval_exp; + return; + } + + while (iter.bi_size && *intervals) { + ptuple = blk_tuple_remap_begin(&tuple, bi, bip, &iter); + + if (prep) + blk_reftag_remap_prepare(bi, ptuple, virt, *ref); + else + blk_reftag_remap_complete(bi, ptuple, virt, *ref); + + blk_tuple_remap_end(&tuple, ptuple, bi, bip, &iter); + (*intervals)--; + (*ref)++; + virt++; + } + + if (prep) + bip->bip_flags |= BIP_MAPPED_INTEGRITY; +} + 
+static void blk_integrity_remap(struct request *rq, unsigned int nr_bytes, + bool prep) { struct blk_integrity *bi = &rq->q->limits.integrity; + u64 ref = blk_rq_pos(rq) >> (bi->interval_exp - SECTOR_SHIFT); + unsigned intervals = nr_bytes >> bi->interval_exp; + struct bio *bio; if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) return; - if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64) - ext_pi_type1_prepare(rq); - else - t10_pi_type1_prepare(rq); + __rq_for_each_bio(bio, rq) { + __blk_reftag_remap(bio, bi, &intervals, &ref, prep); + if (!intervals) + break; + } +} + +void blk_integrity_prepare(struct request *rq) +{ + blk_integrity_remap(rq, blk_rq_bytes(rq), true); } void blk_integrity_complete(struct request *rq, unsigned int nr_bytes) { - struct blk_integrity *bi = &rq->q->limits.integrity; - - if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) - return; - - if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64) - ext_pi_type1_complete(rq, nr_bytes); - else - t10_pi_type1_complete(rq, nr_bytes); + blk_integrity_remap(rq, nr_bytes, false); } diff --git a/crypto/Kconfig b/crypto/Kconfig index b4bb85e8e226..7e5ea61168c3 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -141,12 +141,6 @@ config CRYPTO_ACOMP select CRYPTO_ALGAPI select CRYPTO_ACOMP2 -config CRYPTO_HKDF - tristate - select CRYPTO_SHA256 if CRYPTO_SELFTESTS - select CRYPTO_SHA512 if CRYPTO_SELFTESTS - select CRYPTO_HASH2 - config CRYPTO_MANAGER tristate default CRYPTO_ALGAPI if CRYPTO_SELFTESTS diff --git a/crypto/Makefile b/crypto/Makefile index 04e269117589..8eb3f9a629d8 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -36,7 +36,6 @@ obj-$(CONFIG_CRYPTO_HASH2) += crypto_hash.o obj-$(CONFIG_CRYPTO_AKCIPHER2) += akcipher.o obj-$(CONFIG_CRYPTO_SIG2) += sig.o obj-$(CONFIG_CRYPTO_KPP2) += kpp.o -obj-$(CONFIG_CRYPTO_HKDF) += hkdf.o dh_generic-y := dh.o dh_generic-y += dh_helper.o diff --git a/crypto/hkdf.c b/crypto/hkdf.c deleted file mode 100644 index 82d1b32ca6ce..000000000000 --- a/crypto/hkdf.c +++ /dev/null @@ -1,573 
+0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation - * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): - * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". - * - * Copyright 2019 Google LLC - */ - -#include -#include -#include -#include - -/* - * HKDF consists of two steps: - * - * 1. HKDF-Extract: extract a pseudorandom key from the input keying material - * and optional salt. - * 2. HKDF-Expand: expand the pseudorandom key into output keying material of - * any length, parameterized by an application-specific info string. - * - */ - -/** - * hkdf_extract - HKDF-Extract (RFC 5869 section 2.2) - * @hmac_tfm: an HMAC transform using the hash function desired for HKDF. The - * caller is responsible for setting the @prk afterwards. - * @ikm: input keying material - * @ikmlen: length of @ikm - * @salt: input salt value - * @saltlen: length of @salt - * @prk: resulting pseudorandom key - * - * Extracts a pseudorandom key @prk from the input keying material - * @ikm with length @ikmlen and salt @salt with length @saltlen. - * The length of @prk is given by the digest size of @hmac_tfm. - * For an 'unsalted' version of HKDF-Extract @salt must be set - * to all zeroes and @saltlen must be set to the length of @prk. - * - * Returns 0 on success with the pseudorandom key stored in @prk, - * or a negative errno value otherwise. 
- */ -int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, - unsigned int ikmlen, const u8 *salt, unsigned int saltlen, - u8 *prk) -{ - int err; - - err = crypto_shash_setkey(hmac_tfm, salt, saltlen); - if (!err) - err = crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk); - - return err; -} -EXPORT_SYMBOL_GPL(hkdf_extract); - -/** - * hkdf_expand - HKDF-Expand (RFC 5869 section 2.3) - * @hmac_tfm: hash context keyed with pseudorandom key - * @info: application-specific information - * @infolen: length of @info - * @okm: output keying material - * @okmlen: length of @okm - * - * This expands the pseudorandom key, which was already keyed into @hmac_tfm, - * into @okmlen bytes of output keying material parameterized by the - * application-specific @info of length @infolen bytes. - * This is thread-safe and may be called by multiple threads in parallel. - * - * Returns 0 on success with output keying material stored in @okm, - * or a negative errno value otherwise. - */ -int hkdf_expand(struct crypto_shash *hmac_tfm, - const u8 *info, unsigned int infolen, - u8 *okm, unsigned int okmlen) -{ - SHASH_DESC_ON_STACK(desc, hmac_tfm); - unsigned int i, hashlen = crypto_shash_digestsize(hmac_tfm); - int err; - const u8 *prev = NULL; - u8 counter = 1; - u8 tmp[HASH_MAX_DIGESTSIZE] = {}; - - if (WARN_ON(okmlen > 255 * hashlen)) - return -EINVAL; - - desc->tfm = hmac_tfm; - - for (i = 0; i < okmlen; i += hashlen) { - err = crypto_shash_init(desc); - if (err) - goto out; - - if (prev) { - err = crypto_shash_update(desc, prev, hashlen); - if (err) - goto out; - } - - if (infolen) { - err = crypto_shash_update(desc, info, infolen); - if (err) - goto out; - } - - BUILD_BUG_ON(sizeof(counter) != 1); - if (okmlen - i < hashlen) { - err = crypto_shash_finup(desc, &counter, 1, tmp); - if (err) - goto out; - memcpy(&okm[i], tmp, okmlen - i); - memzero_explicit(tmp, sizeof(tmp)); - } else { - err = crypto_shash_finup(desc, &counter, 1, &okm[i]); - if (err) - goto out; - } - 
counter++; - prev = &okm[i]; - } - err = 0; -out: - if (unlikely(err)) - memzero_explicit(okm, okmlen); /* so caller doesn't need to */ - shash_desc_zero(desc); - memzero_explicit(tmp, HASH_MAX_DIGESTSIZE); - return err; -} -EXPORT_SYMBOL_GPL(hkdf_expand); - -struct hkdf_testvec { - const char *test; - const u8 *ikm; - const u8 *salt; - const u8 *info; - const u8 *prk; - const u8 *okm; - u16 ikm_size; - u16 salt_size; - u16 info_size; - u16 prk_size; - u16 okm_size; -}; - -/* - * HKDF test vectors from RFC5869 - * - * Additional HKDF test vectors from - * https://github.com/brycx/Test-Vector-Generation/blob/master/HKDF/hkdf-hmac-sha2-test-vectors.md - */ -static const struct hkdf_testvec hkdf_sha256_tv[] = { - { - .test = "basic hdkf test", - .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" - "\x0b\x0b\x0b\x0b\x0b\x0b", - .ikm_size = 22, - .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", - .salt_size = 13, - .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", - .info_size = 10, - .prk = "\x07\x77\x09\x36\x2c\x2e\x32\xdf\x0d\xdc\x3f\x0d\xc4\x7b\xba\x63" - "\x90\xb6\xc7\x3b\xb5\x0f\x9c\x31\x22\xec\x84\x4a\xd7\xc2\xb3\xe5", - .prk_size = 32, - .okm = "\x3c\xb2\x5f\x25\xfa\xac\xd5\x7a\x90\x43\x4f\x64\xd0\x36\x2f\x2a" - "\x2d\x2d\x0a\x90\xcf\x1a\x5a\x4c\x5d\xb0\x2d\x56\xec\xc4\xc5\xbf" - "\x34\x00\x72\x08\xd5\xb8\x87\x18\x58\x65", - .okm_size = 42, - }, { - .test = "hkdf test with long input", - .ikm = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" - "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" - "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" - "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" - "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f", - .ikm_size = 80, - .salt = "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" - "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" - 
"\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" - "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" - "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf", - .salt_size = 80, - .info = "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" - "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" - "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" - "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef" - "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", - .info_size = 80, - .prk = "\x06\xa6\xb8\x8c\x58\x53\x36\x1a\x06\x10\x4c\x9c\xeb\x35\xb4\x5c" - "\xef\x76\x00\x14\x90\x46\x71\x01\x4a\x19\x3f\x40\xc1\x5f\xc2\x44", - .prk_size = 32, - .okm = "\xb1\x1e\x39\x8d\xc8\x03\x27\xa1\xc8\xe7\xf7\x8c\x59\x6a\x49\x34" - "\x4f\x01\x2e\xda\x2d\x4e\xfa\xd8\xa0\x50\xcc\x4c\x19\xaf\xa9\x7c" - "\x59\x04\x5a\x99\xca\xc7\x82\x72\x71\xcb\x41\xc6\x5e\x59\x0e\x09" - "\xda\x32\x75\x60\x0c\x2f\x09\xb8\x36\x77\x93\xa9\xac\xa3\xdb\x71" - "\xcc\x30\xc5\x81\x79\xec\x3e\x87\xc1\x4c\x01\xd5\xc1\xf3\x43\x4f" - "\x1d\x87", - .okm_size = 82, - }, { - .test = "hkdf test with zero salt and info", - .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" - "\x0b\x0b\x0b\x0b\x0b\x0b", - .ikm_size = 22, - .salt = NULL, - .salt_size = 0, - .info = NULL, - .info_size = 0, - .prk = "\x19\xef\x24\xa3\x2c\x71\x7b\x16\x7f\x33\xa9\x1d\x6f\x64\x8b\xdf" - "\x96\x59\x67\x76\xaf\xdb\x63\x77\xac\x43\x4c\x1c\x29\x3c\xcb\x04", - .prk_size = 32, - .okm = "\x8d\xa4\xe7\x75\xa5\x63\xc1\x8f\x71\x5f\x80\x2a\x06\x3c\x5a\x31" - "\xb8\xa1\x1f\x5c\x5e\xe1\x87\x9e\xc3\x45\x4e\x5f\x3c\x73\x8d\x2d" - "\x9d\x20\x13\x95\xfa\xa4\xb6\x1a\x96\xc8", - .okm_size = 42, - }, { - .test = "hkdf test with short input", - .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b", - .ikm_size = 11, - .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", - .salt_size = 13, - .info = 
"\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", - .info_size = 10, - .prk = "\x82\x65\xf6\x9d\x7f\xf7\xe5\x01\x37\x93\x01\x5c\xa0\xef\x92\x0c" - "\xb1\x68\x21\x99\xc8\xbc\x3a\x00\xda\x0c\xab\x47\xb7\xb0\x0f\xdf", - .prk_size = 32, - .okm = "\x58\xdc\xe1\x0d\x58\x01\xcd\xfd\xa8\x31\x72\x6b\xfe\xbc\xb7\x43" - "\xd1\x4a\x7e\xe8\x3a\xa0\x57\xa9\x3d\x59\xb0\xa1\x31\x7f\xf0\x9d" - "\x10\x5c\xce\xcf\x53\x56\x92\xb1\x4d\xd5", - .okm_size = 42, - }, { - .test = "unsalted hkdf test with zero info", - .ikm = "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c" - "\x0c\x0c\x0c\x0c\x0c\x0c", - .ikm_size = 22, - .salt = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", - .salt_size = 32, - .info = NULL, - .info_size = 0, - .prk = "\xaa\x84\x1e\x1f\x35\x74\xf3\x2d\x13\xfb\xa8\x00\x5f\xcd\x9b\x8d" - "\x77\x67\x82\xa5\xdf\xa1\x92\x38\x92\xfd\x8b\x63\x5d\x3a\x89\xdf", - .prk_size = 32, - .okm = "\x59\x68\x99\x17\x9a\xb1\xbc\x00\xa7\xc0\x37\x86\xff\x43\xee\x53" - "\x50\x04\xbe\x2b\xb9\xbe\x68\xbc\x14\x06\x63\x6f\x54\xbd\x33\x8a" - "\x66\xa2\x37\xba\x2a\xcb\xce\xe3\xc9\xa7", - .okm_size = 42, - } -}; - -static const struct hkdf_testvec hkdf_sha384_tv[] = { - { - .test = "basic hkdf test", - .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" - "\x0b\x0b\x0b\x0b\x0b\x0b", - .ikm_size = 22, - .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", - .salt_size = 13, - .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", - .info_size = 10, - .prk = "\x70\x4b\x39\x99\x07\x79\xce\x1d\xc5\x48\x05\x2c\x7d\xc3\x9f\x30" - "\x35\x70\xdd\x13\xfb\x39\xf7\xac\xc5\x64\x68\x0b\xef\x80\xe8\xde" - "\xc7\x0e\xe9\xa7\xe1\xf3\xe2\x93\xef\x68\xec\xeb\x07\x2a\x5a\xde", - .prk_size = 48, - .okm = "\x9b\x50\x97\xa8\x60\x38\xb8\x05\x30\x90\x76\xa4\x4b\x3a\x9f\x38" - "\x06\x3e\x25\xb5\x16\xdc\xbf\x36\x9f\x39\x4c\xfa\xb4\x36\x85\xf7" - "\x48\xb6\x45\x77\x63\xe4\xf0\x20\x4f\xc5", - 
.okm_size = 42, - }, { - .test = "hkdf test with long input", - .ikm = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" - "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" - "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" - "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" - "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f", - .ikm_size = 80, - .salt = "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" - "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" - "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" - "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" - "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf", - .salt_size = 80, - .info = "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" - "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" - "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" - "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef" - "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", - .info_size = 80, - .prk = "\xb3\x19\xf6\x83\x1d\xff\x93\x14\xef\xb6\x43\xba\xa2\x92\x63\xb3" - "\x0e\x4a\x8d\x77\x9f\xe3\x1e\x9c\x90\x1e\xfd\x7d\xe7\x37\xc8\x5b" - "\x62\xe6\x76\xd4\xdc\x87\xb0\x89\x5c\x6a\x7d\xc9\x7b\x52\xce\xbb", - .prk_size = 48, - .okm = "\x48\x4c\xa0\x52\xb8\xcc\x72\x4f\xd1\xc4\xec\x64\xd5\x7b\x4e\x81" - "\x8c\x7e\x25\xa8\xe0\xf4\x56\x9e\xd7\x2a\x6a\x05\xfe\x06\x49\xee" - "\xbf\x69\xf8\xd5\xc8\x32\x85\x6b\xf4\xe4\xfb\xc1\x79\x67\xd5\x49" - "\x75\x32\x4a\x94\x98\x7f\x7f\x41\x83\x58\x17\xd8\x99\x4f\xdb\xd6" - "\xf4\xc0\x9c\x55\x00\xdc\xa2\x4a\x56\x22\x2f\xea\x53\xd8\x96\x7a" - "\x8b\x2e", - .okm_size = 82, - }, { - .test = "hkdf test with zero salt and info", - .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" - "\x0b\x0b\x0b\x0b\x0b\x0b", - .ikm_size = 22, - .salt = NULL, - 
.salt_size = 0, - .info = NULL, - .info_size = 0, - .prk = "\x10\xe4\x0c\xf0\x72\xa4\xc5\x62\x6e\x43\xdd\x22\xc1\xcf\x72\x7d" - "\x4b\xb1\x40\x97\x5c\x9a\xd0\xcb\xc8\xe4\x5b\x40\x06\x8f\x8f\x0b" - "\xa5\x7c\xdb\x59\x8a\xf9\xdf\xa6\x96\x3a\x96\x89\x9a\xf0\x47\xe5", - .prk_size = 48, - .okm = "\xc8\xc9\x6e\x71\x0f\x89\xb0\xd7\x99\x0b\xca\x68\xbc\xde\xc8\xcf" - "\x85\x40\x62\xe5\x4c\x73\xa7\xab\xc7\x43\xfa\xde\x9b\x24\x2d\xaa" - "\xcc\x1c\xea\x56\x70\x41\x5b\x52\x84\x9c", - .okm_size = 42, - }, { - .test = "hkdf test with short input", - .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b", - .ikm_size = 11, - .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", - .salt_size = 13, - .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", - .info_size = 10, - .prk = "\x6d\x31\x69\x98\x28\x79\x80\x88\xb3\x59\xda\xd5\x0b\x8f\x01\xb0" - "\x15\xf1\x7a\xa3\xbd\x4e\x27\xa6\xe9\xf8\x73\xb7\x15\x85\xca\x6a" - "\x00\xd1\xf0\x82\x12\x8a\xdb\x3c\xf0\x53\x0b\x57\xc0\xf9\xac\x72", - .prk_size = 48, - .okm = "\xfb\x7e\x67\x43\xeb\x42\xcd\xe9\x6f\x1b\x70\x77\x89\x52\xab\x75" - "\x48\xca\xfe\x53\x24\x9f\x7f\xfe\x14\x97\xa1\x63\x5b\x20\x1f\xf1" - "\x85\xb9\x3e\x95\x19\x92\xd8\x58\xf1\x1a", - .okm_size = 42, - }, { - .test = "unsalted hkdf test with zero info", - .ikm = "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c" - "\x0c\x0c\x0c\x0c\x0c\x0c", - .ikm_size = 22, - .salt = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", - .salt_size = 48, - .info = NULL, - .info_size = 0, - .prk = "\x9d\x2d\xa5\x06\x6f\x05\xd1\x6c\x59\xfe\xdf\x6c\x5f\x32\xc7\x5e" - "\xda\x9a\x47\xa7\x9c\x93\x6a\xa4\x4c\xb7\x63\xa8\xe2\x2f\xfb\xfc" - "\xd8\xfe\x55\x43\x58\x53\x47\x21\x90\x39\xd1\x68\x28\x36\x33\xf5", - .prk_size = 48, - .okm = "\x6a\xd7\xc7\x26\xc8\x40\x09\x54\x6a\x76\xe0\x54\x5d\xf2\x66\x78" - 
"\x7e\x2b\x2c\xd6\xca\x43\x73\xa1\xf3\x14\x50\xa7\xbd\xf9\x48\x2b" - "\xfa\xb8\x11\xf5\x54\x20\x0e\xad\x8f\x53", - .okm_size = 42, - } -}; - -static const struct hkdf_testvec hkdf_sha512_tv[] = { - { - .test = "basic hkdf test", - .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" - "\x0b\x0b\x0b\x0b\x0b\x0b", - .ikm_size = 22, - .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", - .salt_size = 13, - .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", - .info_size = 10, - .prk = "\x66\x57\x99\x82\x37\x37\xde\xd0\x4a\x88\xe4\x7e\x54\xa5\x89\x0b" - "\xb2\xc3\xd2\x47\xc7\xa4\x25\x4a\x8e\x61\x35\x07\x23\x59\x0a\x26" - "\xc3\x62\x38\x12\x7d\x86\x61\xb8\x8c\xf8\x0e\xf8\x02\xd5\x7e\x2f" - "\x7c\xeb\xcf\x1e\x00\xe0\x83\x84\x8b\xe1\x99\x29\xc6\x1b\x42\x37", - .prk_size = 64, - .okm = "\x83\x23\x90\x08\x6c\xda\x71\xfb\x47\x62\x5b\xb5\xce\xb1\x68\xe4" - "\xc8\xe2\x6a\x1a\x16\xed\x34\xd9\xfc\x7f\xe9\x2c\x14\x81\x57\x93" - "\x38\xda\x36\x2c\xb8\xd9\xf9\x25\xd7\xcb", - .okm_size = 42, - }, { - .test = "hkdf test with long input", - .ikm = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" - "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" - "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" - "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" - "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f", - .ikm_size = 80, - .salt = "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" - "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" - "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" - "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" - "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf", - .salt_size = 80, - .info = "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" - "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" - 
"\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" - "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef" - "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", - .info_size = 80, - .prk = "\x35\x67\x25\x42\x90\x7d\x4e\x14\x2c\x00\xe8\x44\x99\xe7\x4e\x1d" - "\xe0\x8b\xe8\x65\x35\xf9\x24\xe0\x22\x80\x4a\xd7\x75\xdd\xe2\x7e" - "\xc8\x6c\xd1\xe5\xb7\xd1\x78\xc7\x44\x89\xbd\xbe\xb3\x07\x12\xbe" - "\xb8\x2d\x4f\x97\x41\x6c\x5a\x94\xea\x81\xeb\xdf\x3e\x62\x9e\x4a", - .prk_size = 64, - .okm = "\xce\x6c\x97\x19\x28\x05\xb3\x46\xe6\x16\x1e\x82\x1e\xd1\x65\x67" - "\x3b\x84\xf4\x00\xa2\xb5\x14\xb2\xfe\x23\xd8\x4c\xd1\x89\xdd\xf1" - "\xb6\x95\xb4\x8c\xbd\x1c\x83\x88\x44\x11\x37\xb3\xce\x28\xf1\x6a" - "\xa6\x4b\xa3\x3b\xa4\x66\xb2\x4d\xf6\xcf\xcb\x02\x1e\xcf\xf2\x35" - "\xf6\xa2\x05\x6c\xe3\xaf\x1d\xe4\x4d\x57\x20\x97\xa8\x50\x5d\x9e" - "\x7a\x93", - .okm_size = 82, - }, { - .test = "hkdf test with zero salt and info", - .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" - "\x0b\x0b\x0b\x0b\x0b\x0b", - .ikm_size = 22, - .salt = NULL, - .salt_size = 0, - .info = NULL, - .info_size = 0, - .prk = "\xfd\x20\x0c\x49\x87\xac\x49\x13\x13\xbd\x4a\x2a\x13\x28\x71\x21" - "\x24\x72\x39\xe1\x1c\x9e\xf8\x28\x02\x04\x4b\x66\xef\x35\x7e\x5b" - "\x19\x44\x98\xd0\x68\x26\x11\x38\x23\x48\x57\x2a\x7b\x16\x11\xde" - "\x54\x76\x40\x94\x28\x63\x20\x57\x8a\x86\x3f\x36\x56\x2b\x0d\xf6", - .prk_size = 64, - .okm = "\xf5\xfa\x02\xb1\x82\x98\xa7\x2a\x8c\x23\x89\x8a\x87\x03\x47\x2c" - "\x6e\xb1\x79\xdc\x20\x4c\x03\x42\x5c\x97\x0e\x3b\x16\x4b\xf9\x0f" - "\xff\x22\xd0\x48\x36\xd0\xe2\x34\x3b\xac", - .okm_size = 42, - }, { - .test = "hkdf test with short input", - .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b", - .ikm_size = 11, - .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", - .salt_size = 13, - .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", - .info_size = 10, - .prk = 
"\x67\x40\x9c\x9c\xac\x28\xb5\x2e\xe9\xfa\xd9\x1c\x2f\xda\x99\x9f" - "\x7c\xa2\x2e\x34\x34\xf0\xae\x77\x28\x63\x83\x65\x68\xad\x6a\x7f" - "\x10\xcf\x11\x3b\xfd\xdd\x56\x01\x29\xa5\x94\xa8\xf5\x23\x85\xc2" - "\xd6\x61\xd7\x85\xd2\x9c\xe9\x3a\x11\x40\x0c\x92\x06\x83\x18\x1d", - .prk_size = 64, - .okm = "\x74\x13\xe8\x99\x7e\x02\x06\x10\xfb\xf6\x82\x3f\x2c\xe1\x4b\xff" - "\x01\x87\x5d\xb1\xca\x55\xf6\x8c\xfc\xf3\x95\x4d\xc8\xaf\xf5\x35" - "\x59\xbd\x5e\x30\x28\xb0\x80\xf7\xc0\x68", - .okm_size = 42, - }, { - .test = "unsalted hkdf test with zero info", - .ikm = "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c" - "\x0c\x0c\x0c\x0c\x0c\x0c", - .ikm_size = 22, - .salt = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", - .salt_size = 64, - .info = NULL, - .info_size = 0, - .prk = "\x53\x46\xb3\x76\xbf\x3a\xa9\xf8\x4f\x8f\x6e\xd5\xb1\xc4\xf4\x89" - "\x17\x2e\x24\x4d\xac\x30\x3d\x12\xf6\x8e\xcc\x76\x6e\xa6\x00\xaa" - "\x88\x49\x5e\x7f\xb6\x05\x80\x31\x22\xfa\x13\x69\x24\xa8\x40\xb1" - "\xf0\x71\x9d\x2d\x5f\x68\xe2\x9b\x24\x22\x99\xd7\x58\xed\x68\x0c", - .prk_size = 64, - .okm = "\x14\x07\xd4\x60\x13\xd9\x8b\xc6\xde\xce\xfc\xfe\xe5\x5f\x0f\x90" - "\xb0\xc7\xf6\x3d\x68\xeb\x1a\x80\xea\xf0\x7e\x95\x3c\xfc\x0a\x3a" - "\x52\x40\xa1\x55\xd6\xe4\xda\xa9\x65\xbb", - .okm_size = 42, - } -}; - -static int hkdf_test(const char *shash, const struct hkdf_testvec *tv) -{ struct crypto_shash *tfm = NULL; - u8 *prk = NULL, *okm = NULL; - unsigned int prk_size; - const char *driver; - int err; - - tfm = crypto_alloc_shash(shash, 0, 0); - if (IS_ERR(tfm)) { - pr_err("%s(%s): failed to allocate transform: %ld\n", - tv->test, shash, PTR_ERR(tfm)); - return PTR_ERR(tfm); - } - driver = crypto_shash_driver_name(tfm); - - prk_size = 
crypto_shash_digestsize(tfm); - prk = kzalloc(prk_size, GFP_KERNEL); - if (!prk) { - err = -ENOMEM; - goto out_free; - } - - if (tv->prk_size != prk_size) { - pr_err("%s(%s): prk size mismatch (vec %u, digest %u\n", - tv->test, driver, tv->prk_size, prk_size); - err = -EINVAL; - goto out_free; - } - - err = hkdf_extract(tfm, tv->ikm, tv->ikm_size, - tv->salt, tv->salt_size, prk); - if (err) { - pr_err("%s(%s): hkdf_extract failed with %d\n", - tv->test, driver, err); - goto out_free; - } - - if (memcmp(prk, tv->prk, tv->prk_size)) { - pr_err("%s(%s): hkdf_extract prk mismatch\n", - tv->test, driver); - print_hex_dump(KERN_ERR, "prk: ", DUMP_PREFIX_NONE, - 16, 1, prk, tv->prk_size, false); - err = -EINVAL; - goto out_free; - } - - okm = kzalloc(tv->okm_size, GFP_KERNEL); - if (!okm) { - err = -ENOMEM; - goto out_free; - } - - err = crypto_shash_setkey(tfm, tv->prk, tv->prk_size); - if (err) { - pr_err("%s(%s): failed to set prk, error %d\n", - tv->test, driver, err); - goto out_free; - } - - err = hkdf_expand(tfm, tv->info, tv->info_size, - okm, tv->okm_size); - if (err) { - pr_err("%s(%s): hkdf_expand() failed with %d\n", - tv->test, driver, err); - } else if (memcmp(okm, tv->okm, tv->okm_size)) { - pr_err("%s(%s): hkdf_expand() okm mismatch\n", - tv->test, driver); - print_hex_dump(KERN_ERR, "okm: ", DUMP_PREFIX_NONE, - 16, 1, okm, tv->okm_size, false); - err = -EINVAL; - } -out_free: - kfree(okm); - kfree(prk); - crypto_free_shash(tfm); - return err; -} - -static int __init crypto_hkdf_module_init(void) -{ - int ret = 0, i; - - if (!IS_ENABLED(CONFIG_CRYPTO_SELFTESTS)) - return 0; - - for (i = 0; i < ARRAY_SIZE(hkdf_sha256_tv); i++) { - ret = hkdf_test("hmac(sha256)", &hkdf_sha256_tv[i]); - if (ret) - return ret; - } - for (i = 0; i < ARRAY_SIZE(hkdf_sha384_tv); i++) { - ret = hkdf_test("hmac(sha384)", &hkdf_sha384_tv[i]); - if (ret) - return ret; - } - for (i = 0; i < ARRAY_SIZE(hkdf_sha512_tv); i++) { - ret = hkdf_test("hmac(sha512)", &hkdf_sha512_tv[i]); - if 
(ret) - return ret; - } - return 0; -} - -static void __exit crypto_hkdf_module_exit(void) {} - -late_initcall(crypto_hkdf_module_init); -module_exit(crypto_hkdf_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("HMAC-based Key Derivation Function (HKDF)"); diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile index 67a8b352a1d5..187eaf81f0f8 100644 --- a/drivers/block/drbd/Makefile +++ b/drivers/block/drbd/Makefile @@ -3,7 +3,6 @@ drbd-y := drbd_buildtag.o drbd_bitmap.o drbd_proc.o drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o drbd-y += drbd_main.o drbd_strings.o drbd_nl.o drbd-y += drbd_interval.o drbd_state.o -drbd-y += drbd_nla.o drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 200d464e984b..b1a721dd0496 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -874,7 +874,7 @@ void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device) if (uuid && uuid != UUID_JUST_CREATED) uuid = uuid + UUID_NEW_BM_OFFSET; else - get_random_bytes(&uuid, sizeof(u64)); + uuid = get_random_u64(); drbd_uuid_set(device, UI_BITMAP, uuid); drbd_print_uuids(device, "updated sync UUID"); drbd_md_sync(device); @@ -3337,7 +3337,7 @@ void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local) u64 val; unsigned long long bm_uuid; - get_random_bytes(&val, sizeof(u64)); + val = get_random_u64(); spin_lock_irq(&device->ldev->md.uuid_lock); bm_uuid = device->ldev->md.uuid[UI_BITMAP]; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index e201f0087a0f..c2ac555473e7 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -74,7 +74,15 @@ int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb); int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb); #include -#include "drbd_nla.h" + +static int 
drbd_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +static void drbd_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); + +#define GENL_MAGIC_FAMILY_PRE_DOIT drbd_pre_doit +#define GENL_MAGIC_FAMILY_POST_DOIT drbd_post_doit + #include static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */ @@ -144,18 +152,46 @@ static int drbd_msg_sprintf_info(struct sk_buff *skb, const char *fmt, ...) return 0; } -/* This would be a good candidate for a "pre_doit" hook, - * and per-family private info->pointers. - * But we need to stay compatible with older kernels. - * If it returns successfully, adm_ctx members are valid. - * +/* Flags for drbd_adm_prepare() */ +#define DRBD_ADM_NEED_MINOR (1 << 0) +#define DRBD_ADM_NEED_RESOURCE (1 << 1) +#define DRBD_ADM_NEED_CONNECTION (1 << 2) + +/* Per-command flags for drbd_pre_doit() */ +static const unsigned int drbd_genl_cmd_flags[] = { + [DRBD_ADM_GET_STATUS] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_NEW_MINOR] = DRBD_ADM_NEED_RESOURCE, + [DRBD_ADM_DEL_MINOR] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_NEW_RESOURCE] = 0, + [DRBD_ADM_DEL_RESOURCE] = DRBD_ADM_NEED_RESOURCE, + [DRBD_ADM_RESOURCE_OPTS] = DRBD_ADM_NEED_RESOURCE, + [DRBD_ADM_CONNECT] = DRBD_ADM_NEED_RESOURCE, + [DRBD_ADM_CHG_NET_OPTS] = DRBD_ADM_NEED_CONNECTION, + [DRBD_ADM_DISCONNECT] = DRBD_ADM_NEED_CONNECTION, + [DRBD_ADM_ATTACH] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_CHG_DISK_OPTS] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_RESIZE] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_PRIMARY] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_SECONDARY] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_NEW_C_UUID] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_START_OV] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_DETACH] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_INVALIDATE] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_INVAL_PEER] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_PAUSE_SYNC] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_RESUME_SYNC] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_SUSPEND_IO] = DRBD_ADM_NEED_MINOR, + 
[DRBD_ADM_RESUME_IO] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_OUTDATE] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_GET_TIMEOUT_TYPE] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_DOWN] = DRBD_ADM_NEED_RESOURCE, +}; + +/* * At this point, we still rely on the global genl_lock(). * If we want to avoid that, and allow "genl_family.parallel_ops", we may need * to add additional synchronization against object destruction/modification. */ -#define DRBD_ADM_NEED_MINOR 1 -#define DRBD_ADM_NEED_RESOURCE 2 -#define DRBD_ADM_NEED_CONNECTION 4 static int drbd_adm_prepare(struct drbd_config_context *adm_ctx, struct sk_buff *skb, struct genl_info *info, unsigned flags) { @@ -163,8 +199,6 @@ static int drbd_adm_prepare(struct drbd_config_context *adm_ctx, const u8 cmd = info->genlhdr->cmd; int err; - memset(adm_ctx, 0, sizeof(*adm_ctx)); - /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */ if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN)) return -EPERM; @@ -204,14 +238,14 @@ static int drbd_adm_prepare(struct drbd_config_context *adm_ctx, goto fail; /* and assign stuff to the adm_ctx */ - nla = nested_attr_tb[__nla_type(T_ctx_volume)]; + nla = nested_attr_tb[T_ctx_volume]; if (nla) adm_ctx->volume = nla_get_u32(nla); - nla = nested_attr_tb[__nla_type(T_ctx_resource_name)]; + nla = nested_attr_tb[T_ctx_resource_name]; if (nla) adm_ctx->resource_name = nla_data(nla); - adm_ctx->my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; - adm_ctx->peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; + adm_ctx->my_addr = nested_attr_tb[T_ctx_my_addr]; + adm_ctx->peer_addr = nested_attr_tb[T_ctx_peer_addr]; if ((adm_ctx->my_addr && nla_len(adm_ctx->my_addr) > sizeof(adm_ctx->connection->my_addr)) || (adm_ctx->peer_addr && @@ -300,9 +334,45 @@ fail: return err; } -static int drbd_adm_finish(struct drbd_config_context *adm_ctx, - struct genl_info *info, int retcode) +static int drbd_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) { + struct 
drbd_config_context *adm_ctx; + u8 cmd = info->genlhdr->cmd; + unsigned int flags; + int err; + + adm_ctx = kzalloc_obj(*adm_ctx); + if (!adm_ctx) + return -ENOMEM; + + flags = (cmd < ARRAY_SIZE(drbd_genl_cmd_flags)) + ? drbd_genl_cmd_flags[cmd] : 0; + + err = drbd_adm_prepare(adm_ctx, skb, info, flags); + if (err && !adm_ctx->reply_skb) { + /* Fatal error before reply_skb was allocated. */ + kfree(adm_ctx); + return err; + } + if (err) + adm_ctx->reply_dh->ret_code = err; + + info->user_ptr[0] = adm_ctx; + return 0; +} + +static void drbd_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context *adm_ctx = info->user_ptr[0]; + + if (!adm_ctx) + return; + + if (adm_ctx->reply_skb) + drbd_adm_send_reply(adm_ctx->reply_skb, info); + if (adm_ctx->device) { kref_put(&adm_ctx->device->kref, drbd_destroy_device); adm_ctx->device = NULL; @@ -316,12 +386,7 @@ static int drbd_adm_finish(struct drbd_config_context *adm_ctx, adm_ctx->resource = NULL; } - if (!adm_ctx->reply_skb) - return -ENOMEM; - - adm_ctx->reply_dh->ret_code = retcode; - drbd_adm_send_reply(adm_ctx->reply_skb, info); - return 0; + kfree(adm_ctx); } static void setup_khelper_env(struct drbd_connection *connection, char **envp) @@ -759,22 +824,21 @@ out: static const char *from_attrs_err_to_txt(int err) { return err == -ENOMSG ? "required attribute missing" : - err == -EOPNOTSUPP ? "unknown mandatory attribute" : err == -EEXIST ? 
"can not change invariant setting" : "invalid attribute value"; } int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct set_role_parms parms; int err; enum drbd_ret_code retcode; enum drbd_state_rv rv; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; @@ -783,24 +847,24 @@ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) err = set_role_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto out; } } genl_unlock(); - mutex_lock(&adm_ctx.resource->adm_mutex); + mutex_lock(&adm_ctx->resource->adm_mutex); if (info->genlhdr->cmd == DRBD_ADM_PRIMARY) - rv = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate); + rv = drbd_set_role(adm_ctx->device, R_PRIMARY, parms.assume_uptodate); else - rv = drbd_set_role(adm_ctx.device, R_SECONDARY, 0); + rv = drbd_set_role(adm_ctx->device, R_SECONDARY, 0); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); genl_lock(); - drbd_adm_finish(&adm_ctx, info, rv); + adm_ctx->reply_dh->ret_code = rv; return 0; out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -1512,7 +1576,7 @@ out: int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; struct drbd_device *device; struct disk_conf *new_disk_conf, *old_disk_conf; @@ -1520,14 +1584,14 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) int err; unsigned int 
fifo_size; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto finish; - device = adm_ctx.device; - mutex_lock(&adm_ctx.resource->adm_mutex); + device = adm_ctx->device; + mutex_lock(&adm_ctx->resource->adm_mutex); /* we also need a disk * to change the options on */ @@ -1551,7 +1615,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) err = disk_conf_from_attrs_for_change(new_disk_conf, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto fail_unlock; } @@ -1577,7 +1641,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) if (err) { /* Could be just "busy". Ignore? * Introduce dedicated error code? */ - drbd_msg_put_info(adm_ctx.reply_skb, + drbd_msg_put_info(adm_ctx->reply_skb, "Try again without changing current al-extents setting"); retcode = ERR_NOMEM; goto fail_unlock; @@ -1640,9 +1704,9 @@ fail_unlock: success: put_ldev(device); out: - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); finish: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -1734,7 +1798,7 @@ void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev * int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct drbd_device *device; struct drbd_peer_device *peer_device; struct drbd_connection *connection; @@ -1751,14 +1815,14 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) enum drbd_state_rv rv; struct net_conf *nc; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if 
(!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto finish; - device = adm_ctx.device; - mutex_lock(&adm_ctx.resource->adm_mutex); + device = adm_ctx->device; + mutex_lock(&adm_ctx->resource->adm_mutex); peer_device = first_peer_device(device); connection = peer_device->connection; conn_reconfig_start(connection); @@ -1803,7 +1867,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) err = disk_conf_from_attrs(new_disk_conf, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto fail; } @@ -1954,7 +2018,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) drbd_warn(device, "truncating a consistent device during attach (%llu < %llu)\n", nsz, eff); } else { drbd_warn(device, "refusing to truncate a consistent device (%llu < %llu)\n", nsz, eff); - drbd_msg_sprintf_info(adm_ctx.reply_skb, + drbd_msg_sprintf_info(adm_ctx->reply_skb, "To-be-attached device has last effective > current size, and is consistent\n" "(%llu > %llu sectors). 
Refusing to attach.", eff, nsz); retcode = ERR_IMPLICIT_SHRINK; @@ -2130,8 +2194,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE); put_ldev(device); conn_reconfig_done(connection); - mutex_unlock(&adm_ctx.resource->adm_mutex); - drbd_adm_finish(&adm_ctx, info, retcode); + mutex_unlock(&adm_ctx->resource->adm_mutex); + adm_ctx->reply_dh->ret_code = retcode; return 0; force_diskless_dec: @@ -2150,9 +2214,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) kfree(new_disk_conf); lc_destroy(resync_lru); kfree(new_plan); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); finish: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -2174,14 +2238,14 @@ static int adm_detach(struct drbd_device *device, int force) * Only then we have finally detached. */ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; struct detach_parms parms = { }; int err; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; @@ -2189,16 +2253,16 @@ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) err = detach_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto out; } } - mutex_lock(&adm_ctx.resource->adm_mutex); - retcode = adm_detach(adm_ctx.device, parms.force_detach); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_lock(&adm_ctx->resource->adm_mutex); + retcode = adm_detach(adm_ctx->device, parms.force_detach); + 
mutex_unlock(&adm_ctx->resource->adm_mutex); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -2372,7 +2436,7 @@ static void free_crypto(struct crypto *crypto) int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; struct drbd_connection *connection; struct net_conf *old_net_conf, *new_net_conf = NULL; @@ -2381,14 +2445,14 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) int rsr; /* re-sync running */ struct crypto crypto = { }; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto finish; - connection = adm_ctx.connection; - mutex_lock(&adm_ctx.resource->adm_mutex); + connection = adm_ctx->connection; + mutex_lock(&adm_ctx->resource->adm_mutex); new_net_conf = kzalloc_obj(struct net_conf); if (!new_net_conf) { @@ -2403,7 +2467,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) old_net_conf = connection->net_conf; if (!old_net_conf) { - drbd_msg_put_info(adm_ctx.reply_skb, "net conf missing, try connect"); + drbd_msg_put_info(adm_ctx->reply_skb, "net conf missing, try connect"); retcode = ERR_INVALID_REQUEST; goto fail; } @@ -2415,7 +2479,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) err = net_conf_from_attrs_for_change(new_net_conf, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto fail; } @@ -2485,9 +2549,9 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) done: conn_reconfig_done(connection); out: - mutex_unlock(&adm_ctx.resource->adm_mutex); + 
mutex_unlock(&adm_ctx->resource->adm_mutex); finish: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -2516,7 +2580,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) struct connection_info connection_info; enum drbd_notification_type flags; unsigned int peer_devices = 0; - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct drbd_peer_device *peer_device; struct net_conf *old_net_conf, *new_net_conf = NULL; struct crypto crypto = { }; @@ -2527,14 +2591,13 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) int i; int err; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); - - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) { - drbd_msg_put_info(adm_ctx.reply_skb, "connection endpoint(s) missing"); + if (!(adm_ctx->my_addr && adm_ctx->peer_addr)) { + drbd_msg_put_info(adm_ctx->reply_skb, "connection endpoint(s) missing"); retcode = ERR_INVALID_REQUEST; goto out; } @@ -2544,15 +2607,15 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) * concurrent reconfiguration/addition/deletion */ for_each_resource(resource, &drbd_resources) { for_each_connection(connection, resource) { - if (nla_len(adm_ctx.my_addr) == connection->my_addr_len && - !memcmp(nla_data(adm_ctx.my_addr), &connection->my_addr, + if (nla_len(adm_ctx->my_addr) == connection->my_addr_len && + !memcmp(nla_data(adm_ctx->my_addr), &connection->my_addr, connection->my_addr_len)) { retcode = ERR_LOCAL_ADDR; goto out; } - if (nla_len(adm_ctx.peer_addr) == connection->peer_addr_len && - !memcmp(nla_data(adm_ctx.peer_addr), &connection->peer_addr, + if (nla_len(adm_ctx->peer_addr) == connection->peer_addr_len && + !memcmp(nla_data(adm_ctx->peer_addr), &connection->peer_addr, 
connection->peer_addr_len)) { retcode = ERR_PEER_ADDR; goto out; @@ -2560,8 +2623,8 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) } } - mutex_lock(&adm_ctx.resource->adm_mutex); - connection = first_connection(adm_ctx.resource); + mutex_lock(&adm_ctx->resource->adm_mutex); + connection = first_connection(adm_ctx->resource); conn_reconfig_start(connection); if (connection->cstate > C_STANDALONE) { @@ -2581,7 +2644,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) err = net_conf_from_attrs(new_net_conf, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto fail; } @@ -2597,11 +2660,11 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) drbd_flush_workqueue(&connection->sender_work); - mutex_lock(&adm_ctx.resource->conf_update); + mutex_lock(&adm_ctx->resource->conf_update); old_net_conf = connection->net_conf; if (old_net_conf) { retcode = ERR_NET_CONFIGURED; - mutex_unlock(&adm_ctx.resource->conf_update); + mutex_unlock(&adm_ctx->resource->conf_update); goto fail; } rcu_assign_pointer(connection->net_conf, new_net_conf); @@ -2612,10 +2675,10 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) connection->csums_tfm = crypto.csums_tfm; connection->verify_tfm = crypto.verify_tfm; - connection->my_addr_len = nla_len(adm_ctx.my_addr); - memcpy(&connection->my_addr, nla_data(adm_ctx.my_addr), connection->my_addr_len); - connection->peer_addr_len = nla_len(adm_ctx.peer_addr); - memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len); + connection->my_addr_len = nla_len(adm_ctx->my_addr); + memcpy(&connection->my_addr, nla_data(adm_ctx->my_addr), connection->my_addr_len); + connection->peer_addr_len = nla_len(adm_ctx->peer_addr); + memcpy(&connection->peer_addr, nla_data(adm_ctx->peer_addr), 
connection->peer_addr_len); idr_for_each_entry(&connection->peer_devices, peer_device, i) { peer_devices++; @@ -2633,7 +2696,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) notify_peer_device_state(NULL, 0, peer_device, &peer_device_info, NOTIFY_CREATE | flags); } mutex_unlock(¬ification_mutex); - mutex_unlock(&adm_ctx.resource->conf_update); + mutex_unlock(&adm_ctx->resource->conf_update); rcu_read_lock(); idr_for_each_entry(&connection->peer_devices, peer_device, i) { @@ -2646,8 +2709,8 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) rv = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); conn_reconfig_done(connection); - mutex_unlock(&adm_ctx.resource->adm_mutex); - drbd_adm_finish(&adm_ctx, info, rv); + mutex_unlock(&adm_ctx->resource->adm_mutex); + adm_ctx->reply_dh->ret_code = rv; return 0; fail: @@ -2655,9 +2718,9 @@ fail: kfree(new_net_conf); conn_reconfig_done(connection); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -2729,40 +2792,40 @@ repeat: int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct disconnect_parms parms; struct drbd_connection *connection; enum drbd_state_rv rv; enum drbd_ret_code retcode; int err; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto fail; - connection = adm_ctx.connection; + connection = adm_ctx->connection; memset(&parms, 0, sizeof(parms)); if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) { err = disconnect_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, 
from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto fail; } } - mutex_lock(&adm_ctx.resource->adm_mutex); + mutex_lock(&adm_ctx->resource->adm_mutex); rv = conn_try_disconnect(connection, parms.force_disconnect); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); if (rv < SS_SUCCESS) { - drbd_adm_finish(&adm_ctx, info, rv); + adm_ctx->reply_dh->ret_code = rv; return 0; } retcode = NO_ERROR; fail: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -2784,7 +2847,7 @@ void resync_after_online_grow(struct drbd_device *device) int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct disk_conf *old_disk_conf, *new_disk_conf = NULL; struct resize_parms rs; struct drbd_device *device; @@ -2795,14 +2858,14 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) sector_t u_size; int err; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto finish; - mutex_lock(&adm_ctx.resource->adm_mutex); - device = adm_ctx.device; + mutex_lock(&adm_ctx->resource->adm_mutex); + device = adm_ctx->device; if (!get_ldev(device)) { retcode = ERR_NO_DISK; goto fail; @@ -2815,7 +2878,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) err = resize_parms_from_attrs(&rs, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto fail_ldev; } } @@ -2907,9 +2970,9 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) } fail: - mutex_unlock(&adm_ctx.resource->adm_mutex); + 
mutex_unlock(&adm_ctx->resource->adm_mutex); finish: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; fail_ldev: @@ -2920,61 +2983,61 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; struct res_opts res_opts; int err; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto fail; - res_opts = adm_ctx.resource->res_opts; + res_opts = adm_ctx->resource->res_opts; if (should_set_defaults(info)) set_res_opts_defaults(&res_opts); err = res_opts_from_attrs(&res_opts, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto fail; } - mutex_lock(&adm_ctx.resource->adm_mutex); - err = set_resource_options(adm_ctx.resource, &res_opts); + mutex_lock(&adm_ctx->resource->adm_mutex); + err = set_resource_options(adm_ctx->resource, &res_opts); if (err) { retcode = ERR_INVALID_REQUEST; if (err == -ENOMEM) retcode = ERR_NOMEM; } - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); fail: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct drbd_device *device; int retcode; /* enum drbd_ret_code rsp. 
enum drbd_state_rv */ - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - device = adm_ctx.device; + device = adm_ctx->device; if (!get_ldev(device)) { retcode = ERR_NO_DISK; goto out; } - mutex_lock(&adm_ctx.resource->adm_mutex); + mutex_lock(&adm_ctx->resource->adm_mutex); /* If there is still bitmap IO pending, probably because of a previous * resync just being finished, wait for it before requesting a new resync. @@ -2997,30 +3060,30 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) } else retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T)); drbd_resume_io(device); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); put_ldev(device); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info, union drbd_state mask, union drbd_state val) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - mutex_lock(&adm_ctx.resource->adm_mutex); - retcode = drbd_request_state(adm_ctx.device, mask, val); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_lock(&adm_ctx->resource->adm_mutex); + retcode = drbd_request_state(adm_ctx->device, mask, val); + mutex_unlock(&adm_ctx->resource->adm_mutex); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -3036,23 +3099,23 @@ static int drbd_bmio_set_susp_al(struct drbd_device *device, int 
drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; int retcode; /* drbd_ret_code, drbd_state_rv */ struct drbd_device *device; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - device = adm_ctx.device; + device = adm_ctx->device; if (!get_ldev(device)) { retcode = ERR_NO_DISK; goto out; } - mutex_lock(&adm_ctx.resource->adm_mutex); + mutex_lock(&adm_ctx->resource->adm_mutex); /* If there is still bitmap IO pending, probably because of a previous * resync just being finished, wait for it before requesting a new resync. @@ -3078,48 +3141,48 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) } else retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S)); drbd_resume_io(device); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); put_ldev(device); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - mutex_lock(&adm_ctx.resource->adm_mutex); - if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO) + mutex_lock(&adm_ctx->resource->adm_mutex); + if (drbd_request_state(adm_ctx->device, NS(user_isp, 1)) == SS_NOTHING_TO_DO) retcode = ERR_PAUSE_IS_SET; - mutex_unlock(&adm_ctx.resource->adm_mutex); + 
mutex_unlock(&adm_ctx->resource->adm_mutex); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; union drbd_dev_state s; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - mutex_lock(&adm_ctx.resource->adm_mutex); - if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { - s = adm_ctx.device->state; + mutex_lock(&adm_ctx->resource->adm_mutex); + if (drbd_request_state(adm_ctx->device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { + s = adm_ctx->device->state; if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP : s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR; @@ -3127,9 +3190,9 @@ int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) retcode = ERR_PAUSE_IS_CLEAR; } } - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -3140,18 +3203,18 @@ int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info) int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct drbd_device *device; int retcode; /* enum drbd_ret_code rsp. 
enum drbd_state_rv */ - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - mutex_lock(&adm_ctx.resource->adm_mutex); - device = adm_ctx.device; + mutex_lock(&adm_ctx->resource->adm_mutex); + device = adm_ctx->device; if (test_bit(NEW_CUR_UUID, &device->flags)) { if (get_ldev_if_state(device, D_ATTACHING)) { drbd_uuid_new_current(device); @@ -3173,7 +3236,7 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) * matching real data uuid exists). */ u64 val; - get_random_bytes(&val, sizeof(u64)); + val = get_random_u64(); drbd_set_ed_uuid(device, val); drbd_warn(device, "Resumed without access to data; please tear down before attempting to re-configure.\n"); } @@ -3188,9 +3251,9 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO); } drbd_resume_io(device); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -3238,14 +3301,13 @@ nla_put_failure: static struct nlattr *find_cfg_context_attr(const struct nlmsghdr *nlh, int attr) { const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ; - const int maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1; struct nlattr *nla; nla = nla_find(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), DRBD_NLA_CFG_CONTEXT); if (!nla) return NULL; - return drbd_nla_find_nested(maxtype, nla, __nla_type(attr)); + return nla_find_nested(nla, attr); } static void resource_to_info(struct resource_info *, struct drbd_resource *); @@ -3378,8 +3440,10 @@ int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb) if (resource_filter) { retcode = ERR_RES_NOT_KNOWN; resource = 
drbd_find_resource(nla_data(resource_filter)); - if (!resource) + if (!resource) { + rcu_read_lock(); goto put_result; + } cb->args[0] = (long)resource; } } @@ -3628,8 +3692,10 @@ int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb) if (resource_filter) { retcode = ERR_RES_NOT_KNOWN; resource = drbd_find_resource(nla_data(resource_filter)); - if (!resource) + if (!resource) { + rcu_read_lock(); goto put_result; + } } cb->args[0] = (long)resource; } @@ -3843,23 +3909,24 @@ nla_put_failure: int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; int err; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.device, NULL); + err = nla_put_status_info(adm_ctx->reply_skb, adm_ctx->device, NULL); if (err) { - nlmsg_free(adm_ctx.reply_skb); + nlmsg_free(adm_ctx->reply_skb); + adm_ctx->reply_skb = NULL; return err; } out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -3998,7 +4065,6 @@ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb) struct nlattr *nla; const char *resource_name; struct drbd_resource *resource; - int maxtype; /* Is this a followup call? */ if (cb->args[0]) { @@ -4018,10 +4084,7 @@ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb) /* No explicit context given. Dump all. */ if (!nla) goto dump; - maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1; - nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name)); - if (IS_ERR(nla)) - return PTR_ERR(nla); + nla = nla_find_nested(nla, T_ctx_resource_name); /* context given, but no name present? 
*/ if (!nla) return -EINVAL; @@ -4046,46 +4109,47 @@ dump: int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; struct timeout_parms tp; int err; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; tp.timeout_type = - adm_ctx.device->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : - test_bit(USE_DEGR_WFC_T, &adm_ctx.device->flags) ? UT_DEGRADED : + adm_ctx->device->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : + test_bit(USE_DEGR_WFC_T, &adm_ctx->device->flags) ? UT_DEGRADED : UT_DEFAULT; - err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp); + err = timeout_parms_to_priv_skb(adm_ctx->reply_skb, &tp); if (err) { - nlmsg_free(adm_ctx.reply_skb); + nlmsg_free(adm_ctx->reply_skb); + adm_ctx->reply_skb = NULL; return err; } out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct drbd_device *device; enum drbd_ret_code retcode; struct start_ov_parms parms; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - device = adm_ctx.device; + device = adm_ctx->device; /* resume from last known position, if possible */ parms.ov_start_sector = device->ov_start_sector; @@ -4094,11 +4158,11 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) int err = start_ov_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - 
drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto out; } } - mutex_lock(&adm_ctx.resource->adm_mutex); + mutex_lock(&adm_ctx->resource->adm_mutex); /* w_make_ov_request expects position to be aligned */ device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1); @@ -4111,40 +4175,40 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) retcode = drbd_request_state(device, NS(conn, C_VERIFY_S)); drbd_resume_io(device); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct drbd_device *device; enum drbd_ret_code retcode; int skip_initial_sync = 0; int err; struct new_c_uuid_parms args; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out_nolock; - device = adm_ctx.device; + device = adm_ctx->device; memset(&args, 0, sizeof(args)); if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) { err = new_c_uuid_parms_from_attrs(&args, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto out_nolock; } } - mutex_lock(&adm_ctx.resource->adm_mutex); + mutex_lock(&adm_ctx->resource->adm_mutex); mutex_lock(device->state_mutex); /* Protects us against serialized state changes. 
*/ if (!get_ldev(device)) { @@ -4189,9 +4253,9 @@ out_dec: put_ldev(device); out: mutex_unlock(device->state_mutex); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); out_nolock: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -4224,14 +4288,14 @@ static void resource_to_info(struct resource_info *info, int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) { struct drbd_connection *connection; - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; struct res_opts res_opts; int err; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, 0); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; @@ -4239,18 +4303,18 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) err = res_opts_from_attrs(&res_opts, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto out; } - retcode = drbd_check_resource_name(&adm_ctx); + retcode = drbd_check_resource_name(adm_ctx); if (retcode != NO_ERROR) goto out; - if (adm_ctx.resource) { + if (adm_ctx->resource) { if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) { retcode = ERR_INVALID_REQUEST; - drbd_msg_put_info(adm_ctx.reply_skb, "resource exists"); + drbd_msg_put_info(adm_ctx->reply_skb, "resource exists"); } /* else: still NO_ERROR */ goto out; @@ -4258,7 +4322,7 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) /* not yet safe for genl_family.parallel_ops */ mutex_lock(&resources_mutex); - connection = conn_create(adm_ctx.resource_name, &res_opts); + connection = conn_create(adm_ctx->resource_name, &res_opts); mutex_unlock(&resources_mutex); if (connection) { @@ 
-4273,7 +4337,7 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) retcode = ERR_NOMEM; out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -4286,38 +4350,38 @@ static void device_to_info(struct device_info *info, int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct drbd_genlmsghdr *dh = genl_info_userhdr(info); enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; if (dh->minor > MINORMASK) { - drbd_msg_put_info(adm_ctx.reply_skb, "requested minor out of range"); + drbd_msg_put_info(adm_ctx->reply_skb, "requested minor out of range"); retcode = ERR_INVALID_REQUEST; goto out; } - if (adm_ctx.volume > DRBD_VOLUME_MAX) { - drbd_msg_put_info(adm_ctx.reply_skb, "requested volume id out of range"); + if (adm_ctx->volume > DRBD_VOLUME_MAX) { + drbd_msg_put_info(adm_ctx->reply_skb, "requested volume id out of range"); retcode = ERR_INVALID_REQUEST; goto out; } /* drbd_adm_prepare made sure already * that first_peer_device(device)->connection and device->vnr match the request. 
*/ - if (adm_ctx.device) { + if (adm_ctx->device) { if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) retcode = ERR_MINOR_OR_VOLUME_EXISTS; /* else: still NO_ERROR */ goto out; } - mutex_lock(&adm_ctx.resource->adm_mutex); - retcode = drbd_create_device(&adm_ctx, dh->minor); + mutex_lock(&adm_ctx->resource->adm_mutex); + retcode = drbd_create_device(adm_ctx, dh->minor); if (retcode == NO_ERROR) { struct drbd_device *device; struct drbd_peer_device *peer_device; @@ -4348,9 +4412,9 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) } mutex_unlock(¬ification_mutex); } - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -4393,20 +4457,20 @@ static enum drbd_ret_code adm_del_minor(struct drbd_device *device) int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - mutex_lock(&adm_ctx.resource->adm_mutex); - retcode = adm_del_minor(adm_ctx.device); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_lock(&adm_ctx->resource->adm_mutex); + retcode = adm_del_minor(adm_ctx->device); + mutex_unlock(&adm_ctx->resource->adm_mutex); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -4442,20 +4506,20 @@ static int adm_del_resource(struct drbd_resource *resource) int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct drbd_resource *resource; struct drbd_connection *connection; struct 
drbd_device *device; int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ unsigned i; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto finish; - resource = adm_ctx.resource; + resource = adm_ctx->resource; mutex_lock(&resource->adm_mutex); /* demote */ for_each_connection(connection, resource) { @@ -4464,14 +4528,14 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) idr_for_each_entry(&connection->peer_devices, peer_device, i) { retcode = drbd_set_role(peer_device->device, R_SECONDARY, 0); if (retcode < SS_SUCCESS) { - drbd_msg_put_info(adm_ctx.reply_skb, "failed to demote"); + drbd_msg_put_info(adm_ctx->reply_skb, "failed to demote"); goto out; } } retcode = conn_try_disconnect(connection, 0); if (retcode < SS_SUCCESS) { - drbd_msg_put_info(adm_ctx.reply_skb, "failed to disconnect"); + drbd_msg_put_info(adm_ctx->reply_skb, "failed to disconnect"); goto out; } } @@ -4480,7 +4544,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) idr_for_each_entry(&resource->devices, device, i) { retcode = adm_detach(device, 0); if (retcode < SS_SUCCESS || retcode > NO_ERROR) { - drbd_msg_put_info(adm_ctx.reply_skb, "failed to detach"); + drbd_msg_put_info(adm_ctx->reply_skb, "failed to detach"); goto out; } } @@ -4490,7 +4554,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) retcode = adm_del_minor(device); if (retcode != NO_ERROR) { /* "can not happen" */ - drbd_msg_put_info(adm_ctx.reply_skb, "failed to delete volume"); + drbd_msg_put_info(adm_ctx->reply_skb, "failed to delete volume"); goto out; } } @@ -4499,28 +4563,28 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) out: mutex_unlock(&resource->adm_mutex); finish: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } int 
drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct drbd_resource *resource; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto finish; - resource = adm_ctx.resource; + resource = adm_ctx->resource; mutex_lock(&resource->adm_mutex); retcode = adm_del_resource(resource); mutex_unlock(&resource->adm_mutex); finish: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c deleted file mode 100644 index df0d241d3f6a..000000000000 --- a/drivers/block/drbd/drbd_nla.c +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include "drbd_nla.h" - -static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla) -{ - struct nlattr *head = nla_data(nla); - int len = nla_len(nla); - int rem; - - /* - * validate_nla (called from nla_parse_nested) ignores attributes - * beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag. - * In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY - * flag set also, check and remove that flag before calling - * nla_parse_nested. 
- */ - - nla_for_each_attr(nla, head, len, rem) { - if (nla->nla_type & DRBD_GENLA_F_MANDATORY) { - nla->nla_type &= ~DRBD_GENLA_F_MANDATORY; - if (nla_type(nla) > maxtype) - return -EOPNOTSUPP; - } - } - return 0; -} - -int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, - const struct nla_policy *policy) -{ - int err; - - err = drbd_nla_check_mandatory(maxtype, nla); - if (!err) - err = nla_parse_nested_deprecated(tb, maxtype, nla, policy, - NULL); - - return err; -} - -struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype) -{ - int err; - /* - * If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and - * we don't know about that attribute, reject all the nested - * attributes. - */ - err = drbd_nla_check_mandatory(maxtype, nla); - if (err) - return ERR_PTR(err); - return nla_find_nested(nla, attrtype); -} diff --git a/drivers/block/drbd/drbd_nla.h b/drivers/block/drbd/drbd_nla.h deleted file mode 100644 index d3555df0d353..000000000000 --- a/drivers/block/drbd/drbd_nla.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef __DRBD_NLA_H -#define __DRBD_NLA_H - -extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, - const struct nla_policy *policy); -extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype); - -#endif /* __DRBD_NLA_H */ diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 63aeb7a76a8c..603a98a30989 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -46,6 +46,8 @@ #include #include #include +#include +#include #include #include @@ -58,6 +60,11 @@ #define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE) #define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV) #define UBLK_CMD_TRY_STOP_DEV _IOC_NR(UBLK_U_CMD_TRY_STOP_DEV) +#define UBLK_CMD_REG_BUF _IOC_NR(UBLK_U_CMD_REG_BUF) +#define UBLK_CMD_UNREG_BUF _IOC_NR(UBLK_U_CMD_UNREG_BUF) + +/* 
Default max shmem buffer size: 4GB (may be increased in future) */ +#define UBLK_SHMEM_BUF_SIZE_MAX (1ULL << 32) #define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) #define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) @@ -81,7 +88,8 @@ | (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \ | UBLK_F_SAFE_STOP_DEV \ | UBLK_F_BATCH_IO \ - | UBLK_F_NO_AUTO_PART_SCAN) + | UBLK_F_NO_AUTO_PART_SCAN \ + | UBLK_F_SHMEM_ZC) #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ @@ -289,6 +297,13 @@ struct ublk_queue { struct ublk_io ios[] __counted_by(q_depth); }; +/* Maple tree value: maps a PFN range to buffer location */ +struct ublk_buf_range { + unsigned short buf_index; + unsigned short flags; + unsigned int base_offset; /* byte offset within buffer */ +}; + struct ublk_device { struct gendisk *ub_disk; @@ -323,6 +338,10 @@ struct ublk_device { bool block_open; /* protected by open_mutex */ + /* shared memory zero copy */ + struct maple_tree buf_tree; + struct ida buf_ida; + struct ublk_queue *queues[]; }; @@ -334,6 +353,9 @@ struct ublk_params_header { static void ublk_io_release(void *priv); static void ublk_stop_dev_unlocked(struct ublk_device *ub); +static bool ublk_try_buf_match(struct ublk_device *ub, struct request *rq, + u32 *buf_idx, u32 *buf_off); +static void ublk_buf_cleanup(struct ublk_device *ub); static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, u16 q_id, u16 tag, struct ublk_io *io); @@ -398,6 +420,22 @@ static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub) return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY; } +static inline bool ublk_support_shmem_zc(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_SHMEM_ZC; +} + +static inline bool ublk_iod_is_shmem_zc(const struct ublk_queue *ubq, + unsigned int tag) +{ + return ublk_get_iod(ubq, 
tag)->op_flags & UBLK_IO_F_SHMEM_ZC; +} + +static inline bool ublk_dev_support_shmem_zc(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_SHMEM_ZC; +} + static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq) { return ubq->flags & UBLK_F_AUTO_BUF_REG; @@ -808,7 +846,7 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub) static int ublk_integrity_flags(u32 flags) { - int ret_flags = 0; + int ret_flags = BLK_SPLIT_INTERVAL_CAPABLE; if (flags & LBMD_PI_CAP_INTEGRITY) { flags &= ~LBMD_PI_CAP_INTEGRITY; @@ -1460,6 +1498,19 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) iod->op_flags = ublk_op | ublk_req_build_flags(req); iod->nr_sectors = blk_rq_sectors(req); iod->start_sector = blk_rq_pos(req); + + /* Try shmem zero-copy match before setting addr */ + if (ublk_support_shmem_zc(ubq) && ublk_rq_has_data(req)) { + u32 buf_idx, buf_off; + + if (ublk_try_buf_match(ubq->dev, req, + &buf_idx, &buf_off)) { + iod->op_flags |= UBLK_IO_F_SHMEM_ZC; + iod->addr = ublk_shmem_zc_addr(buf_idx, buf_off); + return BLK_STS_OK; + } + } + iod->addr = io->buf.addr; return BLK_STS_OK; @@ -1505,6 +1556,10 @@ static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, req_op(req) != REQ_OP_DRV_IN) goto exit; + /* shmem zero copy: no data to unmap, pages already shared */ + if (ublk_iod_is_shmem_zc(req->mq_hctx->driver_data, req->tag)) + goto exit; + /* for READ request, writing data in iod->addr to rq buffers */ unmapped_bytes = ublk_unmap_io(need_map, req, io); @@ -1663,7 +1718,13 @@ static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq, static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req, struct ublk_io *io) { - unsigned mapped_bytes = ublk_map_io(ubq, req, io); + unsigned mapped_bytes; + + /* shmem zero copy: skip data copy, pages already shared */ + if (ublk_iod_is_shmem_zc(ubq, req->tag)) + return true; + + mapped_bytes = ublk_map_io(ubq, req, io); 
/* partially mapped, update io descriptor */ if (unlikely(mapped_bytes != blk_rq_bytes(req))) { @@ -1789,7 +1850,7 @@ static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq, * Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf. * Returns the new length after filtering. */ -static unsigned int ublk_filter_unused_tags(unsigned short *tag_buf, +static noinline unsigned int ublk_filter_unused_tags(unsigned short *tag_buf, unsigned int len) { unsigned int i, j; @@ -1805,6 +1866,41 @@ static unsigned int ublk_filter_unused_tags(unsigned short *tag_buf, return j; } +static noinline void ublk_batch_dispatch_fail(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + unsigned short *tag_buf, size_t len, int ret) +{ + int i, res; + + /* + * Undo prep state for all IOs since userspace never received them. + * This restores IOs to pre-prepared state so they can be cleanly + * re-prepared when tags are pulled from FIFO again. + */ + for (i = 0; i < len; i++) { + struct ublk_io *io = &ubq->ios[tag_buf[i]]; + int index = -1; + + ublk_io_lock(io); + if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) + index = io->buf.auto_reg.index; + io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG); + io->flags |= UBLK_IO_FLAG_ACTIVE; + ublk_io_unlock(io); + + if (index != -1) + io_buffer_unregister_bvec(data->cmd, index, + data->issue_flags); + } + + res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo, + tag_buf, len, &ubq->evts_lock); + + pr_warn_ratelimited("%s: copy tags or post CQE failure, move back " + "tags(%d %zu) ret %d\n", __func__, res, len, + ret); +} + #define MAX_NR_TAG 128 static int __ublk_batch_dispatch(struct ublk_queue *ubq, const struct ublk_batch_io_data *data, @@ -1848,37 +1944,8 @@ static int __ublk_batch_dispatch(struct ublk_queue *ubq, sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz); ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags); - if (unlikely(ret < 0)) { - int i, res; - - /* - * Undo prep 
state for all IOs since userspace never received them. - * This restores IOs to pre-prepared state so they can be cleanly - * re-prepared when tags are pulled from FIFO again. - */ - for (i = 0; i < len; i++) { - struct ublk_io *io = &ubq->ios[tag_buf[i]]; - int index = -1; - - ublk_io_lock(io); - if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) - index = io->buf.auto_reg.index; - io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG); - io->flags |= UBLK_IO_FLAG_ACTIVE; - ublk_io_unlock(io); - - if (index != -1) - io_buffer_unregister_bvec(data->cmd, index, - data->issue_flags); - } - - res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo, - tag_buf, len, &ubq->evts_lock); - - pr_warn_ratelimited("%s: copy tags or post CQE failure, move back " - "tags(%d %zu) ret %d\n", __func__, res, len, - ret); - } + if (unlikely(ret < 0)) + ublk_batch_dispatch_fail(ubq, data, tag_buf, len, ret); return ret; } @@ -2910,22 +2977,26 @@ static void ublk_stop_dev(struct ublk_device *ub) ublk_cancel_dev(ub); } +static void ublk_reset_io_flags(struct ublk_queue *ubq, struct ublk_io *io) +{ + /* UBLK_IO_FLAG_CANCELED can be cleared now */ + spin_lock(&ubq->cancel_lock); + io->flags &= ~UBLK_IO_FLAG_CANCELED; + spin_unlock(&ubq->cancel_lock); +} + /* reset per-queue io flags */ static void ublk_queue_reset_io_flags(struct ublk_queue *ubq) { - int j; - - /* UBLK_IO_FLAG_CANCELED can be cleared now */ spin_lock(&ubq->cancel_lock); - for (j = 0; j < ubq->q_depth; j++) - ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED; ubq->canceling = false; spin_unlock(&ubq->cancel_lock); ubq->fail_io = false; } /* device can only be started after all IOs are ready */ -static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id) +static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id, + struct ublk_io *io) __must_hold(&ub->mutex) { struct ublk_queue *ubq = ublk_get_queue(ub, q_id); @@ -2934,6 +3005,7 @@ static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id) 
ub->unprivileged_daemons = true; ubq->nr_io_ready++; + ublk_reset_io_flags(ubq, io); /* Check if this specific queue is now fully ready */ if (ublk_queue_ready(ubq)) { @@ -3196,7 +3268,7 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, if (!ret) ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL); if (!ret) - ublk_mark_io_ready(ub, q_id); + ublk_mark_io_ready(ub, q_id, io); mutex_unlock(&ub->mutex); return ret; } @@ -3604,7 +3676,7 @@ static int ublk_batch_prep_io(struct ublk_queue *ubq, ublk_io_unlock(io); if (!ret) - ublk_mark_io_ready(data->ub, ubq->q_id); + ublk_mark_io_ready(data->ub, ubq->q_id, io); return ret; } @@ -4200,6 +4272,7 @@ static void ublk_cdev_rel(struct device *dev) { struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev); + ublk_buf_cleanup(ub); blk_mq_free_tag_set(&ub->tag_set); ublk_deinit_queues(ub); ublk_free_dev_number(ub); @@ -4621,6 +4694,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) mutex_init(&ub->mutex); spin_lock_init(&ub->lock); mutex_init(&ub->cancel_mutex); + mt_init(&ub->buf_tree); + ida_init(&ub->buf_ida); INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work); ret = ublk_alloc_dev_number(ub, header->dev_id); @@ -5171,6 +5246,314 @@ exit: return err; } +/* + * Lock for maple tree modification: acquire ub->mutex, then freeze queue + * if device is started. If device is not yet started, only mutex is + * needed since no I/O path can access the tree. + * + * This ordering (mutex -> freeze) is safe because ublk_stop_dev_unlocked() + * already holds ub->mutex when calling del_gendisk() which freezes the queue. 
+*/ +static unsigned int ublk_lock_buf_tree(struct ublk_device *ub) +{ + unsigned int memflags = 0; + + mutex_lock(&ub->mutex); + if (ub->ub_disk) + memflags = blk_mq_freeze_queue(ub->ub_disk->queue); + + return memflags; +} + +static void ublk_unlock_buf_tree(struct ublk_device *ub, unsigned int memflags) +{ + if (ub->ub_disk) + blk_mq_unfreeze_queue(ub->ub_disk->queue, memflags); + mutex_unlock(&ub->mutex); +} + +/* Erase coalesced PFN ranges from the maple tree matching buf_index */ +static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index) +{ + MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX); + struct ublk_buf_range *range; + + mas_lock(&mas); + mas_for_each(&mas, range, ULONG_MAX) { + if (range->buf_index == buf_index) { + mas_erase(&mas); + kfree(range); + } + } + mas_unlock(&mas); +} + +static int __ublk_ctrl_reg_buf(struct ublk_device *ub, + struct page **pages, unsigned long nr_pages, + int index, unsigned short flags) +{ + unsigned long i; + int ret; + + for (i = 0; i < nr_pages; i++) { + unsigned long pfn = page_to_pfn(pages[i]); + unsigned long start = i; + struct ublk_buf_range *range; + + /* Find run of consecutive PFNs */ + while (i + 1 < nr_pages && + page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1) + i++; + + range = kzalloc(sizeof(*range), GFP_KERNEL); + if (!range) { + ret = -ENOMEM; + goto unwind; + } + range->buf_index = index; + range->flags = flags; + range->base_offset = start << PAGE_SHIFT; + + ret = mtree_insert_range(&ub->buf_tree, pfn, + pfn + (i - start), + range, GFP_KERNEL); + if (ret) { + kfree(range); + goto unwind; + } + } + return 0; + +unwind: + ublk_buf_erase_ranges(ub, index); + return ret; +} + +/* + * Register a shared memory buffer for zero-copy I/O. + * Pins pages, builds PFN maple tree, freezes/unfreezes the queue + * internally. Returns buffer index (>= 0) on success. 
+ */ +static int ublk_ctrl_reg_buf(struct ublk_device *ub, + struct ublksrv_ctrl_cmd *header) +{ + void __user *argp = (void __user *)(unsigned long)header->addr; + struct ublk_shmem_buf_reg buf_reg; + unsigned long nr_pages; + struct page **pages = NULL; + unsigned int gup_flags; + unsigned int memflags; + long pinned; + int index; + int ret; + + if (!ublk_dev_support_shmem_zc(ub)) + return -EOPNOTSUPP; + + memset(&buf_reg, 0, sizeof(buf_reg)); + if (copy_from_user(&buf_reg, argp, + min_t(size_t, header->len, sizeof(buf_reg)))) + return -EFAULT; + + if (buf_reg.flags & ~UBLK_SHMEM_BUF_READ_ONLY) + return -EINVAL; + + if (buf_reg.reserved) + return -EINVAL; + + if (!buf_reg.len || buf_reg.len > UBLK_SHMEM_BUF_SIZE_MAX || + !PAGE_ALIGNED(buf_reg.len) || !PAGE_ALIGNED(buf_reg.addr)) + return -EINVAL; + + nr_pages = buf_reg.len >> PAGE_SHIFT; + + /* Pin pages before any locks (may sleep) */ + pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + gup_flags = FOLL_LONGTERM; + if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY)) + gup_flags |= FOLL_WRITE; + + pinned = pin_user_pages_fast(buf_reg.addr, nr_pages, gup_flags, pages); + if (pinned < 0) { + ret = pinned; + goto err_free_pages; + } + if (pinned != nr_pages) { + ret = -EFAULT; + goto err_unpin; + } + + memflags = ublk_lock_buf_tree(ub); + + index = ida_alloc_max(&ub->buf_ida, USHRT_MAX, GFP_KERNEL); + if (index < 0) { + ret = index; + goto err_unlock; + } + + ret = __ublk_ctrl_reg_buf(ub, pages, nr_pages, index, buf_reg.flags); + if (ret) { + ida_free(&ub->buf_ida, index); + goto err_unlock; + } + + ublk_unlock_buf_tree(ub, memflags); + kvfree(pages); + return index; + +err_unlock: + ublk_unlock_buf_tree(ub, memflags); +err_unpin: + unpin_user_pages(pages, pinned); +err_free_pages: + kvfree(pages); + return ret; +} + +static int __ublk_ctrl_unreg_buf(struct ublk_device *ub, int buf_index) +{ + MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX); + struct ublk_buf_range *range; + 
struct page *pages[32]; + int ret = -ENOENT; + + mas_lock(&mas); + mas_for_each(&mas, range, ULONG_MAX) { + unsigned long base, nr, off; + + if (range->buf_index != buf_index) + continue; + + ret = 0; + base = mas.index; + nr = mas.last - base + 1; + mas_erase(&mas); + + for (off = 0; off < nr; ) { + unsigned int batch = min_t(unsigned long, + nr - off, 32); + unsigned int j; + + for (j = 0; j < batch; j++) + pages[j] = pfn_to_page(base + off + j); + unpin_user_pages(pages, batch); + off += batch; + } + kfree(range); + } + mas_unlock(&mas); + + return ret; +} + +static int ublk_ctrl_unreg_buf(struct ublk_device *ub, + struct ublksrv_ctrl_cmd *header) +{ + int index = (int)header->data[0]; + unsigned int memflags; + int ret; + + if (!ublk_dev_support_shmem_zc(ub)) + return -EOPNOTSUPP; + + if (index < 0 || index > USHRT_MAX) + return -EINVAL; + + memflags = ublk_lock_buf_tree(ub); + + ret = __ublk_ctrl_unreg_buf(ub, index); + if (!ret) + ida_free(&ub->buf_ida, index); + + ublk_unlock_buf_tree(ub, memflags); + return ret; +} + +static void ublk_buf_cleanup(struct ublk_device *ub) +{ + MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX); + struct ublk_buf_range *range; + struct page *pages[32]; + + mas_for_each(&mas, range, ULONG_MAX) { + unsigned long base = mas.index; + unsigned long nr = mas.last - base + 1; + unsigned long off; + + for (off = 0; off < nr; ) { + unsigned int batch = min_t(unsigned long, + nr - off, 32); + unsigned int j; + + for (j = 0; j < batch; j++) + pages[j] = pfn_to_page(base + off + j); + unpin_user_pages(pages, batch); + off += batch; + } + kfree(range); + } + mtree_destroy(&ub->buf_tree); + ida_destroy(&ub->buf_ida); +} + +/* Check if request pages match a registered shared memory buffer */ +static bool ublk_try_buf_match(struct ublk_device *ub, + struct request *rq, + u32 *buf_idx, u32 *buf_off) +{ + struct req_iterator iter; + struct bio_vec bv; + int index = -1; + unsigned long expected_offset = 0; + bool first = true; + + rq_for_each_bvec(bv, 
rq, iter) { + unsigned long pfn = page_to_pfn(bv.bv_page); + unsigned long end_pfn = pfn + + ((bv.bv_offset + bv.bv_len - 1) >> PAGE_SHIFT); + struct ublk_buf_range *range; + unsigned long off; + MA_STATE(mas, &ub->buf_tree, pfn, pfn); + + range = mas_walk(&mas); + if (!range) + return false; + + /* verify all pages in this bvec fall within the range */ + if (end_pfn > mas.last) + return false; + + off = range->base_offset + + (pfn - mas.index) * PAGE_SIZE + bv.bv_offset; + + if (first) { + /* Read-only buffer can't serve READ (kernel writes) */ + if ((range->flags & UBLK_SHMEM_BUF_READ_ONLY) && + req_op(rq) != REQ_OP_WRITE) + return false; + index = range->buf_index; + expected_offset = off; + *buf_off = off; + first = false; + } else { + if (range->buf_index != index) + return false; + if (off != expected_offset) + return false; + } + expected_offset += bv.bv_len; + } + + if (first) + return false; + + *buf_idx = index; + return true; +} + static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, u32 cmd_op, struct ublksrv_ctrl_cmd *header) { @@ -5228,6 +5611,8 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, case UBLK_CMD_UPDATE_SIZE: case UBLK_CMD_QUIESCE_DEV: case UBLK_CMD_TRY_STOP_DEV: + case UBLK_CMD_REG_BUF: + case UBLK_CMD_UNREG_BUF: mask = MAY_READ | MAY_WRITE; break; default: @@ -5352,6 +5737,12 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, case UBLK_CMD_TRY_STOP_DEV: ret = ublk_ctrl_try_stop_dev(ub); break; + case UBLK_CMD_REG_BUF: + ret = ublk_ctrl_reg_buf(ub, &header); + break; + case UBLK_CMD_UNREG_BUF: + ret = ublk_ctrl_unreg_buf(ub, &header); + break; default: ret = -EOPNOTSUPP; break; diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c index 51c043342127..8baf642037fd 100644 --- a/drivers/block/zloop.c +++ b/drivers/block/zloop.c @@ -17,6 +17,7 @@ #include #include #include +#include /* * Options for adding (and removing) a device. 
@@ -34,6 +35,8 @@ enum { ZLOOP_OPT_BUFFERED_IO = (1 << 8), ZLOOP_OPT_ZONE_APPEND = (1 << 9), ZLOOP_OPT_ORDERED_ZONE_APPEND = (1 << 10), + ZLOOP_OPT_DISCARD_WRITE_CACHE = (1 << 11), + ZLOOP_OPT_MAX_OPEN_ZONES = (1 << 12), }; static const match_table_t zloop_opt_tokens = { @@ -48,6 +51,8 @@ static const match_table_t zloop_opt_tokens = { { ZLOOP_OPT_BUFFERED_IO, "buffered_io" }, { ZLOOP_OPT_ZONE_APPEND, "zone_append=%u" }, { ZLOOP_OPT_ORDERED_ZONE_APPEND, "ordered_zone_append" }, + { ZLOOP_OPT_DISCARD_WRITE_CACHE, "discard_write_cache" }, + { ZLOOP_OPT_MAX_OPEN_ZONES, "max_open_zones=%u" }, { ZLOOP_OPT_ERR, NULL } }; @@ -56,6 +61,7 @@ static const match_table_t zloop_opt_tokens = { #define ZLOOP_DEF_ZONE_SIZE ((256ULL * SZ_1M) >> SECTOR_SHIFT) #define ZLOOP_DEF_NR_ZONES 64 #define ZLOOP_DEF_NR_CONV_ZONES 8 +#define ZLOOP_DEF_MAX_OPEN_ZONES 0 #define ZLOOP_DEF_BASE_DIR "/var/local/zloop" #define ZLOOP_DEF_NR_QUEUES 1 #define ZLOOP_DEF_QUEUE_DEPTH 128 @@ -73,12 +79,14 @@ struct zloop_options { sector_t zone_size; sector_t zone_capacity; unsigned int nr_conv_zones; + unsigned int max_open_zones; char *base_dir; unsigned int nr_queues; unsigned int queue_depth; bool buffered_io; bool zone_append; bool ordered_zone_append; + bool discard_write_cache; }; /* @@ -95,7 +103,12 @@ enum zloop_zone_flags { ZLOOP_ZONE_SEQ_ERROR, }; +/* + * Zone descriptor. 
+ * Locking order: z.lock -> z.wp_lock -> zlo.open_zones_lock + */ struct zloop_zone { + struct list_head open_zone_entry; struct file *file; unsigned long flags; @@ -119,6 +132,7 @@ struct zloop_device { bool buffered_io; bool zone_append; bool ordered_zone_append; + bool discard_write_cache; const char *base_dir; struct file *data_dir; @@ -128,8 +142,13 @@ struct zloop_device { sector_t zone_capacity; unsigned int nr_zones; unsigned int nr_conv_zones; + unsigned int max_open_zones; unsigned int block_size; + spinlock_t open_zones_lock; + struct list_head open_zones_lru_list; + unsigned int nr_open_zones; + struct zloop_zone zones[] __counted_by(nr_zones); }; @@ -153,6 +172,122 @@ static unsigned int rq_zone_no(struct request *rq) return blk_rq_pos(rq) >> zlo->zone_shift; } +/* + * Open an already open zone. This is mostly a no-op, except for the imp open -> + * exp open condition change that may happen. We also move a zone to the tail of + * the list of open zones so that if we need to
+ */ +static inline void zloop_lru_rotate_open_zone(struct zloop_device *zlo, + struct zloop_zone *zone) +{ + if (zlo->max_open_zones) { + spin_lock(&zlo->open_zones_lock); + list_move_tail(&zone->open_zone_entry, + &zlo->open_zones_lru_list); + spin_unlock(&zlo->open_zones_lock); + } +} + +static inline void zloop_lru_remove_open_zone(struct zloop_device *zlo, + struct zloop_zone *zone) +{ + if (zone->cond == BLK_ZONE_COND_IMP_OPEN || + zone->cond == BLK_ZONE_COND_EXP_OPEN) { + spin_lock(&zlo->open_zones_lock); + list_del_init(&zone->open_zone_entry); + zlo->nr_open_zones--; + spin_unlock(&zlo->open_zones_lock); + } +} + +static inline bool zloop_can_open_zone(struct zloop_device *zlo) +{ + return !zlo->max_open_zones || zlo->nr_open_zones < zlo->max_open_zones; +} + +/* + * If we have reached the maximum open zones limit, attempt to close an + * implicitly open zone (if we have any) so that we can implicitly open another + * zone without exceeding the maximum number of open zones. + */ +static bool zloop_close_imp_open_zone(struct zloop_device *zlo) +{ + struct zloop_zone *zone; + + lockdep_assert_held(&zlo->open_zones_lock); + + if (zloop_can_open_zone(zlo)) + return true; + + list_for_each_entry(zone, &zlo->open_zones_lru_list, open_zone_entry) { + if (zone->cond == BLK_ZONE_COND_IMP_OPEN) { + zone->cond = BLK_ZONE_COND_CLOSED; + list_del_init(&zone->open_zone_entry); + zlo->nr_open_zones--; + return true; + } + } + + return false; +} + +static bool zloop_open_closed_or_empty_zone(struct zloop_device *zlo, + struct zloop_zone *zone, + bool explicit) +{ + spin_lock(&zlo->open_zones_lock); + + if (explicit) { + /* + * Explicit open: we cannot allow this if we have reached the + * maximum open zones limit. + */ + if (!zloop_can_open_zone(zlo)) + goto fail; + zone->cond = BLK_ZONE_COND_EXP_OPEN; + } else { + /* + * Implicit open case: if we have reached the maximum open zones + * limit, try to close an implicitly open zone first. 
+ */ + if (!zloop_close_imp_open_zone(zlo)) + goto fail; + zone->cond = BLK_ZONE_COND_IMP_OPEN; + } + + zlo->nr_open_zones++; + list_add_tail(&zone->open_zone_entry, + &zlo->open_zones_lru_list); + + spin_unlock(&zlo->open_zones_lock); + + return true; + +fail: + spin_unlock(&zlo->open_zones_lock); + + return false; +} + +static bool zloop_do_open_zone(struct zloop_device *zlo, + struct zloop_zone *zone, bool explicit) +{ + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + if (explicit) + zone->cond = BLK_ZONE_COND_EXP_OPEN; + zloop_lru_rotate_open_zone(zlo, zone); + return true; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_CLOSED: + return zloop_open_closed_or_empty_zone(zlo, zone, explicit); + default: + return false; + } +} + static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no) { struct zloop_zone *zone = &zlo->zones[zone_no]; @@ -186,13 +321,17 @@ static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no) spin_lock_irqsave(&zone->wp_lock, flags); if (!file_sectors) { + zloop_lru_remove_open_zone(zlo, zone); zone->cond = BLK_ZONE_COND_EMPTY; zone->wp = zone->start; } else if (file_sectors == zlo->zone_capacity) { + zloop_lru_remove_open_zone(zlo, zone); zone->cond = BLK_ZONE_COND_FULL; zone->wp = ULLONG_MAX; } else { - zone->cond = BLK_ZONE_COND_CLOSED; + if (zone->cond != BLK_ZONE_COND_IMP_OPEN && + zone->cond != BLK_ZONE_COND_EXP_OPEN) + zone->cond = BLK_ZONE_COND_CLOSED; zone->wp = zone->start + file_sectors; } spin_unlock_irqrestore(&zone->wp_lock, flags); @@ -216,19 +355,8 @@ static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no) goto unlock; } - switch (zone->cond) { - case BLK_ZONE_COND_EXP_OPEN: - break; - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_CLOSED: - case BLK_ZONE_COND_IMP_OPEN: - zone->cond = BLK_ZONE_COND_EXP_OPEN; - break; - case BLK_ZONE_COND_FULL: - default: + if (!zloop_do_open_zone(zlo, zone, true)) ret = -EIO; - break; - } 
unlock: mutex_unlock(&zone->lock); @@ -259,6 +387,7 @@ static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no) case BLK_ZONE_COND_IMP_OPEN: case BLK_ZONE_COND_EXP_OPEN: spin_lock_irqsave(&zone->wp_lock, flags); + zloop_lru_remove_open_zone(zlo, zone); if (zone->wp == zone->start) zone->cond = BLK_ZONE_COND_EMPTY; else @@ -300,6 +429,7 @@ static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no) } spin_lock_irqsave(&zone->wp_lock, flags); + zloop_lru_remove_open_zone(zlo, zone); zone->cond = BLK_ZONE_COND_EMPTY; zone->wp = zone->start; clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); @@ -347,6 +477,7 @@ static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no) } spin_lock_irqsave(&zone->wp_lock, flags); + zloop_lru_remove_open_zone(zlo, zone); zone->cond = BLK_ZONE_COND_FULL; zone->wp = ULLONG_MAX; clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); @@ -378,125 +509,22 @@ static void zloop_rw_complete(struct kiocb *iocb, long ret) zloop_put_cmd(cmd); } -static void zloop_rw(struct zloop_cmd *cmd) +static int zloop_do_rw(struct zloop_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); + int rw = req_op(rq) == REQ_OP_READ ? ITER_DEST : ITER_SOURCE; + unsigned int nr_bvec = blk_rq_nr_bvec(rq); struct zloop_device *zlo = rq->q->queuedata; - unsigned int zone_no = rq_zone_no(rq); - sector_t sector = blk_rq_pos(rq); - sector_t nr_sectors = blk_rq_sectors(rq); - bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND; - bool is_write = req_op(rq) == REQ_OP_WRITE || is_append; - int rw = is_write ? 
ITER_SOURCE : ITER_DEST; + struct zloop_zone *zone = &zlo->zones[rq_zone_no(rq)]; struct req_iterator rq_iter; - struct zloop_zone *zone; struct iov_iter iter; - struct bio_vec tmp; - unsigned long flags; - sector_t zone_end; - unsigned int nr_bvec; - int ret; - - atomic_set(&cmd->ref, 2); - cmd->sector = sector; - cmd->nr_sectors = nr_sectors; - cmd->ret = 0; - - if (WARN_ON_ONCE(is_append && !zlo->zone_append)) { - ret = -EIO; - goto out; - } - - /* We should never get an I/O beyond the device capacity. */ - if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) { - ret = -EIO; - goto out; - } - zone = &zlo->zones[zone_no]; - zone_end = zone->start + zlo->zone_capacity; - - /* - * The block layer should never send requests that are not fully - * contained within the zone. - */ - if (WARN_ON_ONCE(sector + nr_sectors > zone->start + zlo->zone_size)) { - ret = -EIO; - goto out; - } - - if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { - mutex_lock(&zone->lock); - ret = zloop_update_seq_zone(zlo, zone_no); - mutex_unlock(&zone->lock); - if (ret) - goto out; - } - - if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) { - mutex_lock(&zone->lock); - - spin_lock_irqsave(&zone->wp_lock, flags); - - /* - * Zone append operations always go at the current write - * pointer, but regular write operations must already be - * aligned to the write pointer when submitted. - */ - if (is_append) { - /* - * If ordered zone append is in use, we already checked - * and set the target sector in zloop_queue_rq(). 
- */ - if (!zlo->ordered_zone_append) { - if (zone->cond == BLK_ZONE_COND_FULL || - zone->wp + nr_sectors > zone_end) { - spin_unlock_irqrestore(&zone->wp_lock, - flags); - ret = -EIO; - goto unlock; - } - sector = zone->wp; - } - cmd->sector = sector; - } else if (sector != zone->wp) { - spin_unlock_irqrestore(&zone->wp_lock, flags); - pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n", - zone_no, sector, zone->wp); - ret = -EIO; - goto unlock; - } - - /* Implicitly open the target zone. */ - if (zone->cond == BLK_ZONE_COND_CLOSED || - zone->cond == BLK_ZONE_COND_EMPTY) - zone->cond = BLK_ZONE_COND_IMP_OPEN; - - /* - * Advance the write pointer, unless ordered zone append is in - * use. If the write fails, the write pointer position will be - * corrected when the next I/O starts execution. - */ - if (!is_append || !zlo->ordered_zone_append) { - zone->wp += nr_sectors; - if (zone->wp == zone_end) { - zone->cond = BLK_ZONE_COND_FULL; - zone->wp = ULLONG_MAX; - } - } - - spin_unlock_irqrestore(&zone->wp_lock, flags); - } - - nr_bvec = blk_rq_nr_bvec(rq); if (rq->bio != rq->biotail) { - struct bio_vec *bvec; + struct bio_vec tmp, *bvec; cmd->bvec = kmalloc_objs(*cmd->bvec, nr_bvec, GFP_NOIO); - if (!cmd->bvec) { - ret = -EIO; - goto unlock; - } + if (!cmd->bvec) + return -EIO; /* * The bios of the request may be started from the middle of @@ -522,7 +550,7 @@ static void zloop_rw(struct zloop_cmd *cmd) iter.iov_offset = rq->bio->bi_iter.bi_bvec_done; } - cmd->iocb.ki_pos = (sector - zone->start) << SECTOR_SHIFT; + cmd->iocb.ki_pos = (cmd->sector - zone->start) << SECTOR_SHIFT; cmd->iocb.ki_filp = zone->file; cmd->iocb.ki_complete = zloop_rw_complete; if (!zlo->buffered_io) @@ -530,18 +558,166 @@ static void zloop_rw(struct zloop_cmd *cmd) cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); if (rw == ITER_SOURCE) - ret = zone->file->f_op->write_iter(&cmd->iocb, &iter); - else - ret = zone->file->f_op->read_iter(&cmd->iocb, &iter); -unlock: - if 
(!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) + return zone->file->f_op->write_iter(&cmd->iocb, &iter); + return zone->file->f_op->read_iter(&cmd->iocb, &iter); +} + +static int zloop_seq_write_prep(struct zloop_cmd *cmd) +{ + struct request *rq = blk_mq_rq_from_pdu(cmd); + struct zloop_device *zlo = rq->q->queuedata; + unsigned int zone_no = rq_zone_no(rq); + sector_t nr_sectors = blk_rq_sectors(rq); + bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND; + struct zloop_zone *zone = &zlo->zones[zone_no]; + sector_t zone_end = zone->start + zlo->zone_capacity; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&zone->wp_lock, flags); + + /* + * Zone append operations always go at the current write pointer, but + * regular write operations must already be aligned to the write pointer + * when submitted. + */ + if (is_append) { + /* + * If ordered zone append is in use, we already checked and set + * the target sector in zloop_queue_rq(). + */ + if (!zlo->ordered_zone_append) { + if (zone->cond == BLK_ZONE_COND_FULL || + zone->wp + nr_sectors > zone_end) { + ret = -EIO; + goto out_unlock; + } + cmd->sector = zone->wp; + } + } else { + if (cmd->sector != zone->wp) { + pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n", + zone_no, cmd->sector, zone->wp); + ret = -EIO; + goto out_unlock; + } + } + + /* Implicitly open the target zone. */ + if (!zloop_do_open_zone(zlo, zone, false)) { + ret = -EIO; + goto out_unlock; + } + + /* + * Advance the write pointer, unless ordered zone append is in use. If + * the write fails, the write pointer position will be corrected when + * the next I/O starts execution. 
+ */ + if (!is_append || !zlo->ordered_zone_append) { + zone->wp += nr_sectors; + if (zone->wp == zone_end) { + zloop_lru_remove_open_zone(zlo, zone); + zone->cond = BLK_ZONE_COND_FULL; + zone->wp = ULLONG_MAX; + } + } +out_unlock: + spin_unlock_irqrestore(&zone->wp_lock, flags); + return ret; +} + +static void zloop_rw(struct zloop_cmd *cmd) +{ + struct request *rq = blk_mq_rq_from_pdu(cmd); + struct zloop_device *zlo = rq->q->queuedata; + unsigned int zone_no = rq_zone_no(rq); + sector_t nr_sectors = blk_rq_sectors(rq); + bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND; + bool is_write = req_op(rq) == REQ_OP_WRITE || is_append; + struct zloop_zone *zone; + int ret = -EIO; + + atomic_set(&cmd->ref, 2); + cmd->sector = blk_rq_pos(rq); + cmd->nr_sectors = nr_sectors; + cmd->ret = 0; + + if (WARN_ON_ONCE(is_append && !zlo->zone_append)) + goto out; + + /* We should never get an I/O beyond the device capacity. */ + if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) + goto out; + + zone = &zlo->zones[zone_no]; + + /* + * The block layer should never send requests that are not fully + * contained within the zone. 
+ */ + if (WARN_ON_ONCE(cmd->sector + nr_sectors > + zone->start + zlo->zone_size)) + goto out; + + if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { + mutex_lock(&zone->lock); + ret = zloop_update_seq_zone(zlo, zone_no); mutex_unlock(&zone->lock); + if (ret) + goto out; + } + + if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) { + mutex_lock(&zone->lock); + ret = zloop_seq_write_prep(cmd); + if (!ret) + ret = zloop_do_rw(cmd); + mutex_unlock(&zone->lock); + } else { + ret = zloop_do_rw(cmd); + } out: if (ret != -EIOCBQUEUED) zloop_rw_complete(&cmd->iocb, ret); zloop_put_cmd(cmd); } +static inline bool zloop_zone_is_active(struct zloop_zone *zone) +{ + switch (zone->cond) { + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_CLOSED: + return true; + default: + return false; + } +} + +static int zloop_record_safe_wps(struct zloop_device *zlo) +{ + unsigned int i; + int ret; + + for (i = 0; i < zlo->nr_zones; i++) { + struct zloop_zone *zone = &zlo->zones[i]; + struct file *file = zone->file; + + if (!zloop_zone_is_active(zone)) + continue; + ret = vfs_setxattr(file_mnt_idmap(file), file_dentry(file), + "user.zloop.wp", &zone->wp, sizeof(zone->wp), 0); + if (ret) { + pr_err("%pg: failed to record write pointer (%d)\n", + zlo->disk->part0, ret); + return ret; + } + } + + return 0; +} + /* * Sync the entire FS containing the zone files instead of walking all files. 
*/ @@ -550,6 +726,12 @@ static int zloop_flush(struct zloop_device *zlo) struct super_block *sb = file_inode(zlo->data_dir)->i_sb; int ret; + if (zlo->discard_write_cache) { + ret = zloop_record_safe_wps(zlo); + if (ret) + return ret; + } + down_read(&sb->s_umount); ret = sync_filesystem(sb); up_read(&sb->s_umount); @@ -692,6 +874,7 @@ static bool zloop_set_zone_append_sector(struct request *rq) rq->__sector = zone->wp; zone->wp += blk_rq_sectors(rq); if (zone->wp >= zone_end) { + zloop_lru_remove_open_zone(zlo, zone); zone->cond = BLK_ZONE_COND_FULL; zone->wp = ULLONG_MAX; } @@ -889,6 +1072,7 @@ static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts, int ret; mutex_init(&zone->lock); + INIT_LIST_HEAD(&zone->open_zone_entry); spin_lock_init(&zone->wp_lock); zone->start = (sector_t)zone_no << zlo->zone_shift; @@ -1009,12 +1193,20 @@ static int zloop_ctl_add(struct zloop_options *opts) goto out; } + if (opts->max_open_zones > nr_zones - opts->nr_conv_zones) { + pr_err("Invalid maximum number of open zones %u\n", + opts->max_open_zones); + goto out; + } + zlo = kvzalloc_flex(*zlo, zones, nr_zones); if (!zlo) { ret = -ENOMEM; goto out; } WRITE_ONCE(zlo->state, Zlo_creating); + spin_lock_init(&zlo->open_zones_lock); + INIT_LIST_HEAD(&zlo->open_zones_lru_list); ret = mutex_lock_killable(&zloop_ctl_mutex); if (ret) @@ -1042,10 +1234,12 @@ static int zloop_ctl_add(struct zloop_options *opts) zlo->zone_capacity = zlo->zone_size; zlo->nr_zones = nr_zones; zlo->nr_conv_zones = opts->nr_conv_zones; + zlo->max_open_zones = opts->max_open_zones; zlo->buffered_io = opts->buffered_io; zlo->zone_append = opts->zone_append; if (zlo->zone_append) zlo->ordered_zone_append = opts->ordered_zone_append; + zlo->discard_write_cache = opts->discard_write_cache; zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE, opts->nr_queues * opts->queue_depth, zlo->id); @@ -1088,6 +1282,7 @@ static int zloop_ctl_add(struct zloop_options *opts) 
lim.logical_block_size = zlo->block_size; if (zlo->zone_append) lim.max_hw_zone_append_sectors = lim.max_hw_sectors; + lim.max_open_zones = zlo->max_open_zones; zlo->tag_set.ops = &zloop_mq_ops; zlo->tag_set.nr_hw_queues = opts->nr_queues; @@ -1168,6 +1363,49 @@ out: return ret; } +static void zloop_truncate(struct file *file, loff_t pos) +{ + struct mnt_idmap *idmap = file_mnt_idmap(file); + struct dentry *dentry = file_dentry(file); + struct iattr newattrs; + + newattrs.ia_size = pos; + newattrs.ia_valid = ATTR_SIZE; + + inode_lock(dentry->d_inode); + notify_change(idmap, dentry, &newattrs, NULL); + inode_unlock(dentry->d_inode); +} + +static void zloop_forget_cache(struct zloop_device *zlo) +{ + unsigned int i; + int ret; + + pr_info("%pg: discarding volatile write cache\n", zlo->disk->part0); + + for (i = 0; i < zlo->nr_zones; i++) { + struct zloop_zone *zone = &zlo->zones[i]; + struct file *file = zone->file; + sector_t old_wp; + + if (!zloop_zone_is_active(zone)) + continue; + + ret = vfs_getxattr(file_mnt_idmap(file), file_dentry(file), + "user.zloop.wp", &old_wp, sizeof(old_wp)); + if (ret == -ENODATA) { + old_wp = 0; + } else if (ret != sizeof(old_wp)) { + pr_err("%pg: failed to retrieve write pointer (%d)\n", + zlo->disk->part0, ret); + continue; + } + if (old_wp < zone->wp) + zloop_truncate(file, old_wp); + } +} + static int zloop_ctl_remove(struct zloop_options *opts) { struct zloop_device *zlo; @@ -1202,6 +1440,10 @@ static int zloop_ctl_remove(struct zloop_options *opts) return ret; del_gendisk(zlo->disk); + + if (zlo->discard_write_cache) + zloop_forget_cache(zlo); + put_disk(zlo->disk); pr_info("Removed device %d\n", opts->id); @@ -1224,6 +1466,7 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf) opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES; opts->zone_size = ZLOOP_DEF_ZONE_SIZE; opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES; + opts->max_open_zones = ZLOOP_DEF_MAX_OPEN_ZONES; opts->nr_queues = 
ZLOOP_DEF_NR_QUEUES; opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH; opts->buffered_io = ZLOOP_DEF_BUFFERED_IO; @@ -1302,6 +1545,13 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf) } opts->nr_conv_zones = token; break; + case ZLOOP_OPT_MAX_OPEN_ZONES: + if (match_uint(args, &token)) { + ret = -EINVAL; + goto out; + } + opts->max_open_zones = token; + break; case ZLOOP_OPT_BASE_DIR: p = match_strdup(args); if (!p) { @@ -1353,6 +1603,9 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf) case ZLOOP_OPT_ORDERED_ZONE_APPEND: opts->ordered_zone_append = true; break; + case ZLOOP_OPT_DISCARD_WRITE_CACHE: + opts->discard_write_cache = true; + break; case ZLOOP_OPT_ERR: default: pr_warn("unknown parameter or missing value '%s'\n", p); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 64bb38c95895..97d9adb0bf96 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1373,6 +1373,14 @@ static CLOSURE_CALLBACK(cached_dev_free) mutex_unlock(&bch_register_lock); + /* + * Wait for any pending sb_write to complete before free. + * The sb_bio is embedded in struct cached_dev, so we must + * ensure no I/O is in progress. + */ + down(&dc->sb_write_mutex); + up(&dc->sb_write_mutex); + if (dc->sb_disk) folio_put(virt_to_folio(dc->sb_disk)); diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c index bf398d7476b3..9e7e6b1a6f15 100644 --- a/drivers/md/md-llbitmap.c +++ b/drivers/md/md-llbitmap.c @@ -208,6 +208,20 @@ enum llbitmap_state { BitNeedSync, /* data is synchronizing */ BitSyncing, + /* + * Proactive sync requested for unwritten region (raid456 only). + * Triggered via sysfs when user wants to pre-build XOR parity + * for regions that have never been written. + */ + BitNeedSyncUnwritten, + /* Proactive sync in progress for unwritten region */ + BitSyncingUnwritten, + /* + * XOR parity has been pre-built for a region that has never had + * user data written. 
When user writes to this region, it transitions + * to BitDirty. + */ + BitCleanUnwritten, BitStateCount, BitNone = 0xff, }; @@ -232,6 +246,12 @@ enum llbitmap_action { * BitNeedSync. */ BitmapActionStale, + /* + * Proactive sync trigger for raid456 - builds XOR parity for + * Unwritten regions without requiring user data write first. + */ + BitmapActionProactiveSync, + BitmapActionClearUnwritten, BitmapActionCount, /* Init state is BitUnwritten */ BitmapActionInit, @@ -304,6 +324,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitNone, [BitmapActionStale] = BitNone, + [BitmapActionProactiveSync] = BitNeedSyncUnwritten, + [BitmapActionClearUnwritten] = BitNone, }, [BitClean] = { [BitmapActionStartwrite] = BitDirty, @@ -314,6 +336,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNeedSync, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitNone, }, [BitDirty] = { [BitmapActionStartwrite] = BitNone, @@ -324,6 +348,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitClean, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNeedSync, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitNone, }, [BitNeedSync] = { [BitmapActionStartwrite] = BitNone, @@ -334,6 +360,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNone, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitNone, }, [BitSyncing] = { [BitmapActionStartwrite] = BitNone, @@ -344,6 +372,44 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNeedSync, + 
[BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitNone, + }, + [BitNeedSyncUnwritten] = { + [BitmapActionStartwrite] = BitNeedSync, + [BitmapActionStartsync] = BitSyncingUnwritten, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitUnwritten, + [BitmapActionReload] = BitUnwritten, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitUnwritten, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitUnwritten, + }, + [BitSyncingUnwritten] = { + [BitmapActionStartwrite] = BitSyncing, + [BitmapActionStartsync] = BitSyncingUnwritten, + [BitmapActionEndsync] = BitCleanUnwritten, + [BitmapActionAbortsync] = BitUnwritten, + [BitmapActionReload] = BitUnwritten, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitUnwritten, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitUnwritten, + }, + [BitCleanUnwritten] = { + [BitmapActionStartwrite] = BitDirty, + [BitmapActionStartsync] = BitNone, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNone, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitUnwritten, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitUnwritten, }, }; @@ -376,6 +442,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap, pctl->state[pos] = level_456 ? 
BitNeedSync : BitDirty; break; case BitClean: + case BitCleanUnwritten: pctl->state[pos] = BitDirty; break; } @@ -383,7 +450,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap, } static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx, - int offset) + int offset, bool infect) { struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; unsigned int io_size = llbitmap->io_size; @@ -398,7 +465,7 @@ static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx, * resync all the dirty bits, hence skip infect new dirty bits to * prevent resync unnecessary data. */ - if (llbitmap->mddev->degraded) { + if (llbitmap->mddev->degraded || !infect) { set_bit(block, pctl->dirty); return; } @@ -438,7 +505,9 @@ static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state, llbitmap->pctl[idx]->state[bit] = state; if (state == BitDirty || state == BitNeedSync) - llbitmap_set_page_dirty(llbitmap, idx, bit); + llbitmap_set_page_dirty(llbitmap, idx, bit, true); + else if (state == BitNeedSyncUnwritten) + llbitmap_set_page_dirty(llbitmap, idx, bit, false); } static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx) @@ -459,7 +528,8 @@ static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx) rdev_for_each(rdev, mddev) { sector_t sector; - if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags) || + !test_bit(In_sync, &rdev->flags)) continue; sector = mddev->bitmap_info.offset + @@ -584,13 +654,73 @@ static int llbitmap_cache_pages(struct llbitmap *llbitmap) return 0; } +/* + * Check if all underlying disks support write_zeroes with unmap. 
+ */ +static bool llbitmap_all_disks_support_wzeroes_unmap(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + if (bdev_write_zeroes_unmap_sectors(rdev->bdev) == 0) + return false; + } + + return true; +} + +/* + * Issue write_zeroes to all underlying disks to zero their data regions. + * This ensures parity consistency for RAID-456 (0 XOR 0 = 0). + * Returns true if all disks were successfully zeroed. + */ +static bool llbitmap_zero_all_disks(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + struct md_rdev *rdev; + sector_t dev_sectors = mddev->dev_sectors; + int ret; + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + ret = blkdev_issue_zeroout(rdev->bdev, + rdev->data_offset, + dev_sectors, + GFP_KERNEL, 0); + if (ret) { + pr_warn("md/llbitmap: failed to zero disk %pg: %d\n", + rdev->bdev, ret); + return false; + } + } + + return true; +} + static void llbitmap_init_state(struct llbitmap *llbitmap) { + struct mddev *mddev = llbitmap->mddev; enum llbitmap_state state = BitUnwritten; unsigned long i; - if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) + if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) { state = BitClean; + } else if (raid_is_456(mddev) && + llbitmap_all_disks_support_wzeroes_unmap(llbitmap)) { + /* + * All disks support write_zeroes with unmap. Zero all disks + * to ensure parity consistency, then set BitCleanUnwritten + * to skip initial sync. 
+ */ + if (llbitmap_zero_all_disks(llbitmap)) + state = BitCleanUnwritten; + } for (i = 0; i < llbitmap->chunks; i++) llbitmap_write(llbitmap, state, i); @@ -626,11 +756,10 @@ static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap, goto write_bitmap; } - if (c == BitNeedSync) + if (c == BitNeedSync || c == BitNeedSyncUnwritten) need_resync = !mddev->degraded; state = state_machine[c][action]; - write_bitmap: if (unlikely(mddev->degraded)) { /* For degraded array, mark new data as need sync. */ @@ -657,8 +786,7 @@ write_bitmap: } llbitmap_write(llbitmap, state, start); - - if (state == BitNeedSync) + if (state == BitNeedSync || state == BitNeedSyncUnwritten) need_resync = !mddev->degraded; else if (state == BitDirty && !timer_pending(&llbitmap->pending_timer)) @@ -1069,12 +1197,12 @@ static void llbitmap_start_write(struct mddev *mddev, sector_t offset, int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; - llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite); - while (page_start <= page_end) { llbitmap_raise_barrier(llbitmap, page_start); page_start++; } + + llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite); } static void llbitmap_end_write(struct mddev *mddev, sector_t offset, @@ -1101,12 +1229,12 @@ static void llbitmap_start_discard(struct mddev *mddev, sector_t offset, int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; - llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard); - while (page_start <= page_end) { llbitmap_raise_barrier(llbitmap, page_start); page_start++; } + + llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard); } static void llbitmap_end_discard(struct mddev *mddev, sector_t offset, @@ -1228,7 +1356,7 @@ static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset) unsigned long p = offset >> llbitmap->chunkshift; enum 
llbitmap_state c = llbitmap_read(llbitmap, p); - return c == BitClean || c == BitDirty; + return c == BitClean || c == BitDirty || c == BitCleanUnwritten; } static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset) @@ -1242,6 +1370,10 @@ static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset) if (c == BitUnwritten) return blocks; + /* Skip CleanUnwritten - no user data, will be reset after recovery */ + if (c == BitCleanUnwritten) + return blocks; + /* For degraded array, don't skip */ if (mddev->degraded) return 0; @@ -1260,14 +1392,25 @@ static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset, { struct llbitmap *llbitmap = mddev->bitmap; unsigned long p = offset >> llbitmap->chunkshift; + enum llbitmap_state state; + + /* + * Before recovery starts, convert CleanUnwritten to Unwritten. + * This ensures the new disk won't have stale parity data. + */ + if (offset == 0 && test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && + !test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionClearUnwritten); + /* * Handle one bit at a time, this is much simpler. And it doesn't matter * if md_do_sync() loop more times. 
*/ *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); - return llbitmap_state_machine(llbitmap, p, p, - BitmapActionStartsync) == BitSyncing; + state = llbitmap_state_machine(llbitmap, p, p, BitmapActionStartsync); + return state == BitSyncing || state == BitSyncingUnwritten; } /* Something is wrong, sync_thread stop at @offset */ @@ -1473,9 +1616,15 @@ static ssize_t bits_show(struct mddev *mddev, char *page) } mutex_unlock(&mddev->bitmap_info.mutex); - return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n", + return sprintf(page, + "unwritten %d\nclean %d\ndirty %d\n" + "need sync %d\nsyncing %d\n" + "need sync unwritten %d\nsyncing unwritten %d\n" + "clean unwritten %d\n", bits[BitUnwritten], bits[BitClean], bits[BitDirty], - bits[BitNeedSync], bits[BitSyncing]); + bits[BitNeedSync], bits[BitSyncing], + bits[BitNeedSyncUnwritten], bits[BitSyncingUnwritten], + bits[BitCleanUnwritten]); } static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits); @@ -1548,11 +1697,39 @@ barrier_idle_store(struct mddev *mddev, const char *buf, size_t len) static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle); +static ssize_t +proactive_sync_store(struct mddev *mddev, const char *buf, size_t len) +{ + struct llbitmap *llbitmap; + + /* Only for RAID-456 */ + if (!raid_is_456(mddev)) + return -EINVAL; + + mutex_lock(&mddev->bitmap_info.mutex); + llbitmap = mddev->bitmap; + if (!llbitmap || !llbitmap->pctl) { + mutex_unlock(&mddev->bitmap_info.mutex); + return -ENODEV; + } + + /* Trigger proactive sync on all Unwritten regions */ + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionProactiveSync); + + mutex_unlock(&mddev->bitmap_info.mutex); + return len; +} + +static struct md_sysfs_entry llbitmap_proactive_sync = + __ATTR(proactive_sync, 0200, NULL, proactive_sync_store); + static struct attribute *md_llbitmap_attrs[] = { &llbitmap_bits.attr, &llbitmap_metadata.attr, 
&llbitmap_daemon_sleep.attr, &llbitmap_barrier_idle.attr, + &llbitmap_proactive_sync.attr, NULL }; diff --git a/drivers/md/md.c b/drivers/md/md.c index 3ce6f9e9d38e..5fb5ae8368ba 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -84,7 +84,6 @@ static DEFINE_XARRAY(md_submodule); static const struct kobj_type md_ktype; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); -static struct workqueue_struct *md_wq; /* * This workqueue is used for sync_work to register new sync_thread, and for @@ -98,7 +97,7 @@ static struct workqueue_struct *md_misc_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); -static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); +static void export_rdev(struct md_rdev *rdev); static void md_wakeup_thread_directly(struct md_thread __rcu **thread); /* @@ -188,7 +187,6 @@ static int rdev_init_serial(struct md_rdev *rdev) spin_lock_init(&serial_tmp->serial_lock); serial_tmp->serial_rb = RB_ROOT_CACHED; - init_waitqueue_head(&serial_tmp->serial_io_wait); } rdev->serial = serial; @@ -489,6 +487,17 @@ int mddev_suspend(struct mddev *mddev, bool interruptible) } percpu_ref_kill(&mddev->active_io); + + /* + * RAID456 IO can sleep in wait_for_reshape while still holding an + * active_io reference. If reshape is already interrupted or frozen, + * wake those waiters so they can abort and drop the reference instead + * of deadlocking suspend. 
+ */ + if (mddev->pers && mddev->pers->prepare_suspend && + reshape_interrupted(mddev)) + mddev->pers->prepare_suspend(mddev); + if (interruptible) err = wait_event_interruptible(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); @@ -959,7 +968,7 @@ void mddev_unlock(struct mddev *mddev) list_for_each_entry_safe(rdev, tmp, &delete, same_set) { list_del_init(&rdev->same_set); kobject_del(&rdev->kobj); - export_rdev(rdev, mddev); + export_rdev(rdev); } if (!legacy_async_del_gendisk) { @@ -2632,7 +2641,7 @@ void md_autodetect_dev(dev_t dev); /* just for claiming the bdev */ static struct md_rdev claim_rdev; -static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) +static void export_rdev(struct md_rdev *rdev) { pr_debug("md: export_rdev(%pg)\n", rdev->bdev); md_rdev_clear(rdev); @@ -2788,7 +2797,9 @@ void md_update_sb(struct mddev *mddev, int force_change) if (!md_is_rdwr(mddev)) { if (force_change) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - pr_err("%s: can't update sb for read-only array %s\n", __func__, mdname(mddev)); + if (!mddev_is_dm(mddev)) + pr_err_ratelimited("%s: can't update sb for read-only array %s\n", + __func__, mdname(mddev)); return; } @@ -4848,7 +4859,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) err = bind_rdev_to_array(rdev, mddev); out: if (err) - export_rdev(rdev, mddev); + export_rdev(rdev); mddev_unlock_and_resume(mddev); if (!err) md_new_event(); @@ -6128,10 +6139,16 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, } spin_unlock(&all_mddevs_lock); rv = entry->store(mddev, page, length); - mddev_put(mddev); + /* + * For "array_state=clear", dropping the extra kobject reference from + * sysfs_break_active_protection() can trigger md kobject deletion. + * Restore active protection before mddev_put() so deletion happens + * after the sysfs write path fully unwinds. 
+ */ if (kn) sysfs_unbreak_active_protection(kn); + mddev_put(mddev); return rv; } @@ -6447,15 +6464,124 @@ static void md_safemode_timeout(struct timer_list *t) static int start_dirty_degraded; +/* + * Read bitmap superblock and return the bitmap_id based on disk version. + * This is used as fallback when default bitmap version and on-disk version + * doesn't match, and mdadm is not the latest version to set bitmap_type. + */ +static enum md_submodule_id md_bitmap_get_id_from_sb(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct page *sb_page; + bitmap_super_t *sb; + enum md_submodule_id id = ID_BITMAP_NONE; + sector_t sector; + u32 version; + + if (!mddev->bitmap_info.offset) + return ID_BITMAP_NONE; + + sb_page = alloc_page(GFP_KERNEL); + if (!sb_page) { + pr_warn("md: %s: failed to allocate memory for bitmap\n", + mdname(mddev)); + return ID_BITMAP_NONE; + } + + sector = mddev->bitmap_info.offset; + + rdev_for_each(rdev, mddev) { + u32 iosize; + + if (!test_bit(In_sync, &rdev->flags) || + test_bit(Faulty, &rdev->flags) || + test_bit(Bitmap_sync, &rdev->flags)) + continue; + + iosize = roundup(sizeof(bitmap_super_t), + bdev_logical_block_size(rdev->bdev)); + if (sync_page_io(rdev, sector, iosize, sb_page, REQ_OP_READ, + true)) + goto read_ok; + } + pr_warn("md: %s: failed to read bitmap from any device\n", + mdname(mddev)); + goto out; + +read_ok: + sb = kmap_local_page(sb_page); + if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) { + pr_warn("md: %s: invalid bitmap magic 0x%x\n", + mdname(mddev), le32_to_cpu(sb->magic)); + goto out_unmap; + } + + version = le32_to_cpu(sb->version); + switch (version) { + case BITMAP_MAJOR_LO: + case BITMAP_MAJOR_HI: + case BITMAP_MAJOR_CLUSTERED: + id = ID_BITMAP; + break; + case BITMAP_MAJOR_LOCKLESS: + id = ID_LLBITMAP; + break; + default: + pr_warn("md: %s: unknown bitmap version %u\n", + mdname(mddev), version); + break; + } + +out_unmap: + kunmap_local(sb); +out: + __free_page(sb_page); + return id; +} + static int 
md_bitmap_create(struct mddev *mddev) { + enum md_submodule_id orig_id = mddev->bitmap_id; + enum md_submodule_id sb_id; + int err; + if (mddev->bitmap_id == ID_BITMAP_NONE) return -EINVAL; if (!mddev_set_bitmap_ops(mddev)) return -ENOENT; - return mddev->bitmap_ops->create(mddev); + err = mddev->bitmap_ops->create(mddev); + if (!err) + return 0; + + /* + * Create failed, if default bitmap version and on-disk version + * doesn't match, and mdadm is not the latest version to set + * bitmap_type, set bitmap_ops based on the disk version. + */ + mddev_clear_bitmap_ops(mddev); + + sb_id = md_bitmap_get_id_from_sb(mddev); + if (sb_id == ID_BITMAP_NONE || sb_id == orig_id) + return err; + + pr_info("md: %s: bitmap version mismatch, switching from %d to %d\n", + mdname(mddev), orig_id, sb_id); + + mddev->bitmap_id = sb_id; + if (!mddev_set_bitmap_ops(mddev)) { + mddev->bitmap_id = orig_id; + return -ENOENT; + } + + err = mddev->bitmap_ops->create(mddev); + if (err) { + mddev_clear_bitmap_ops(mddev); + mddev->bitmap_id = orig_id; + } + + return err; } static void md_bitmap_destroy(struct mddev *mddev) @@ -7140,7 +7266,7 @@ static void autorun_devices(int part) rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); if (bind_rdev_to_array(rdev, mddev)) - export_rdev(rdev, mddev); + export_rdev(rdev); } autorun_array(mddev); mddev_unlock_and_resume(mddev); @@ -7150,7 +7276,7 @@ static void autorun_devices(int part) */ rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); - export_rdev(rdev, mddev); + export_rdev(rdev); } mddev_put(mddev); } @@ -7338,13 +7464,13 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) pr_warn("md: %pg has different UUID to %pg\n", rdev->bdev, rdev0->bdev); - export_rdev(rdev, mddev); + export_rdev(rdev); return -EINVAL; } } err = bind_rdev_to_array(rdev, mddev); if (err) - export_rdev(rdev, mddev); + export_rdev(rdev); return err; } @@ -7387,7 +7513,7 @@ int md_add_new_disk(struct 
mddev *mddev, struct mdu_disk_info_s *info) /* This was a hot-add request, but events doesn't * match, so reject it. */ - export_rdev(rdev, mddev); + export_rdev(rdev); return -EINVAL; } @@ -7413,7 +7539,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) } } if (has_journal || mddev->bitmap) { - export_rdev(rdev, mddev); + export_rdev(rdev); return -EBUSY; } set_bit(Journal, &rdev->flags); @@ -7428,7 +7554,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) /* --add initiated by this node */ err = mddev->cluster_ops->add_new_disk(mddev, rdev); if (err) { - export_rdev(rdev, mddev); + export_rdev(rdev); return err; } } @@ -7438,7 +7564,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) err = bind_rdev_to_array(rdev, mddev); if (err) - export_rdev(rdev, mddev); + export_rdev(rdev); if (mddev_is_clustered(mddev)) { if (info->state & (1 << MD_DISK_CANDIDATE)) { @@ -7501,7 +7627,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) err = bind_rdev_to_array(rdev, mddev); if (err) { - export_rdev(rdev, mddev); + export_rdev(rdev); return err; } } @@ -7613,7 +7739,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) return 0; abort_export: - export_rdev(rdev, mddev); + export_rdev(rdev); return err; } @@ -10503,10 +10629,6 @@ static int __init md_init(void) goto err_bitmap; ret = -ENOMEM; - md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM | WQ_PERCPU, 0); - if (!md_wq) - goto err_wq; - md_misc_wq = alloc_workqueue("md_misc", WQ_PERCPU, 0); if (!md_misc_wq) goto err_misc_wq; @@ -10531,8 +10653,6 @@ err_mdp: err_md: destroy_workqueue(md_misc_wq); err_misc_wq: - destroy_workqueue(md_wq); -err_wq: md_llbitmap_exit(); err_bitmap: md_bitmap_exit(); @@ -10841,7 +10961,6 @@ static __exit void md_exit(void) spin_unlock(&all_mddevs_lock); destroy_workqueue(md_misc_wq); - destroy_workqueue(md_wq); md_bitmap_exit(); } diff --git a/drivers/md/md.h b/drivers/md/md.h index 
ac84289664cd..d6f5482e2479 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -126,7 +126,6 @@ enum sync_action { struct serial_in_rdev { struct rb_root_cached serial_rb; spinlock_t serial_lock; - wait_queue_head_t serial_io_wait; }; /* @@ -381,7 +380,11 @@ struct serial_info { struct rb_node node; sector_t start; /* start sector of rb node */ sector_t last; /* end sector of rb node */ + sector_t wnode_start; /* address of waiting nodes on the same list */ sector_t _subtree_last; /* highest sector in subtree of rb node */ + struct list_head list_node; + struct list_head waiters; + struct completion ready; }; /* diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ef0045db409f..5e38a51e349a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -143,13 +143,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } err = -ENOMEM; - conf->strip_zone = kzalloc_objs(struct strip_zone, conf->nr_strip_zones); + conf->strip_zone = kvzalloc_objs(struct strip_zone, conf->nr_strip_zones); if (!conf->strip_zone) goto abort; - conf->devlist = kzalloc(array3_size(sizeof(struct md_rdev *), - conf->nr_strip_zones, - mddev->raid_disks), - GFP_KERNEL); + conf->devlist = kvzalloc(array3_size(sizeof(struct md_rdev *), + conf->nr_strip_zones, + mddev->raid_disks), + GFP_KERNEL); if (!conf->devlist) goto abort; @@ -291,8 +291,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) return 0; abort: - kfree(conf->strip_zone); - kfree(conf->devlist); + kvfree(conf->strip_zone); + kvfree(conf->devlist); kfree(conf); *private_conf = ERR_PTR(err); return err; @@ -373,8 +373,8 @@ static void raid0_free(struct mddev *mddev, void *priv) { struct r0conf *conf = priv; - kfree(conf->strip_zone); - kfree(conf->devlist); + kvfree(conf->strip_zone); + kvfree(conf->devlist); kfree(conf); } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 181400e147c0..ba91f7e61920 100644 --- a/drivers/md/raid1.c +++ 
b/drivers/md/raid1.c @@ -57,21 +57,29 @@ INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last, START, LAST, static inline, raid1_rb); static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio, - struct serial_info *si, int idx) + struct serial_info *si) { unsigned long flags; int ret = 0; sector_t lo = r1_bio->sector; - sector_t hi = lo + r1_bio->sectors; + sector_t hi = lo + r1_bio->sectors - 1; + int idx = sector_to_idx(r1_bio->sector); struct serial_in_rdev *serial = &rdev->serial[idx]; + struct serial_info *head_si; spin_lock_irqsave(&serial->serial_lock, flags); /* collision happened */ - if (raid1_rb_iter_first(&serial->serial_rb, lo, hi)) - ret = -EBUSY; - else { + head_si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); + if (head_si && head_si != si) { si->start = lo; si->last = hi; + si->wnode_start = head_si->wnode_start; + list_add_tail(&si->list_node, &head_si->waiters); + ret = -EBUSY; + } else if (!head_si) { + si->start = lo; + si->last = hi; + si->wnode_start = si->start; raid1_rb_insert(si, &serial->serial_rb); } spin_unlock_irqrestore(&serial->serial_lock, flags); @@ -83,19 +91,22 @@ static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio) { struct mddev *mddev = rdev->mddev; struct serial_info *si; - int idx = sector_to_idx(r1_bio->sector); - struct serial_in_rdev *serial = &rdev->serial[idx]; if (WARN_ON(!mddev->serial_info_pool)) return; si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); - wait_event(serial->serial_io_wait, - check_and_add_serial(rdev, r1_bio, si, idx) == 0); + INIT_LIST_HEAD(&si->waiters); + INIT_LIST_HEAD(&si->list_node); + init_completion(&si->ready); + while (check_and_add_serial(rdev, r1_bio, si)) { + wait_for_completion(&si->ready); + reinit_completion(&si->ready); + } } static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { - struct serial_info *si; + struct serial_info *si, *iter_si; unsigned long flags; int found = 0; struct mddev 
*mddev = rdev->mddev; @@ -106,16 +117,28 @@ static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); si; si = raid1_rb_iter_next(si, lo, hi)) { if (si->start == lo && si->last == hi) { - raid1_rb_remove(si, &serial->serial_rb); - mempool_free(si, mddev->serial_info_pool); found = 1; break; } } - if (!found) + if (found) { + raid1_rb_remove(si, &serial->serial_rb); + if (!list_empty(&si->waiters)) { + list_for_each_entry(iter_si, &si->waiters, list_node) { + if (iter_si->wnode_start == si->wnode_start) { + list_del_init(&iter_si->list_node); + list_splice_init(&si->waiters, &iter_si->waiters); + raid1_rb_insert(iter_si, &serial->serial_rb); + complete(&iter_si->ready); + break; + } + } + } + mempool_free(si, mddev->serial_info_pool); + } else { WARN(1, "The write IO is not recorded for serialization\n"); + } spin_unlock_irqrestore(&serial->serial_lock, flags); - wake_up(&serial->serial_io_wait); } /* @@ -452,7 +475,7 @@ static void raid1_end_write_request(struct bio *bio) int mirror = find_bio_disk(r1_bio, bio); struct md_rdev *rdev = conf->mirrors[mirror].rdev; sector_t lo = r1_bio->sector; - sector_t hi = r1_bio->sector + r1_bio->sectors; + sector_t hi = r1_bio->sector + r1_bio->sectors - 1; bool ignore_error = !raid1_should_handle_error(bio) || (bio->bi_status && bio_op(bio) == REQ_OP_DISCARD); @@ -1878,7 +1901,7 @@ static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk, if (info->rdev) return false; - if (bdev_nonrot(rdev->bdev)) { + if (!bdev_rot(rdev->bdev)) { set_bit(Nonrot, &rdev->flags); WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0653b5d8545a..4901ebe45c87 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -806,7 +806,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, if (!do_balance) break; - nonrot = bdev_nonrot(rdev->bdev); + nonrot = !bdev_rot(rdev->bdev); 
has_nonrot_disk |= nonrot; pending = atomic_read(&rdev->nr_pending); if (min_pending > pending && nonrot) { @@ -1184,7 +1184,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, } if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) { - raid_end_bio_io(r10_bio); + free_r10bio(r10_bio); return; } @@ -1372,7 +1372,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, sectors = r10_bio->sectors; if (!regular_request_wait(mddev, conf, bio, sectors)) { - raid_end_bio_io(r10_bio); + free_r10bio(r10_bio); return; } diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 66b10cbda96d..7b7546bfa21f 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2002,15 +2002,27 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, return -ENOMEM; while (mb_offset < le32_to_cpu(mb->meta_size)) { + sector_t payload_len; + payload = (void *)mb + mb_offset; payload_flush = (void *)mb + mb_offset; if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { + payload_len = sizeof(struct r5l_payload_data_parity) + + (sector_t)sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + if (mb_offset + payload_len > le32_to_cpu(mb->meta_size)) + goto mismatch; if (r5l_recovery_verify_data_checksum( log, ctx, page, log_offset, payload->checksum[0]) < 0) goto mismatch; } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) { + payload_len = sizeof(struct r5l_payload_data_parity) + + (sector_t)sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + if (mb_offset + payload_len > le32_to_cpu(mb->meta_size)) + goto mismatch; if (r5l_recovery_verify_data_checksum( log, ctx, page, log_offset, payload->checksum[0]) < 0) @@ -2023,22 +2035,18 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, payload->checksum[1]) < 0) goto mismatch; } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { - /* nothing to do for R5LOG_PAYLOAD_FLUSH 
here */ + payload_len = sizeof(struct r5l_payload_flush) + + (sector_t)le32_to_cpu(payload_flush->size); + if (mb_offset + payload_len > le32_to_cpu(mb->meta_size)) + goto mismatch; } else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */ goto mismatch; - if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { - mb_offset += sizeof(struct r5l_payload_flush) + - le32_to_cpu(payload_flush->size); - } else { - /* DATA or PARITY payload */ + if (le16_to_cpu(payload->header.type) != R5LOG_PAYLOAD_FLUSH) { log_offset = r5l_ring_add(log, log_offset, le32_to_cpu(payload->size)); - mb_offset += sizeof(struct r5l_payload_data_parity) + - sizeof(__le32) * - (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); } - + mb_offset += payload_len; } put_page(page); @@ -2089,6 +2097,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); while (mb_offset < le32_to_cpu(mb->meta_size)) { + sector_t payload_len; int dd; payload = (void *)mb + mb_offset; @@ -2097,6 +2106,12 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { int i, count; + payload_len = sizeof(struct r5l_payload_flush) + + (sector_t)le32_to_cpu(payload_flush->size); + if (mb_offset + payload_len > + le32_to_cpu(mb->meta_size)) + return -EINVAL; + count = le32_to_cpu(payload_flush->size) / sizeof(__le64); for (i = 0; i < count; ++i) { stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]); @@ -2110,12 +2125,17 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, } } - mb_offset += sizeof(struct r5l_payload_flush) + - le32_to_cpu(payload_flush->size); + mb_offset += payload_len; continue; } /* DATA or PARITY payload */ + payload_len = sizeof(struct r5l_payload_data_parity) + + (sector_t)sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + if (mb_offset + payload_len > le32_to_cpu(mb->meta_size)) + return -EINVAL; + stripe_sect = (le16_to_cpu(payload->header.type) == 
R5LOG_PAYLOAD_DATA) ? raid5_compute_sector( conf, le64_to_cpu(payload->location), 0, &dd, @@ -2180,9 +2200,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, log_offset = r5l_ring_add(log, log_offset, le32_to_cpu(payload->size)); - mb_offset += sizeof(struct r5l_payload_data_parity) + - sizeof(__le32) * - (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + mb_offset += payload_len; } return 0; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a8e8d431071b..6e79829c5acb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3916,6 +3916,8 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, break; } BUG_ON(other < 0); + if (test_bit(R5_LOCKED, &sh->dev[other].flags)) + return 0; pr_debug("Computing stripe %llu blocks %d,%d\n", (unsigned long long)sh->sector, disk_idx, other); @@ -4594,20 +4596,6 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) async_tx_quiesce(&tx); } -/* - * handle_stripe - do things to a stripe. - * - * We lock the stripe by setting STRIPE_ACTIVE and then examine the - * state of various bits to see what needs to be done. - * Possible results: - * return some read requests which now have data - * return some write requests which are safely on storage - * schedule a read on some buffers - * schedule a write of some buffers - * return confirmation of parity correctness - * - */ - static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) { struct r5conf *conf = sh->raid_conf; @@ -4901,6 +4889,18 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, set_bit(STRIPE_HANDLE, &head_sh->state); } +/* + * handle_stripe - do things to a stripe. + * + * We lock the stripe by setting STRIPE_ACTIVE and then examine the + * state of various bits to see what needs to be done. 
+ * Possible results: + * return some read requests which now have data + * return some write requests which are safely on storage + * schedule a read on some buffers + * schedule a write of some buffers + * return confirmation of parity correctness + */ static void handle_stripe(struct stripe_head *sh) { struct stripe_head_state s; @@ -6641,7 +6641,13 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, } if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { - raid5_release_stripe(sh); + int hash; + + spin_lock_irq(&conf->device_lock); + hash = sh->hash_lock_index; + __release_stripe(conf, sh, + &conf->temp_inactive_list[hash]); + spin_unlock_irq(&conf->device_lock); conf->retry_read_aligned = raid_bio; conf->retry_read_offset = scnt; return handled; @@ -7541,7 +7547,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) rdev_for_each(rdev, mddev) { if (test_bit(Journal, &rdev->flags)) continue; - if (bdev_nonrot(rdev->bdev)) { + if (!bdev_rot(rdev->bdev)) { conf->batch_bio_dispatch = false; break; } @@ -7780,6 +7786,7 @@ static int raid5_set_limits(struct mddev *mddev) lim.logical_block_size = mddev->logical_block_size; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded); + lim.chunk_sectors = lim.io_opt >> 9; lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; lim.discard_granularity = stripe; lim.max_write_zeroes_sectors = 0; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 110b1c2d0a86..1c7b710fc9c1 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -801,7 +801,6 @@ raid5_get_dev_page(struct stripe_head *sh, int disk_idx) } #endif -void md_raid5_kick_device(struct r5conf *conf); int raid5_set_cache_size(struct mddev *mddev, int size); sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous); void raid5_release_stripe(struct stripe_head *sh); diff --git a/drivers/nvme/common/.kunitconfig b/drivers/nvme/common/.kunitconfig new file mode 100644 
index 000000000000..60a038dc9423 --- /dev/null +++ b/drivers/nvme/common/.kunitconfig @@ -0,0 +1,6 @@ +CONFIG_KUNIT=y +CONFIG_PCI=y +CONFIG_BLOCK=y +CONFIG_BLK_DEV_NVME=y +CONFIG_NVME_HOST_AUTH=y +CONFIG_NVME_AUTH_KUNIT_TEST=y diff --git a/drivers/nvme/common/Kconfig b/drivers/nvme/common/Kconfig index da963e4f3f1f..f1639db65fd3 100644 --- a/drivers/nvme/common/Kconfig +++ b/drivers/nvme/common/Kconfig @@ -7,9 +7,15 @@ config NVME_KEYRING config NVME_AUTH tristate select CRYPTO - select CRYPTO_HMAC - select CRYPTO_SHA256 - select CRYPTO_SHA512 select CRYPTO_DH select CRYPTO_DH_RFC7919_GROUPS - select CRYPTO_HKDF + select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_SHA512 + +config NVME_AUTH_KUNIT_TEST + tristate "KUnit tests for NVMe authentication" if !KUNIT_ALL_TESTS + depends on KUNIT && NVME_AUTH + default KUNIT_ALL_TESTS + help + Enable KUnit tests for some of the common code for NVMe over Fabrics + In-Band Authentication. diff --git a/drivers/nvme/common/Makefile b/drivers/nvme/common/Makefile index 681514cf2e2f..fd9d01a60946 100644 --- a/drivers/nvme/common/Makefile +++ b/drivers/nvme/common/Makefile @@ -7,3 +7,5 @@ obj-$(CONFIG_NVME_KEYRING) += nvme-keyring.o nvme-auth-y += auth.o nvme-keyring-y += keyring.o + +obj-$(CONFIG_NVME_AUTH_KUNIT_TEST) += tests/auth_kunit.o diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index e07e7d4bf8b6..2d325fb93083 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -9,14 +9,11 @@ #include #include #include -#include #include -#include +#include #include #include -#define HKDF_MAX_HASHLEN 64 - static u32 nvme_dhchap_seqnum; static DEFINE_MUTEX(nvme_dhchap_mutex); @@ -38,9 +35,9 @@ u32 nvme_auth_get_seqnum(void) } EXPORT_SYMBOL_GPL(nvme_auth_get_seqnum); -static struct nvme_auth_dhgroup_map { - const char name[16]; - const char kpp[16]; +static const struct nvme_auth_dhgroup_map { + char name[16]; + char kpp[16]; } dhgroup_map[] = { [NVME_AUTH_DHGROUP_NULL] = { .name = "null", .kpp = 
"null" }, @@ -89,25 +86,21 @@ u8 nvme_auth_dhgroup_id(const char *dhgroup_name) } EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_id); -static struct nvme_dhchap_hash_map { +static const struct nvme_dhchap_hash_map { int len; - const char hmac[15]; - const char digest[8]; + char hmac[15]; } hash_map[] = { [NVME_AUTH_HASH_SHA256] = { .len = 32, .hmac = "hmac(sha256)", - .digest = "sha256", }, [NVME_AUTH_HASH_SHA384] = { .len = 48, .hmac = "hmac(sha384)", - .digest = "sha384", }, [NVME_AUTH_HASH_SHA512] = { .len = 64, .hmac = "hmac(sha512)", - .digest = "sha512", }, }; @@ -119,14 +112,6 @@ const char *nvme_auth_hmac_name(u8 hmac_id) } EXPORT_SYMBOL_GPL(nvme_auth_hmac_name); -const char *nvme_auth_digest_name(u8 hmac_id) -{ - if (hmac_id >= ARRAY_SIZE(hash_map)) - return NULL; - return hash_map[hmac_id].digest; -} -EXPORT_SYMBOL_GPL(nvme_auth_digest_name); - u8 nvme_auth_hmac_id(const char *hmac_name) { int i; @@ -161,11 +146,10 @@ u32 nvme_auth_key_struct_size(u32 key_len) } EXPORT_SYMBOL_GPL(nvme_auth_key_struct_size); -struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret, - u8 key_hash) +struct nvme_dhchap_key *nvme_auth_extract_key(const char *secret, u8 key_hash) { struct nvme_dhchap_key *key; - unsigned char *p; + const char *p; u32 crc; int ret, key_len; size_t allocated_len = strlen(secret); @@ -183,14 +167,14 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret, pr_debug("base64 key decoding error %d\n", key_len); ret = key_len; - goto out_free_secret; + goto out_free_key; } if (key_len != 36 && key_len != 52 && key_len != 68) { pr_err("Invalid key len %d\n", key_len); ret = -EINVAL; - goto out_free_secret; + goto out_free_key; } /* The last four bytes is the CRC in little-endian format */ @@ -205,12 +189,12 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret, pr_err("key crc mismatch (key %08x, crc %08x)\n", get_unaligned_le32(key->key + key_len), crc); ret = -EKEYREJECTED; - goto out_free_secret; + goto out_free_key; 
} key->len = key_len; key->hash = key_hash; return key; -out_free_secret: +out_free_key: nvme_auth_free_key(key); return ERR_PTR(ret); } @@ -237,12 +221,106 @@ void nvme_auth_free_key(struct nvme_dhchap_key *key) } EXPORT_SYMBOL_GPL(nvme_auth_free_key); -struct nvme_dhchap_key *nvme_auth_transform_key( - struct nvme_dhchap_key *key, char *nqn) +/* + * Start computing an HMAC value, given the algorithm ID and raw key. + * + * The context should be zeroized at the end of its lifetime. The caller can do + * that implicitly by calling nvme_auth_hmac_final(), or explicitly (needed when + * a context is abandoned without finalizing it) by calling memzero_explicit(). + */ +int nvme_auth_hmac_init(struct nvme_auth_hmac_ctx *hmac, u8 hmac_id, + const u8 *key, size_t key_len) { - const char *hmac_name; - struct crypto_shash *key_tfm; - SHASH_DESC_ON_STACK(shash, key_tfm); + hmac->hmac_id = hmac_id; + switch (hmac_id) { + case NVME_AUTH_HASH_SHA256: + hmac_sha256_init_usingrawkey(&hmac->sha256, key, key_len); + return 0; + case NVME_AUTH_HASH_SHA384: + hmac_sha384_init_usingrawkey(&hmac->sha384, key, key_len); + return 0; + case NVME_AUTH_HASH_SHA512: + hmac_sha512_init_usingrawkey(&hmac->sha512, key, key_len); + return 0; + } + pr_warn("%s: invalid hash algorithm %d\n", __func__, hmac_id); + return -EINVAL; +} +EXPORT_SYMBOL_GPL(nvme_auth_hmac_init); + +void nvme_auth_hmac_update(struct nvme_auth_hmac_ctx *hmac, const u8 *data, + size_t data_len) +{ + switch (hmac->hmac_id) { + case NVME_AUTH_HASH_SHA256: + hmac_sha256_update(&hmac->sha256, data, data_len); + return; + case NVME_AUTH_HASH_SHA384: + hmac_sha384_update(&hmac->sha384, data, data_len); + return; + case NVME_AUTH_HASH_SHA512: + hmac_sha512_update(&hmac->sha512, data, data_len); + return; + } + /* Unreachable because nvme_auth_hmac_init() validated hmac_id */ + WARN_ON_ONCE(1); +} +EXPORT_SYMBOL_GPL(nvme_auth_hmac_update); + +/* Finish computing an HMAC value. Note that this zeroizes the HMAC context. 
*/ +void nvme_auth_hmac_final(struct nvme_auth_hmac_ctx *hmac, u8 *out) +{ + switch (hmac->hmac_id) { + case NVME_AUTH_HASH_SHA256: + hmac_sha256_final(&hmac->sha256, out); + return; + case NVME_AUTH_HASH_SHA384: + hmac_sha384_final(&hmac->sha384, out); + return; + case NVME_AUTH_HASH_SHA512: + hmac_sha512_final(&hmac->sha512, out); + return; + } + /* Unreachable because nvme_auth_hmac_init() validated hmac_id */ + WARN_ON_ONCE(1); +} +EXPORT_SYMBOL_GPL(nvme_auth_hmac_final); + +static int nvme_auth_hmac(u8 hmac_id, const u8 *key, size_t key_len, + const u8 *data, size_t data_len, u8 *out) +{ + struct nvme_auth_hmac_ctx hmac; + int ret; + + ret = nvme_auth_hmac_init(&hmac, hmac_id, key, key_len); + if (ret == 0) { + nvme_auth_hmac_update(&hmac, data, data_len); + nvme_auth_hmac_final(&hmac, out); + } + return ret; +} + +static int nvme_auth_hash(u8 hmac_id, const u8 *data, size_t data_len, u8 *out) +{ + switch (hmac_id) { + case NVME_AUTH_HASH_SHA256: + sha256(data, data_len, out); + return 0; + case NVME_AUTH_HASH_SHA384: + sha384(data, data_len, out); + return 0; + case NVME_AUTH_HASH_SHA512: + sha512(data, data_len, out); + return 0; + } + pr_warn("%s: invalid hash algorithm %d\n", __func__, hmac_id); + return -EINVAL; +} + +struct nvme_dhchap_key *nvme_auth_transform_key( + const struct nvme_dhchap_key *key, const char *nqn) +{ + struct nvme_auth_hmac_ctx hmac; struct nvme_dhchap_key *transformed_key; int ret, key_len; @@ -257,118 +335,33 @@ struct nvme_dhchap_key *nvme_auth_transform_key( return ERR_PTR(-ENOMEM); return transformed_key; } - hmac_name = nvme_auth_hmac_name(key->hash); - if (!hmac_name) { - pr_warn("Invalid key hash id %d\n", key->hash); - return ERR_PTR(-EINVAL); - } - - key_tfm = crypto_alloc_shash(hmac_name, 0, 0); - if (IS_ERR(key_tfm)) - return ERR_CAST(key_tfm); - - key_len = crypto_shash_digestsize(key_tfm); + ret = nvme_auth_hmac_init(&hmac, key->hash, key->key, key->len); + if (ret) + return ERR_PTR(ret); + key_len = 
nvme_auth_hmac_hash_len(key->hash); transformed_key = nvme_auth_alloc_key(key_len, key->hash); if (!transformed_key) { - ret = -ENOMEM; - goto out_free_key; + memzero_explicit(&hmac, sizeof(hmac)); + return ERR_PTR(-ENOMEM); } - - shash->tfm = key_tfm; - ret = crypto_shash_setkey(key_tfm, key->key, key->len); - if (ret < 0) - goto out_free_transformed_key; - ret = crypto_shash_init(shash); - if (ret < 0) - goto out_free_transformed_key; - ret = crypto_shash_update(shash, nqn, strlen(nqn)); - if (ret < 0) - goto out_free_transformed_key; - ret = crypto_shash_update(shash, "NVMe-over-Fabrics", 17); - if (ret < 0) - goto out_free_transformed_key; - ret = crypto_shash_final(shash, transformed_key->key); - if (ret < 0) - goto out_free_transformed_key; - - crypto_free_shash(key_tfm); - + nvme_auth_hmac_update(&hmac, nqn, strlen(nqn)); + nvme_auth_hmac_update(&hmac, "NVMe-over-Fabrics", 17); + nvme_auth_hmac_final(&hmac, transformed_key->key); return transformed_key; - -out_free_transformed_key: - nvme_auth_free_key(transformed_key); -out_free_key: - crypto_free_shash(key_tfm); - - return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(nvme_auth_transform_key); -static int nvme_auth_hash_skey(int hmac_id, u8 *skey, size_t skey_len, u8 *hkey) +int nvme_auth_augmented_challenge(u8 hmac_id, const u8 *skey, size_t skey_len, + const u8 *challenge, u8 *aug, size_t hlen) { - const char *digest_name; - struct crypto_shash *tfm; + u8 hashed_key[NVME_AUTH_MAX_DIGEST_SIZE]; int ret; - digest_name = nvme_auth_digest_name(hmac_id); - if (!digest_name) { - pr_debug("%s: failed to get digest for %d\n", __func__, - hmac_id); - return -EINVAL; - } - tfm = crypto_alloc_shash(digest_name, 0, 0); - if (IS_ERR(tfm)) - return -ENOMEM; - - ret = crypto_shash_tfm_digest(tfm, skey, skey_len, hkey); - if (ret < 0) - pr_debug("%s: Failed to hash digest len %zu\n", __func__, - skey_len); - - crypto_free_shash(tfm); - return ret; -} - -int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len, - u8 
*challenge, u8 *aug, size_t hlen) -{ - struct crypto_shash *tfm; - u8 *hashed_key; - const char *hmac_name; - int ret; - - hashed_key = kmalloc(hlen, GFP_KERNEL); - if (!hashed_key) - return -ENOMEM; - - ret = nvme_auth_hash_skey(hmac_id, skey, - skey_len, hashed_key); - if (ret < 0) - goto out_free_key; - - hmac_name = nvme_auth_hmac_name(hmac_id); - if (!hmac_name) { - pr_warn("%s: invalid hash algorithm %d\n", - __func__, hmac_id); - ret = -EINVAL; - goto out_free_key; - } - - tfm = crypto_alloc_shash(hmac_name, 0, 0); - if (IS_ERR(tfm)) { - ret = PTR_ERR(tfm); - goto out_free_key; - } - - ret = crypto_shash_setkey(tfm, hashed_key, hlen); + ret = nvme_auth_hash(hmac_id, skey, skey_len, hashed_key); if (ret) - goto out_free_hash; - - ret = crypto_shash_tfm_digest(tfm, challenge, hlen, aug); -out_free_hash: - crypto_free_shash(tfm); -out_free_key: - kfree_sensitive(hashed_key); + return ret; + ret = nvme_auth_hmac(hmac_id, hashed_key, hlen, challenge, hlen, aug); + memzero_explicit(hashed_key, sizeof(hashed_key)); return ret; } EXPORT_SYMBOL_GPL(nvme_auth_augmented_challenge); @@ -411,7 +404,7 @@ int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm, EXPORT_SYMBOL_GPL(nvme_auth_gen_pubkey); int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, - u8 *ctrl_key, size_t ctrl_key_len, + const u8 *ctrl_key, size_t ctrl_key_len, u8 *sess_key, size_t sess_key_len) { struct kpp_request *req; @@ -438,7 +431,7 @@ int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, } EXPORT_SYMBOL_GPL(nvme_auth_gen_shared_secret); -int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key) +int nvme_auth_parse_key(const char *secret, struct nvme_dhchap_key **ret_key) { struct nvme_dhchap_key *key; u8 key_hash; @@ -461,7 +454,7 @@ int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key) *ret_key = key; return 0; } -EXPORT_SYMBOL_GPL(nvme_auth_generate_key); +EXPORT_SYMBOL_GPL(nvme_auth_parse_key); /** * nvme_auth_generate_psk - Generate a PSK for TLS @@ 
-486,66 +479,32 @@ EXPORT_SYMBOL_GPL(nvme_auth_generate_key); * Returns 0 on success with a valid generated PSK pointer in @ret_psk and * the length of @ret_psk in @ret_len, or a negative error number otherwise. */ -int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len, - u8 *c1, u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len) +int nvme_auth_generate_psk(u8 hmac_id, const u8 *skey, size_t skey_len, + const u8 *c1, const u8 *c2, size_t hash_len, + u8 **ret_psk, size_t *ret_len) { - struct crypto_shash *tfm; - SHASH_DESC_ON_STACK(shash, tfm); + size_t psk_len = nvme_auth_hmac_hash_len(hmac_id); + struct nvme_auth_hmac_ctx hmac; u8 *psk; - const char *hmac_name; - int ret, psk_len; + int ret; if (!c1 || !c2) return -EINVAL; - hmac_name = nvme_auth_hmac_name(hmac_id); - if (!hmac_name) { - pr_warn("%s: invalid hash algorithm %d\n", - __func__, hmac_id); - return -EINVAL; - } - - tfm = crypto_alloc_shash(hmac_name, 0, 0); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - psk_len = crypto_shash_digestsize(tfm); + ret = nvme_auth_hmac_init(&hmac, hmac_id, skey, skey_len); + if (ret) + return ret; psk = kzalloc(psk_len, GFP_KERNEL); if (!psk) { - ret = -ENOMEM; - goto out_free_tfm; + memzero_explicit(&hmac, sizeof(hmac)); + return -ENOMEM; } - - shash->tfm = tfm; - ret = crypto_shash_setkey(tfm, skey, skey_len); - if (ret) - goto out_free_psk; - - ret = crypto_shash_init(shash); - if (ret) - goto out_free_psk; - - ret = crypto_shash_update(shash, c1, hash_len); - if (ret) - goto out_free_psk; - - ret = crypto_shash_update(shash, c2, hash_len); - if (ret) - goto out_free_psk; - - ret = crypto_shash_final(shash, psk); - if (!ret) { - *ret_psk = psk; - *ret_len = psk_len; - } - -out_free_psk: - if (ret) - kfree_sensitive(psk); -out_free_tfm: - crypto_free_shash(tfm); - - return ret; + nvme_auth_hmac_update(&hmac, c1, hash_len); + nvme_auth_hmac_update(&hmac, c2, hash_len); + nvme_auth_hmac_final(&hmac, psk); + *ret_psk = psk; + *ret_len = psk_len; + return 0; } 
EXPORT_SYMBOL_GPL(nvme_auth_generate_psk); @@ -584,158 +543,70 @@ EXPORT_SYMBOL_GPL(nvme_auth_generate_psk); * Returns 0 on success with a valid digest pointer in @ret_digest, or a * negative error number on failure. */ -int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len, - char *subsysnqn, char *hostnqn, u8 **ret_digest) +int nvme_auth_generate_digest(u8 hmac_id, const u8 *psk, size_t psk_len, + const char *subsysnqn, const char *hostnqn, + char **ret_digest) { - struct crypto_shash *tfm; - SHASH_DESC_ON_STACK(shash, tfm); - u8 *digest, *enc; - const char *hmac_name; - size_t digest_len, hmac_len; + struct nvme_auth_hmac_ctx hmac; + u8 digest[NVME_AUTH_MAX_DIGEST_SIZE]; + size_t hash_len = nvme_auth_hmac_hash_len(hmac_id); + char *enc; + size_t enc_len; int ret; if (WARN_ON(!subsysnqn || !hostnqn)) return -EINVAL; - hmac_name = nvme_auth_hmac_name(hmac_id); - if (!hmac_name) { + if (hash_len == 0) { pr_warn("%s: invalid hash algorithm %d\n", __func__, hmac_id); return -EINVAL; } - switch (nvme_auth_hmac_hash_len(hmac_id)) { + switch (hash_len) { case 32: - hmac_len = 44; + enc_len = 44; break; case 48: - hmac_len = 64; + enc_len = 64; break; default: pr_warn("%s: invalid hash algorithm '%s'\n", - __func__, hmac_name); + __func__, nvme_auth_hmac_name(hmac_id)); return -EINVAL; } - enc = kzalloc(hmac_len + 1, GFP_KERNEL); - if (!enc) - return -ENOMEM; - - tfm = crypto_alloc_shash(hmac_name, 0, 0); - if (IS_ERR(tfm)) { - ret = PTR_ERR(tfm); - goto out_free_enc; - } - - digest_len = crypto_shash_digestsize(tfm); - digest = kzalloc(digest_len, GFP_KERNEL); - if (!digest) { + enc = kzalloc(enc_len + 1, GFP_KERNEL); + if (!enc) { ret = -ENOMEM; - goto out_free_tfm; + goto out; } - shash->tfm = tfm; - ret = crypto_shash_setkey(tfm, psk, psk_len); + ret = nvme_auth_hmac_init(&hmac, hmac_id, psk, psk_len); if (ret) - goto out_free_digest; + goto out; + nvme_auth_hmac_update(&hmac, hostnqn, strlen(hostnqn)); + nvme_auth_hmac_update(&hmac, " ", 1); + 
nvme_auth_hmac_update(&hmac, subsysnqn, strlen(subsysnqn)); + nvme_auth_hmac_update(&hmac, " NVMe-over-Fabrics", 18); + nvme_auth_hmac_final(&hmac, digest); - ret = crypto_shash_init(shash); - if (ret) - goto out_free_digest; - - ret = crypto_shash_update(shash, hostnqn, strlen(hostnqn)); - if (ret) - goto out_free_digest; - - ret = crypto_shash_update(shash, " ", 1); - if (ret) - goto out_free_digest; - - ret = crypto_shash_update(shash, subsysnqn, strlen(subsysnqn)); - if (ret) - goto out_free_digest; - - ret = crypto_shash_update(shash, " NVMe-over-Fabrics", 18); - if (ret) - goto out_free_digest; - - ret = crypto_shash_final(shash, digest); - if (ret) - goto out_free_digest; - - ret = base64_encode(digest, digest_len, enc, true, BASE64_STD); - if (ret < hmac_len) { + ret = base64_encode(digest, hash_len, enc, true, BASE64_STD); + if (ret < enc_len) { ret = -ENOKEY; - goto out_free_digest; + goto out; } *ret_digest = enc; ret = 0; -out_free_digest: - kfree_sensitive(digest); -out_free_tfm: - crypto_free_shash(tfm); -out_free_enc: +out: if (ret) kfree_sensitive(enc); - + memzero_explicit(digest, sizeof(digest)); return ret; } EXPORT_SYMBOL_GPL(nvme_auth_generate_digest); -/** - * hkdf_expand_label - HKDF-Expand-Label (RFC 8846 section 7.1) - * @hmac_tfm: hash context keyed with pseudorandom key - * @label: ASCII label without "tls13 " prefix - * @labellen: length of @label - * @context: context bytes - * @contextlen: length of @context - * @okm: output keying material - * @okmlen: length of @okm - * - * Build the TLS 1.3 HkdfLabel structure and invoke hkdf_expand(). - * - * Returns 0 on success with output keying material stored in @okm, - * or a negative errno value otherwise. 
- */ -static int hkdf_expand_label(struct crypto_shash *hmac_tfm, - const u8 *label, unsigned int labellen, - const u8 *context, unsigned int contextlen, - u8 *okm, unsigned int okmlen) -{ - int err; - u8 *info; - unsigned int infolen; - const char *tls13_prefix = "tls13 "; - unsigned int prefixlen = strlen(tls13_prefix); - - if (WARN_ON(labellen > (255 - prefixlen))) - return -EINVAL; - if (WARN_ON(contextlen > 255)) - return -EINVAL; - - infolen = 2 + (1 + prefixlen + labellen) + (1 + contextlen); - info = kzalloc(infolen, GFP_KERNEL); - if (!info) - return -ENOMEM; - - /* HkdfLabel.Length */ - put_unaligned_be16(okmlen, info); - - /* HkdfLabel.Label */ - info[2] = prefixlen + labellen; - memcpy(info + 3, tls13_prefix, prefixlen); - memcpy(info + 3 + prefixlen, label, labellen); - - /* HkdfLabel.Context */ - info[3 + prefixlen + labellen] = contextlen; - memcpy(info + 4 + prefixlen + labellen, context, contextlen); - - err = hkdf_expand(hmac_tfm, info, infolen, okm, okmlen); - kfree_sensitive(info); - return err; -} - /** * nvme_auth_derive_tls_psk - Derive TLS PSK * @hmac_id: Hash function identifier @@ -763,82 +634,92 @@ static int hkdf_expand_label(struct crypto_shash *hmac_tfm, * Returns 0 on success with a valid psk pointer in @ret_psk or a negative * error number otherwise. 
*/ -int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len, - u8 *psk_digest, u8 **ret_psk) +int nvme_auth_derive_tls_psk(int hmac_id, const u8 *psk, size_t psk_len, + const char *psk_digest, u8 **ret_psk) { - struct crypto_shash *hmac_tfm; - const char *hmac_name; - const char *label = "nvme-tls-psk"; - static const char default_salt[HKDF_MAX_HASHLEN]; - size_t prk_len; - const char *ctx; - unsigned char *prk, *tls_key; + static const u8 default_salt[NVME_AUTH_MAX_DIGEST_SIZE]; + static const char label[] = "tls13 nvme-tls-psk"; + const size_t label_len = sizeof(label) - 1; + u8 prk[NVME_AUTH_MAX_DIGEST_SIZE]; + size_t hash_len, ctx_len; + u8 *hmac_data = NULL, *tls_key; + size_t i; int ret; - hmac_name = nvme_auth_hmac_name(hmac_id); - if (!hmac_name) { + hash_len = nvme_auth_hmac_hash_len(hmac_id); + if (hash_len == 0) { pr_warn("%s: invalid hash algorithm %d\n", __func__, hmac_id); return -EINVAL; } if (hmac_id == NVME_AUTH_HASH_SHA512) { pr_warn("%s: unsupported hash algorithm %s\n", - __func__, hmac_name); + __func__, nvme_auth_hmac_name(hmac_id)); return -EINVAL; } - hmac_tfm = crypto_alloc_shash(hmac_name, 0, 0); - if (IS_ERR(hmac_tfm)) - return PTR_ERR(hmac_tfm); - - prk_len = crypto_shash_digestsize(hmac_tfm); - prk = kzalloc(prk_len, GFP_KERNEL); - if (!prk) { - ret = -ENOMEM; - goto out_free_shash; + if (psk_len != hash_len) { + pr_warn("%s: unexpected psk_len %zu\n", __func__, psk_len); + return -EINVAL; } - if (WARN_ON(prk_len > HKDF_MAX_HASHLEN)) { + /* HKDF-Extract */ + ret = nvme_auth_hmac(hmac_id, default_salt, hash_len, psk, psk_len, + prk); + if (ret) + goto out; + + /* + * HKDF-Expand-Label (RFC 8446 section 7.1), with output length equal to + * the hash length (so only a single HMAC operation is needed) + */ + + hmac_data = kmalloc(/* output length */ 2 + + /* label */ 1 + label_len + + /* context (max) */ 1 + 3 + 1 + strlen(psk_digest) + + /* counter */ 1, + GFP_KERNEL); + if (!hmac_data) { + ret = -ENOMEM; + goto out; + } + /* 
output length */ + i = 0; + hmac_data[i++] = hash_len >> 8; + hmac_data[i++] = hash_len; + + /* label */ + static_assert(label_len <= 255); + hmac_data[i] = label_len; + memcpy(&hmac_data[i + 1], label, label_len); + i += 1 + label_len; + + /* context */ + ctx_len = sprintf(&hmac_data[i + 1], "%02d %s", hmac_id, psk_digest); + if (ctx_len > 255) { ret = -EINVAL; - goto out_free_prk; + goto out; } - ret = hkdf_extract(hmac_tfm, psk, psk_len, - default_salt, prk_len, prk); - if (ret) - goto out_free_prk; + hmac_data[i] = ctx_len; + i += 1 + ctx_len; - ret = crypto_shash_setkey(hmac_tfm, prk, prk_len); - if (ret) - goto out_free_prk; - - ctx = kasprintf(GFP_KERNEL, "%02d %s", hmac_id, psk_digest); - if (!ctx) { - ret = -ENOMEM; - goto out_free_prk; - } + /* counter (this overwrites the NUL terminator written by sprintf) */ + hmac_data[i++] = 1; tls_key = kzalloc(psk_len, GFP_KERNEL); if (!tls_key) { ret = -ENOMEM; - goto out_free_ctx; + goto out; } - ret = hkdf_expand_label(hmac_tfm, - label, strlen(label), - ctx, strlen(ctx), - tls_key, psk_len); + ret = nvme_auth_hmac(hmac_id, prk, hash_len, hmac_data, i, tls_key); if (ret) { - kfree(tls_key); - goto out_free_ctx; + kfree_sensitive(tls_key); + goto out; } *ret_psk = tls_key; - -out_free_ctx: - kfree(ctx); -out_free_prk: - kfree(prk); -out_free_shash: - crypto_free_shash(hmac_tfm); - +out: + kfree_sensitive(hmac_data); + memzero_explicit(prk, sizeof(prk)); return ret; } EXPORT_SYMBOL_GPL(nvme_auth_derive_tls_psk); diff --git a/drivers/nvme/common/tests/auth_kunit.c b/drivers/nvme/common/tests/auth_kunit.c new file mode 100644 index 000000000000..28b8dd1e3b18 --- /dev/null +++ b/drivers/nvme/common/tests/auth_kunit.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Unit tests for NVMe authentication functions + * + * Copyright 2026 Google LLC + */ + +#include +#include +#include +#include +#include + +struct nvme_auth_test_values { + u8 hmac_id; + size_t hash_len; + u8 
expected_psk[NVME_AUTH_MAX_DIGEST_SIZE]; + char *expected_psk_digest; + u8 expected_tls_psk[NVME_AUTH_MAX_DIGEST_SIZE]; +}; + +static void kfree_action(void *ptr) +{ + kfree(ptr); +} + +static void kunit_add_kfree_action(struct kunit *test, void *ptr) +{ + KUNIT_ASSERT_EQ(test, 0, + kunit_add_action_or_reset(test, kfree_action, ptr)); +} + +/* + * Test the derivation of a TLS PSK from the initial skey. The vals parameter + * gives the expected value of tls_psk as well as the intermediate values psk + * and psk_digest. The inputs are implicitly the fixed values set below. + */ +static void +test_nvme_auth_derive_tls_psk(struct kunit *test, + const struct nvme_auth_test_values *vals) +{ + const u8 hmac_id = vals->hmac_id; + const size_t hash_len = vals->hash_len; + const size_t skey_len = hash_len; + u8 skey[NVME_AUTH_MAX_DIGEST_SIZE]; + u8 c1[NVME_AUTH_MAX_DIGEST_SIZE]; + u8 c2[NVME_AUTH_MAX_DIGEST_SIZE]; + const char *subsysnqn = "subsysnqn"; + const char *hostnqn = "hostnqn"; + u8 *psk = NULL, *tls_psk = NULL; + char *psk_digest = NULL; + size_t psk_len; + int ret; + + for (int i = 0; i < NVME_AUTH_MAX_DIGEST_SIZE; i++) { + skey[i] = 'A' + i; + c1[i] = i; + c2[i] = 0xff - i; + } + + ret = nvme_auth_generate_psk(hmac_id, skey, skey_len, c1, c2, hash_len, + &psk, &psk_len); + kunit_add_kfree_action(test, psk); + KUNIT_ASSERT_EQ(test, 0, ret); + KUNIT_ASSERT_EQ(test, hash_len, psk_len); + KUNIT_ASSERT_MEMEQ(test, vals->expected_psk, psk, psk_len); + + ret = nvme_auth_generate_digest(hmac_id, psk, psk_len, subsysnqn, + hostnqn, &psk_digest); + kunit_add_kfree_action(test, psk_digest); + if (vals->expected_psk_digest == NULL) { + /* + * Algorithm has an ID assigned but is not supported by + * nvme_auth_generate_digest(). 
+ */ + KUNIT_ASSERT_EQ(test, -EINVAL, ret); + return; + } + KUNIT_ASSERT_EQ(test, 0, ret); + KUNIT_ASSERT_STREQ(test, vals->expected_psk_digest, psk_digest); + + ret = nvme_auth_derive_tls_psk(hmac_id, psk, psk_len, psk_digest, + &tls_psk); + kunit_add_kfree_action(test, tls_psk); + KUNIT_ASSERT_EQ(test, 0, ret); + KUNIT_ASSERT_MEMEQ(test, vals->expected_tls_psk, tls_psk, psk_len); +} + +static void test_nvme_auth_derive_tls_psk_hmac_sha256(struct kunit *test) +{ + static const struct nvme_auth_test_values vals = { + .hmac_id = NVME_AUTH_HASH_SHA256, + .hash_len = SHA256_DIGEST_SIZE, + .expected_psk = { + 0x17, 0x33, 0xc5, 0x9f, 0xa7, 0xf4, 0x8f, 0xcf, + 0x37, 0xf5, 0xf2, 0x6f, 0xc4, 0xff, 0x02, 0x68, + 0xad, 0x4f, 0x78, 0xe0, 0x30, 0xf4, 0xf3, 0xb0, + 0xbf, 0xd1, 0xd4, 0x7e, 0x7b, 0xb1, 0x44, 0x7a, + }, + .expected_psk_digest = "OldoKuTfKddMuyCznAZojkWD7P4D9/AtzDzLimtOxqI=", + .expected_tls_psk = { + 0x3c, 0x17, 0xda, 0x62, 0x84, 0x74, 0xa0, 0x4d, + 0x22, 0x47, 0xc4, 0xca, 0xb4, 0x79, 0x68, 0xc9, + 0x15, 0x38, 0x81, 0x93, 0xf7, 0xc0, 0x71, 0xbd, + 0x94, 0x89, 0xcc, 0x36, 0x66, 0xcd, 0x7c, 0xc8, + }, + }; + + test_nvme_auth_derive_tls_psk(test, &vals); +} + +static void test_nvme_auth_derive_tls_psk_hmac_sha384(struct kunit *test) +{ + static const struct nvme_auth_test_values vals = { + .hmac_id = NVME_AUTH_HASH_SHA384, + .hash_len = SHA384_DIGEST_SIZE, + .expected_psk = { + 0xf1, 0x4b, 0x2d, 0xd3, 0x23, 0x4c, 0x45, 0x96, + 0x94, 0xd3, 0xbc, 0x63, 0xf8, 0x96, 0x8b, 0xd6, + 0xb3, 0x7c, 0x2c, 0x6d, 0xe8, 0x49, 0xe2, 0x2e, + 0x11, 0x87, 0x49, 0x00, 0x1c, 0xe4, 0xbb, 0xe8, + 0x64, 0x0b, 0x9e, 0x3a, 0x74, 0x8c, 0xb1, 0x1c, + 0xe4, 0xb1, 0xd7, 0x1d, 0x35, 0x9c, 0xce, 0x39, + }, + .expected_psk_digest = "cffMWk8TSS7HOQebjgYEIkrPrjWPV4JE5cdPB8WhEvY4JBW5YynKyv66XscN4A9n", + .expected_tls_psk = { + 0x27, 0x74, 0x75, 0x32, 0x33, 0x53, 0x7b, 0x3f, + 0xa5, 0x0e, 0xb7, 0xd1, 0x6a, 0x8e, 0x43, 0x45, + 0x7d, 0x85, 0xf4, 0x90, 0x6c, 0x00, 0x5b, 0x22, + 0x36, 0x61, 0x6c, 0x5d, 
0x80, 0x93, 0x9d, 0x08, + 0x98, 0xff, 0xf1, 0x5b, 0xb8, 0xb7, 0x71, 0x19, + 0xd2, 0xbe, 0x0a, 0xac, 0x42, 0x3e, 0x75, 0x90, + }, + }; + + test_nvme_auth_derive_tls_psk(test, &vals); +} + +static void test_nvme_auth_derive_tls_psk_hmac_sha512(struct kunit *test) +{ + static const struct nvme_auth_test_values vals = { + .hmac_id = NVME_AUTH_HASH_SHA512, + .hash_len = SHA512_DIGEST_SIZE, + .expected_psk = { + 0x9c, 0x9f, 0x08, 0x9a, 0x61, 0x8b, 0x47, 0xd2, + 0xd7, 0x5f, 0x4b, 0x6c, 0x28, 0x07, 0x04, 0x24, + 0x48, 0x7b, 0x44, 0x5d, 0xd9, 0x6e, 0x70, 0xc4, + 0xc0, 0x9b, 0x55, 0xe8, 0xb6, 0x00, 0x01, 0x52, + 0xa3, 0x36, 0x3c, 0x34, 0x54, 0x04, 0x3f, 0x38, + 0xf0, 0xb8, 0x50, 0x36, 0xde, 0xd4, 0x06, 0x55, + 0x35, 0x0a, 0xa8, 0x7b, 0x8b, 0x6a, 0x28, 0x2b, + 0x5c, 0x1a, 0xca, 0xe1, 0x62, 0x33, 0xdd, 0x5b, + }, + /* nvme_auth_generate_digest() doesn't support SHA-512 yet. */ + .expected_psk_digest = NULL, + }; + + test_nvme_auth_derive_tls_psk(test, &vals); +} + +static struct kunit_case nvme_auth_test_cases[] = { + KUNIT_CASE(test_nvme_auth_derive_tls_psk_hmac_sha256), + KUNIT_CASE(test_nvme_auth_derive_tls_psk_hmac_sha384), + KUNIT_CASE(test_nvme_auth_derive_tls_psk_hmac_sha512), + {}, +}; + +static struct kunit_suite nvme_auth_test_suite = { + .name = "nvme-auth", + .test_cases = nvme_auth_test_cases, +}; +kunit_test_suite(nvme_auth_test_suite); + +MODULE_DESCRIPTION("Unit tests for NVMe authentication functions"); +MODULE_LICENSE("GPL"); diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 405e7c03b1cf..bbedbe181c8a 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include "nvme.h" #include "fabrics.h" @@ -22,7 +21,6 @@ struct nvme_dhchap_queue_context { struct list_head entry; struct work_struct auth_work; struct nvme_ctrl *ctrl; - struct crypto_shash *shash_tfm; struct crypto_kpp *dh_tfm; struct nvme_dhchap_key *transformed_key; void *buf; @@ -38,9 +36,9 @@ struct 
nvme_dhchap_queue_context { u8 hash_id; u8 sc_c; size_t hash_len; - u8 c1[64]; - u8 c2[64]; - u8 response[64]; + u8 c1[NVME_AUTH_MAX_DIGEST_SIZE]; + u8 c2[NVME_AUTH_MAX_DIGEST_SIZE]; + u8 response[NVME_AUTH_MAX_DIGEST_SIZE]; u8 *ctrl_key; u8 *host_key; u8 *sess_key; @@ -125,6 +123,8 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl, { struct nvmf_auth_dhchap_negotiate_data *data = chap->buf; size_t size = sizeof(*data) + sizeof(union nvmf_auth_protocol); + u8 dh_list_offset = NVME_AUTH_DHCHAP_MAX_DH_IDS; + u8 *idlist = data->auth_protocol[0].dhchap.idlist; if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; @@ -141,21 +141,22 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl, data->sc_c = NVME_AUTH_SECP_NEWTLSPSK; } else data->sc_c = NVME_AUTH_SECP_NOSC; + chap->sc_c = data->sc_c; data->napd = 1; data->auth_protocol[0].dhchap.authid = NVME_AUTH_DHCHAP_AUTH_ID; data->auth_protocol[0].dhchap.halen = 3; - data->auth_protocol[0].dhchap.dhlen = 6; - data->auth_protocol[0].dhchap.idlist[0] = NVME_AUTH_HASH_SHA256; - data->auth_protocol[0].dhchap.idlist[1] = NVME_AUTH_HASH_SHA384; - data->auth_protocol[0].dhchap.idlist[2] = NVME_AUTH_HASH_SHA512; - data->auth_protocol[0].dhchap.idlist[30] = NVME_AUTH_DHGROUP_NULL; - data->auth_protocol[0].dhchap.idlist[31] = NVME_AUTH_DHGROUP_2048; - data->auth_protocol[0].dhchap.idlist[32] = NVME_AUTH_DHGROUP_3072; - data->auth_protocol[0].dhchap.idlist[33] = NVME_AUTH_DHGROUP_4096; - data->auth_protocol[0].dhchap.idlist[34] = NVME_AUTH_DHGROUP_6144; - data->auth_protocol[0].dhchap.idlist[35] = NVME_AUTH_DHGROUP_8192; - - chap->sc_c = data->sc_c; + idlist[0] = NVME_AUTH_HASH_SHA256; + idlist[1] = NVME_AUTH_HASH_SHA384; + idlist[2] = NVME_AUTH_HASH_SHA512; + if (chap->sc_c == NVME_AUTH_SECP_NOSC) + idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_NULL; + idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_2048; + idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_3072; + 
idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_4096; + idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_6144; + idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_8192; + data->auth_protocol[0].dhchap.dhlen = + dh_list_offset - NVME_AUTH_DHCHAP_MAX_DH_IDS; return size; } @@ -183,38 +184,17 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, return -EPROTO; } - if (chap->hash_id == data->hashid && chap->shash_tfm && - !strcmp(crypto_shash_alg_name(chap->shash_tfm), hmac_name) && - crypto_shash_digestsize(chap->shash_tfm) == data->hl) { + if (chap->hash_id == data->hashid && chap->hash_len == data->hl) { dev_dbg(ctrl->device, "qid %d: reuse existing hash %s\n", chap->qid, hmac_name); goto select_kpp; } - /* Reset if hash cannot be reused */ - if (chap->shash_tfm) { - crypto_free_shash(chap->shash_tfm); - chap->hash_id = 0; - chap->hash_len = 0; - } - chap->shash_tfm = crypto_alloc_shash(hmac_name, 0, - CRYPTO_ALG_ALLOCATES_MEMORY); - if (IS_ERR(chap->shash_tfm)) { - dev_warn(ctrl->device, - "qid %d: failed to allocate hash %s, error %ld\n", - chap->qid, hmac_name, PTR_ERR(chap->shash_tfm)); - chap->shash_tfm = NULL; - chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED; - return -ENOMEM; - } - - if (crypto_shash_digestsize(chap->shash_tfm) != data->hl) { + if (nvme_auth_hmac_hash_len(data->hashid) != data->hl) { dev_warn(ctrl->device, "qid %d: invalid hash length %d\n", chap->qid, data->hl); - crypto_free_shash(chap->shash_tfm); - chap->shash_tfm = NULL; chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; return -EPROTO; } @@ -434,7 +414,7 @@ static int nvme_auth_set_dhchap_failure2_data(struct nvme_ctrl *ctrl, static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl, struct nvme_dhchap_queue_context *chap) { - SHASH_DESC_ON_STACK(shash, chap->shash_tfm); + struct nvme_auth_hmac_ctx hmac; u8 buf[4], *challenge = chap->c1; int ret; @@ -454,13 +434,11 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl, __func__, chap->qid); } 
- ret = crypto_shash_setkey(chap->shash_tfm, - chap->transformed_key->key, chap->transformed_key->len); - if (ret) { - dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n", - chap->qid, ret); + ret = nvme_auth_hmac_init(&hmac, chap->hash_id, + chap->transformed_key->key, + chap->transformed_key->len); + if (ret) goto out; - } if (chap->dh_tfm) { challenge = kmalloc(chap->hash_len, GFP_KERNEL); @@ -477,51 +455,36 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl, goto out; } - shash->tfm = chap->shash_tfm; - ret = crypto_shash_init(shash); - if (ret) - goto out; - ret = crypto_shash_update(shash, challenge, chap->hash_len); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, challenge, chap->hash_len); + put_unaligned_le32(chap->s1, buf); - ret = crypto_shash_update(shash, buf, 4); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 4); + put_unaligned_le16(chap->transaction, buf); - ret = crypto_shash_update(shash, buf, 2); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 2); + *buf = chap->sc_c; - ret = crypto_shash_update(shash, buf, 1); - if (ret) - goto out; - ret = crypto_shash_update(shash, "HostHost", 8); - if (ret) - goto out; - ret = crypto_shash_update(shash, ctrl->opts->host->nqn, - strlen(ctrl->opts->host->nqn)); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 1); + nvme_auth_hmac_update(&hmac, "HostHost", 8); + nvme_auth_hmac_update(&hmac, ctrl->opts->host->nqn, + strlen(ctrl->opts->host->nqn)); memset(buf, 0, sizeof(buf)); - ret = crypto_shash_update(shash, buf, 1); - if (ret) - goto out; - ret = crypto_shash_update(shash, ctrl->opts->subsysnqn, - strlen(ctrl->opts->subsysnqn)); - if (ret) - goto out; - ret = crypto_shash_final(shash, chap->response); + nvme_auth_hmac_update(&hmac, buf, 1); + nvme_auth_hmac_update(&hmac, ctrl->opts->subsysnqn, + strlen(ctrl->opts->subsysnqn)); + nvme_auth_hmac_final(&hmac, chap->response); + ret = 0; out: if (challenge != chap->c1) kfree(challenge); + 
memzero_explicit(&hmac, sizeof(hmac)); return ret; } static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl, struct nvme_dhchap_queue_context *chap) { - SHASH_DESC_ON_STACK(shash, chap->shash_tfm); + struct nvme_auth_hmac_ctx hmac; struct nvme_dhchap_key *transformed_key; u8 buf[4], *challenge = chap->c2; int ret; @@ -533,10 +496,10 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl, return ret; } - ret = crypto_shash_setkey(chap->shash_tfm, - transformed_key->key, transformed_key->len); + ret = nvme_auth_hmac_init(&hmac, chap->hash_id, transformed_key->key, + transformed_key->len); if (ret) { - dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n", + dev_warn(ctrl->device, "qid %d: failed to init hmac, error %d\n", chap->qid, ret); goto out; } @@ -563,43 +526,29 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl, __func__, chap->qid, ctrl->opts->subsysnqn); dev_dbg(ctrl->device, "%s: qid %d hostnqn %s\n", __func__, chap->qid, ctrl->opts->host->nqn); - shash->tfm = chap->shash_tfm; - ret = crypto_shash_init(shash); - if (ret) - goto out; - ret = crypto_shash_update(shash, challenge, chap->hash_len); - if (ret) - goto out; + + nvme_auth_hmac_update(&hmac, challenge, chap->hash_len); + put_unaligned_le32(chap->s2, buf); - ret = crypto_shash_update(shash, buf, 4); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 4); + put_unaligned_le16(chap->transaction, buf); - ret = crypto_shash_update(shash, buf, 2); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 2); + memset(buf, 0, 4); - ret = crypto_shash_update(shash, buf, 1); - if (ret) - goto out; - ret = crypto_shash_update(shash, "Controller", 10); - if (ret) - goto out; - ret = crypto_shash_update(shash, ctrl->opts->subsysnqn, - strlen(ctrl->opts->subsysnqn)); - if (ret) - goto out; - ret = crypto_shash_update(shash, buf, 1); - if (ret) - goto out; - ret = crypto_shash_update(shash, ctrl->opts->host->nqn, - 
strlen(ctrl->opts->host->nqn)); - if (ret) - goto out; - ret = crypto_shash_final(shash, chap->response); + nvme_auth_hmac_update(&hmac, buf, 1); + nvme_auth_hmac_update(&hmac, "Controller", 10); + nvme_auth_hmac_update(&hmac, ctrl->opts->subsysnqn, + strlen(ctrl->opts->subsysnqn)); + nvme_auth_hmac_update(&hmac, buf, 1); + nvme_auth_hmac_update(&hmac, ctrl->opts->host->nqn, + strlen(ctrl->opts->host->nqn)); + nvme_auth_hmac_final(&hmac, chap->response); + ret = 0; out: if (challenge != chap->c2) kfree(challenge); + memzero_explicit(&hmac, sizeof(hmac)); nvme_auth_free_key(transformed_key); return ret; } @@ -689,8 +638,6 @@ static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) { nvme_auth_reset_dhchap(chap); chap->authenticated = false; - if (chap->shash_tfm) - crypto_free_shash(chap->shash_tfm); if (chap->dh_tfm) crypto_free_kpp(chap->dh_tfm); } @@ -708,7 +655,8 @@ EXPORT_SYMBOL_GPL(nvme_auth_revoke_tls_key); static int nvme_auth_secure_concat(struct nvme_ctrl *ctrl, struct nvme_dhchap_queue_context *chap) { - u8 *psk, *digest, *tls_psk; + u8 *psk, *tls_psk; + char *digest; struct key *tls_key; size_t psk_len; int ret = 0; @@ -1071,12 +1019,11 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) INIT_WORK(&ctrl->dhchap_auth_work, nvme_ctrl_auth_work); if (!ctrl->opts) return 0; - ret = nvme_auth_generate_key(ctrl->opts->dhchap_secret, - &ctrl->host_key); + ret = nvme_auth_parse_key(ctrl->opts->dhchap_secret, &ctrl->host_key); if (ret) return ret; - ret = nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, - &ctrl->ctrl_key); + ret = nvme_auth_parse_key(ctrl->opts->dhchap_ctrl_secret, + &ctrl->ctrl_key); if (ret) goto err_free_dhchap_secret; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 766e9cc4ffca..1e33af94c24b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1875,6 +1875,7 @@ static bool nvme_init_integrity(struct nvme_ns_head *head, break; } + bi->flags |= BLK_SPLIT_INTERVAL_CAPABLE; 
bi->metadata_size = head->ms; if (bi->csum_type) { bi->pi_tuple_size = head->pi_size; @@ -1883,26 +1884,6 @@ static bool nvme_init_integrity(struct nvme_ns_head *head, return true; } -static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim) -{ - struct nvme_ctrl *ctrl = ns->ctrl; - - if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX)) - lim->max_hw_discard_sectors = - nvme_lba_to_sect(ns->head, ctrl->dmrsl); - else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) - lim->max_hw_discard_sectors = UINT_MAX; - else - lim->max_hw_discard_sectors = 0; - - lim->discard_granularity = lim->logical_block_size; - - if (ctrl->dmrl) - lim->max_discard_segments = ctrl->dmrl; - else - lim->max_discard_segments = NVME_DSM_MAX_RANGES; -} - static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) { return uuid_equal(&a->uuid, &b->uuid) && @@ -2078,12 +2059,15 @@ static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl, } static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, - struct queue_limits *lim) + struct nvme_id_ns_nvm *nvm, struct queue_limits *lim) { struct nvme_ns_head *head = ns->head; + struct nvme_ctrl *ctrl = ns->ctrl; u32 bs = 1U << head->lba_shift; u32 atomic_bs, phys_bs, io_opt = 0; + u32 npdg = 1, npda = 1; bool valid = true; + u8 optperf; /* * The block layer can't support LBA sizes larger than the page size @@ -2098,7 +2082,12 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, phys_bs = bs; atomic_bs = nvme_configure_atomic_write(ns, id, lim, bs); - if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { + optperf = id->nsfeat >> NVME_NS_FEAT_OPTPERF_SHIFT; + if (ctrl->vs >= NVME_VS(2, 1, 0)) + optperf &= NVME_NS_FEAT_OPTPERF_MASK_2_1; + else + optperf &= NVME_NS_FEAT_OPTPERF_MASK; + if (optperf) { /* NPWG = Namespace Preferred Write Granularity */ phys_bs = bs * (1 + le16_to_cpu(id->npwg)); /* NOWS = Namespace Optimal Write Size */ @@ -2115,11 +2104,54 @@ static bool 
nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, lim->physical_block_size = min(phys_bs, atomic_bs); lim->io_min = phys_bs; lim->io_opt = io_opt; - if ((ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) && - (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)) + if ((ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) && + (ctrl->oncs & NVME_CTRL_ONCS_DSM)) lim->max_write_zeroes_sectors = UINT_MAX; else - lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors; + lim->max_write_zeroes_sectors = ctrl->max_zeroes_sectors; + + if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX)) + lim->max_hw_discard_sectors = + nvme_lba_to_sect(ns->head, ctrl->dmrsl); + else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) + lim->max_hw_discard_sectors = UINT_MAX; + else + lim->max_hw_discard_sectors = 0; + + /* + * NVMe namespaces advertise both a preferred deallocate granularity + * (for a discard length) and alignment (for a discard starting offset). + * However, Linux block devices advertise a single discard_granularity. + * From NVM Command Set specification 1.1 section 5.2.2, the NPDGL/NPDAL + * fields in the NVM Command Set Specific Identify Namespace structure + * are preferred to NPDG/NPDA in the Identify Namespace structure since + * they can represent larger values. However, NPDGL or NPDAL may be 0 if + * unsupported. NPDG and NPDA are 0's based. + * From Figure 115 of NVM Command Set specification 1.1, NPDGL and NPDAL + * are supported if the high bit of OPTPERF is set. NPDG is supported if + * the low bit of OPTPERF is set. NPDA is supported if either is set. + * NPDG should be a multiple of NPDA, and likewise NPDGL should be a + * multiple of NPDAL, but the spec doesn't say anything about NPDG vs. + * NPDAL or NPDGL vs. NPDA. So compute the maximum instead of assuming + * NPDG(L) is the larger. If neither NPDG, NPDGL, NPDA, nor NPDAL are + * supported, default the discard_granularity to the logical block size. 
+ */ + if (optperf & 0x2 && nvm && nvm->npdgl) + npdg = le32_to_cpu(nvm->npdgl); + else if (optperf & 0x1) + npdg = from0based(id->npdg); + if (optperf & 0x2 && nvm && nvm->npdal) + npda = le32_to_cpu(nvm->npdal); + else if (optperf) + npda = from0based(id->npda); + if (check_mul_overflow(max(npdg, npda), lim->logical_block_size, + &lim->discard_granularity)) + lim->discard_granularity = lim->logical_block_size; + + if (ctrl->dmrl) + lim->max_discard_segments = ctrl->dmrl; + else + lim->max_discard_segments = NVME_DSM_MAX_RANGES; return valid; } @@ -2353,7 +2385,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, } lbaf = nvme_lbaf_index(id->flbas); - if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) { + if (nvme_id_cns_ok(ns->ctrl, NVME_ID_CNS_CS_NS)) { ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm); if (ret < 0) goto out; @@ -2381,10 +2413,9 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, nvme_set_ctrl_limits(ns->ctrl, &lim, false); nvme_configure_metadata(ns->ctrl, ns->head, id, nvm, info); nvme_set_chunk_sectors(ns, id, &lim); - if (!nvme_update_disk_info(ns, id, &lim)) + if (!nvme_update_disk_info(ns, id, nvm, &lim)) capacity = 0; - nvme_config_discard(ns, &lim); if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && ns->head->ids.csi == NVME_CSI_ZNS) nvme_update_zone_info(ns, &lim, &zi); @@ -3388,7 +3419,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) ctrl->dmrl = id->dmrl; ctrl->dmrsl = le32_to_cpu(id->dmrsl); - if (id->wzsl) + if (id->wzsl && !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)) ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl); free_data: diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index fc6800a9f7f9..ba00f0b72b85 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -154,21 +154,8 @@ void nvme_failover_req(struct request *req) } spin_lock_irqsave(&ns->head->requeue_lock, flags); - for (bio = req->bio; bio; bio = bio->bi_next) { + for (bio = 
req->bio; bio; bio = bio->bi_next) bio_set_dev(bio, ns->head->disk->part0); - if (bio->bi_opf & REQ_POLLED) { - bio->bi_opf &= ~REQ_POLLED; - bio->bi_cookie = BLK_QC_T_NONE; - } - /* - * The alternate request queue that we may end up submitting - * the bio to may be frozen temporarily, in this case REQ_NOWAIT - * will fail the I/O immediately with EAGAIN to the issuer. - * We are not in the issuer context which cannot block. Clear - * the flag to avoid spurious EAGAIN I/O failures. - */ - bio->bi_opf &= ~REQ_NOWAIT; - } blk_steal_bios(&ns->head->requeue_list, req); spin_unlock_irqrestore(&ns->head->requeue_lock, flags); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9971045dbc05..ccd5e05dac98 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -762,6 +762,12 @@ static inline u32 nvme_bytes_to_numd(size_t len) return (len >> 2) - 1; } +/* Decode a 2-byte "0's based"/"0-based" field */ +static inline u32 from0based(__le16 value) +{ + return (u32)le16_to_cpu(value) + 1; +} + static inline bool nvme_is_ana_error(u16 status) { switch (status & NVME_SCT_SC_MASK) { diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b78ba239c8ea..db5fc9bf6627 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -4178,6 +4178,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x2646, 0x501E), /* KINGSTON OM3PGP4xxxxQ OS21011 NVMe SSD */ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x2646, 0x502F), /* KINGSTON OM3SGP4xxxxK NVMe SSD */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x1f40, 0x1202), /* Netac Technologies Co. NV3000 NVMe SSD */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1f40, 0x5236), /* Netac Technologies Co. 
NV7000 NVMe SSD */ diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 16c6fea4b2db..7bf2e972126b 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -658,7 +658,7 @@ static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev, struct nvme_dhchap_key *key, *host_key; int ret; - ret = nvme_auth_generate_key(dhchap_secret, &key); + ret = nvme_auth_parse_key(dhchap_secret, &key); if (ret) { kfree(dhchap_secret); return ret; @@ -716,7 +716,7 @@ static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev, struct nvme_dhchap_key *key, *ctrl_key; int ret; - ret = nvme_auth_generate_key(dhchap_secret, &key); + ret = nvme_auth_parse_key(dhchap_secret, &key); if (ret) { kfree(dhchap_secret); return ret; @@ -829,7 +829,49 @@ static ssize_t tls_configured_key_show(struct device *dev, return sysfs_emit(buf, "%08x\n", key_serial(key)); } -static DEVICE_ATTR_RO(tls_configured_key); + +static ssize_t tls_configured_key_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + int error, qid; + + error = kstrtoint(buf, 10, &qid); + if (error) + return error; + + /* + * We currently only allow userspace to write a `0` indicating + * generate a new key. + */ + if (qid) + return -EINVAL; + + if (!ctrl->opts || !ctrl->opts->concat) + return -EOPNOTSUPP; + + error = nvme_auth_negotiate(ctrl, 0); + if (error < 0) { + nvme_reset_ctrl(ctrl); + return error; + } + + error = nvme_auth_wait(ctrl, 0); + if (error < 0) { + nvme_reset_ctrl(ctrl); + return error; + } + + /* + * We need to reset the TLS connection, so let's just + * reset the controller. 
+ */ + nvme_reset_ctrl(ctrl); + + return count; +} +static DEVICE_ATTR_RW(tls_configured_key); static ssize_t tls_keyring_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -861,7 +903,7 @@ static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj, !ctrl->opts->tls && !ctrl->opts->concat) return 0; if (a == &dev_attr_tls_configured_key.attr && - (!ctrl->opts->tls_key || ctrl->opts->concat)) + !ctrl->opts->concat) return 0; if (a == &dev_attr_tls_keyring.attr && !ctrl->opts->keyring) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index ca5b08ce1211..e4fd1caadfb0 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -1057,6 +1057,8 @@ static void nvme_execute_identify_ns_nvm(struct nvmet_req *req) status = NVME_SC_INTERNAL; goto out; } + if (req->ns->bdev) + nvmet_bdev_set_nvm_limits(req->ns->bdev, id); status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); kfree(id); out: @@ -1603,7 +1605,7 @@ void nvmet_execute_keep_alive(struct nvmet_req *req) pr_debug("ctrl %d update keep-alive timer for %d secs\n", ctrl->cntlid, ctrl->kato); - mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ); + mod_delayed_work(system_percpu_wq, &ctrl->ka_work, ctrl->kato * HZ); out: nvmet_req_complete(req, status); } diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index 2eadeb7e06f2..b34610e2f19d 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -45,15 +44,6 @@ int nvmet_auth_set_key(struct nvmet_host *host, const char *secret, key_hash); return -EINVAL; } - if (key_hash > 0) { - /* Validate selected hash algorithm */ - const char *hmac = nvme_auth_hmac_name(key_hash); - - if (!crypto_has_shash(hmac, 0, 0)) { - pr_err("DH-HMAC-CHAP hash %s unsupported\n", hmac); - return -ENOTSUPP; - } - } dhchap_secret = kstrdup(secret, GFP_KERNEL); if (!dhchap_secret) return -ENOMEM; @@ 
-140,7 +130,7 @@ int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id) return ret; } -u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq) +u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, bool reset) { int ret = 0; struct nvmet_host_link *p; @@ -166,7 +156,7 @@ u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq) goto out_unlock; } - if (nvmet_queue_tls_keyid(sq)) { + if (!reset && nvmet_queue_tls_keyid(sq)) { pr_debug("host %s tls enabled\n", ctrl->hostnqn); goto out_unlock; } @@ -292,47 +282,30 @@ bool nvmet_check_auth_status(struct nvmet_req *req) int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, unsigned int shash_len) { - struct crypto_shash *shash_tfm; - SHASH_DESC_ON_STACK(shash, shash_tfm); + struct nvme_auth_hmac_ctx hmac; struct nvmet_ctrl *ctrl = req->sq->ctrl; - const char *hash_name; u8 *challenge = req->sq->dhchap_c1; struct nvme_dhchap_key *transformed_key; u8 buf[4]; int ret; - hash_name = nvme_auth_hmac_name(ctrl->shash_id); - if (!hash_name) { - pr_warn("Hash ID %d invalid\n", ctrl->shash_id); - return -EINVAL; - } - - shash_tfm = crypto_alloc_shash(hash_name, 0, 0); - if (IS_ERR(shash_tfm)) { - pr_err("failed to allocate shash %s\n", hash_name); - return PTR_ERR(shash_tfm); - } - - if (shash_len != crypto_shash_digestsize(shash_tfm)) { - pr_err("%s: hash len mismatch (len %d digest %d)\n", - __func__, shash_len, - crypto_shash_digestsize(shash_tfm)); - ret = -EINVAL; - goto out_free_tfm; - } - transformed_key = nvme_auth_transform_key(ctrl->host_key, ctrl->hostnqn); - if (IS_ERR(transformed_key)) { - ret = PTR_ERR(transformed_key); - goto out_free_tfm; - } + if (IS_ERR(transformed_key)) + return PTR_ERR(transformed_key); - ret = crypto_shash_setkey(shash_tfm, transformed_key->key, + ret = nvme_auth_hmac_init(&hmac, ctrl->shash_id, transformed_key->key, transformed_key->len); if (ret) goto out_free_response; + if (shash_len != nvme_auth_hmac_hash_len(ctrl->shash_id)) { + pr_err("%s: 
hash len mismatch (len %u digest %zu)\n", __func__, + shash_len, nvme_auth_hmac_hash_len(ctrl->shash_id)); + ret = -EINVAL; + goto out_free_response; + } + if (ctrl->dh_gid != NVME_AUTH_DHGROUP_NULL) { challenge = kmalloc(shash_len, GFP_KERNEL); if (!challenge) { @@ -345,101 +318,67 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, req->sq->dhchap_c1, challenge, shash_len); if (ret) - goto out; + goto out_free_challenge; } pr_debug("ctrl %d qid %d host response seq %u transaction %d\n", ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1, req->sq->dhchap_tid); - shash->tfm = shash_tfm; - ret = crypto_shash_init(shash); - if (ret) - goto out; - ret = crypto_shash_update(shash, challenge, shash_len); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, challenge, shash_len); + put_unaligned_le32(req->sq->dhchap_s1, buf); - ret = crypto_shash_update(shash, buf, 4); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 4); + put_unaligned_le16(req->sq->dhchap_tid, buf); - ret = crypto_shash_update(shash, buf, 2); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 2); + *buf = req->sq->sc_c; - ret = crypto_shash_update(shash, buf, 1); - if (ret) - goto out; - ret = crypto_shash_update(shash, "HostHost", 8); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 1); + nvme_auth_hmac_update(&hmac, "HostHost", 8); memset(buf, 0, 4); - ret = crypto_shash_update(shash, ctrl->hostnqn, strlen(ctrl->hostnqn)); - if (ret) - goto out; - ret = crypto_shash_update(shash, buf, 1); - if (ret) - goto out; - ret = crypto_shash_update(shash, ctrl->subsys->subsysnqn, - strlen(ctrl->subsys->subsysnqn)); - if (ret) - goto out; - ret = crypto_shash_final(shash, response); -out: + nvme_auth_hmac_update(&hmac, ctrl->hostnqn, strlen(ctrl->hostnqn)); + nvme_auth_hmac_update(&hmac, buf, 1); + nvme_auth_hmac_update(&hmac, ctrl->subsys->subsysnqn, + strlen(ctrl->subsys->subsysnqn)); + nvme_auth_hmac_final(&hmac, response); + ret = 0; +out_free_challenge: if (challenge 
!= req->sq->dhchap_c1) kfree(challenge); out_free_response: + memzero_explicit(&hmac, sizeof(hmac)); nvme_auth_free_key(transformed_key); -out_free_tfm: - crypto_free_shash(shash_tfm); return ret; } int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response, unsigned int shash_len) { - struct crypto_shash *shash_tfm; - struct shash_desc *shash; + struct nvme_auth_hmac_ctx hmac; struct nvmet_ctrl *ctrl = req->sq->ctrl; - const char *hash_name; u8 *challenge = req->sq->dhchap_c2; struct nvme_dhchap_key *transformed_key; u8 buf[4]; int ret; - hash_name = nvme_auth_hmac_name(ctrl->shash_id); - if (!hash_name) { - pr_warn("Hash ID %d invalid\n", ctrl->shash_id); - return -EINVAL; - } - - shash_tfm = crypto_alloc_shash(hash_name, 0, 0); - if (IS_ERR(shash_tfm)) { - pr_err("failed to allocate shash %s\n", hash_name); - return PTR_ERR(shash_tfm); - } - - if (shash_len != crypto_shash_digestsize(shash_tfm)) { - pr_debug("%s: hash len mismatch (len %d digest %d)\n", - __func__, shash_len, - crypto_shash_digestsize(shash_tfm)); - ret = -EINVAL; - goto out_free_tfm; - } - transformed_key = nvme_auth_transform_key(ctrl->ctrl_key, ctrl->subsys->subsysnqn); - if (IS_ERR(transformed_key)) { - ret = PTR_ERR(transformed_key); - goto out_free_tfm; - } + if (IS_ERR(transformed_key)) + return PTR_ERR(transformed_key); - ret = crypto_shash_setkey(shash_tfm, transformed_key->key, + ret = nvme_auth_hmac_init(&hmac, ctrl->shash_id, transformed_key->key, transformed_key->len); if (ret) goto out_free_response; + if (shash_len != nvme_auth_hmac_hash_len(ctrl->shash_id)) { + pr_err("%s: hash len mismatch (len %u digest %zu)\n", __func__, + shash_len, nvme_auth_hmac_hash_len(ctrl->shash_id)); + ret = -EINVAL; + goto out_free_response; + } + if (ctrl->dh_gid != NVME_AUTH_DHGROUP_NULL) { challenge = kmalloc(shash_len, GFP_KERNEL); if (!challenge) { @@ -455,55 +394,29 @@ int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response, goto out_free_challenge; } - shash = kzalloc(sizeof(*shash) + 
crypto_shash_descsize(shash_tfm), - GFP_KERNEL); - if (!shash) { - ret = -ENOMEM; - goto out_free_challenge; - } - shash->tfm = shash_tfm; + nvme_auth_hmac_update(&hmac, challenge, shash_len); - ret = crypto_shash_init(shash); - if (ret) - goto out; - ret = crypto_shash_update(shash, challenge, shash_len); - if (ret) - goto out; put_unaligned_le32(req->sq->dhchap_s2, buf); - ret = crypto_shash_update(shash, buf, 4); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 4); + put_unaligned_le16(req->sq->dhchap_tid, buf); - ret = crypto_shash_update(shash, buf, 2); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 2); + memset(buf, 0, 4); - ret = crypto_shash_update(shash, buf, 1); - if (ret) - goto out; - ret = crypto_shash_update(shash, "Controller", 10); - if (ret) - goto out; - ret = crypto_shash_update(shash, ctrl->subsys->subsysnqn, - strlen(ctrl->subsys->subsysnqn)); - if (ret) - goto out; - ret = crypto_shash_update(shash, buf, 1); - if (ret) - goto out; - ret = crypto_shash_update(shash, ctrl->hostnqn, strlen(ctrl->hostnqn)); - if (ret) - goto out; - ret = crypto_shash_final(shash, response); -out: - kfree(shash); + nvme_auth_hmac_update(&hmac, buf, 1); + nvme_auth_hmac_update(&hmac, "Controller", 10); + nvme_auth_hmac_update(&hmac, ctrl->subsys->subsysnqn, + strlen(ctrl->subsys->subsysnqn)); + nvme_auth_hmac_update(&hmac, buf, 1); + nvme_auth_hmac_update(&hmac, ctrl->hostnqn, strlen(ctrl->hostnqn)); + nvme_auth_hmac_final(&hmac, response); + ret = 0; out_free_challenge: if (challenge != req->sq->dhchap_c2) kfree(challenge); out_free_response: + memzero_explicit(&hmac, sizeof(hmac)); nvme_auth_free_key(transformed_key); -out_free_tfm: - crypto_free_shash(shash_tfm); return ret; } @@ -531,7 +444,7 @@ int nvmet_auth_ctrl_exponential(struct nvmet_req *req, } int nvmet_auth_ctrl_sesskey(struct nvmet_req *req, - u8 *pkey, int pkey_size) + const u8 *pkey, int pkey_size) { struct nvmet_ctrl *ctrl = req->sq->ctrl; int ret; @@ -557,7 +470,8 @@ int 
nvmet_auth_ctrl_sesskey(struct nvmet_req *req, void nvmet_auth_insert_psk(struct nvmet_sq *sq) { int hash_len = nvme_auth_hmac_hash_len(sq->ctrl->shash_id); - u8 *psk, *digest, *tls_psk; + u8 *psk, *tls_psk; + char *digest; size_t psk_len; int ret; #ifdef CONFIG_NVME_TARGET_TCP_TLS diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 3088e044dbcb..463348c7f097 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -17,7 +17,6 @@ #include #endif #include -#include #include #include @@ -2181,8 +2180,6 @@ static ssize_t nvmet_host_dhchap_hash_store(struct config_item *item, hmac_id = nvme_auth_hmac_id(page); if (hmac_id == NVME_AUTH_HASH_INVALID) return -EINVAL; - if (!crypto_has_shash(nvme_auth_hmac_name(hmac_id), 0, 0)) - return -ENOTSUPP; host->dhchap_hash_id = hmac_id; return count; } diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 9238e13bd480..45f686175fea 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1688,7 +1688,7 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args) if (args->hostid) uuid_copy(&ctrl->hostid, args->hostid); - dhchap_status = nvmet_setup_auth(ctrl, args->sq); + dhchap_status = nvmet_setup_auth(ctrl, args->sq, false); if (dhchap_status) { pr_err("Failed to setup authentication, dhchap status %u\n", dhchap_status); @@ -1944,12 +1944,13 @@ static int __init nvmet_init(void) if (!nvmet_bvec_cache) return -ENOMEM; - zbd_wq = alloc_workqueue("nvmet-zbd-wq", WQ_MEM_RECLAIM, 0); + zbd_wq = alloc_workqueue("nvmet-zbd-wq", WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!zbd_wq) goto out_destroy_bvec_cache; buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq", - WQ_MEM_RECLAIM, 0); + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!buffered_io_wq) goto out_free_zbd_work_queue; diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index 5946681cb0e3..b9ab80c7a694 100644 --- 
a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include "nvmet.h" @@ -75,8 +74,7 @@ static u8 nvmet_auth_negotiate(struct nvmet_req *req, void *d) for (i = 0; i < data->auth_protocol[0].dhchap.halen; i++) { u8 host_hmac_id = data->auth_protocol[0].dhchap.idlist[i]; - if (!fallback_hash_id && - crypto_has_shash(nvme_auth_hmac_name(host_hmac_id), 0, 0)) + if (!fallback_hash_id && nvme_auth_hmac_hash_len(host_hmac_id)) fallback_hash_id = host_hmac_id; if (ctrl->shash_id != host_hmac_id) continue; @@ -293,7 +291,8 @@ void nvmet_execute_auth_send(struct nvmet_req *req) pr_debug("%s: ctrl %d qid %d reset negotiation\n", __func__, ctrl->cntlid, req->sq->qid); if (!req->sq->qid) { - dhchap_status = nvmet_setup_auth(ctrl, req->sq); + dhchap_status = nvmet_setup_auth(ctrl, req->sq, + true); if (dhchap_status) { pr_err("ctrl %d qid 0 failed to setup re-authentication\n", ctrl->cntlid); @@ -391,14 +390,15 @@ done: req->sq->dhchap_step != NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) { unsigned long auth_expire_secs = ctrl->kato ? 
ctrl->kato : 120; - mod_delayed_work(system_wq, &req->sq->auth_expired_work, + mod_delayed_work(system_percpu_wq, &req->sq->auth_expired_work, auth_expire_secs * HZ); goto complete; } /* Final states, clear up variables */ - nvmet_auth_sq_free(req->sq); - if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) + if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) { + nvmet_auth_sq_free(req->sq); nvmet_ctrl_fatal_error(ctrl); + } complete: nvmet_req_complete(req, status); @@ -574,9 +574,7 @@ void nvmet_execute_auth_receive(struct nvmet_req *req) status = nvmet_copy_to_sgl(req, 0, d, al); kfree(d); done: - if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2) - nvmet_auth_sq_free(req->sq); - else if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) { + if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) { nvmet_auth_sq_free(req->sq); nvmet_ctrl_fatal_error(ctrl); } diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 4eaadc711c99..d161707559ce 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -792,9 +792,9 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, if (!queue) return NULL; - queue->work_q = alloc_workqueue("ntfc%d.%d.%d", 0, 0, - assoc->tgtport->fc_target_port.port_num, - assoc->a_id, qid); + queue->work_q = alloc_workqueue("ntfc%d.%d.%d", WQ_PERCPU, 0, + assoc->tgtport->fc_target_port.port_num, + assoc->a_id, qid); if (!queue->work_q) goto out_free_queue; diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index f15d1c213bc6..f2d9e8901df4 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -30,11 +30,11 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) id->nacwu = lpp0b; /* - * Bit 4 indicates that the fields NPWG, NPWA, NPDG, NPDA, and - * NOWS are defined for this namespace and should be used by - * the host for I/O optimization. 
+ * OPTPERF = 11b indicates that the fields NPWG, NPWA, NPDG, NPDA, + * NPDGL, NPDAL, and NOWS are defined for this namespace and should be + * used by the host for I/O optimization. */ - id->nsfeat |= 1 << 4; + id->nsfeat |= 0x3 << NVME_NS_FEAT_OPTPERF_SHIFT; /* NPWG = Namespace Preferred Write Granularity. 0's based */ id->npwg = to0based(bdev_io_min(bdev) / bdev_logical_block_size(bdev)); /* NPWA = Namespace Preferred Write Alignment. 0's based */ @@ -52,6 +52,17 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) id->dlfeat = (1 << 3) | 0x1; } +void nvmet_bdev_set_nvm_limits(struct block_device *bdev, + struct nvme_id_ns_nvm *id) +{ + /* + * NPDGL = Namespace Preferred Deallocate Granularity Large + * NPDAL = Namespace Preferred Deallocate Alignment Large + */ + id->npdgl = id->npdal = cpu_to_le32(bdev_discard_granularity(bdev) / + bdev_logical_block_size(bdev)); +} + void nvmet_bdev_ns_disable(struct nvmet_ns *ns) { if (ns->bdev_file) { diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 4b3f4f11928d..d98d0cdc5d6f 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -419,7 +419,6 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) { if (ctrl->ctrl.queue_count > 1) { nvme_quiesce_io_queues(&ctrl->ctrl); - nvme_cancel_tagset(&ctrl->ctrl); nvme_loop_destroy_io_queues(ctrl); } @@ -427,7 +426,6 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) if (nvme_ctrl_state(&ctrl->ctrl) == NVME_CTRL_LIVE) nvme_disable_ctrl(&ctrl->ctrl, true); - nvme_cancel_admin_tagset(&ctrl->ctrl); nvme_loop_destroy_admin_queue(ctrl); } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 319d6a5e9cf0..50070cfb782a 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -550,6 +550,8 @@ void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl); u16 nvmet_parse_connect_cmd(struct nvmet_req *req); u32 nvmet_connect_cmd_data_len(struct 
nvmet_req *req); void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id); +void nvmet_bdev_set_nvm_limits(struct block_device *bdev, + struct nvme_id_ns_nvm *id); u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req); u16 nvmet_file_parse_io_cmd(struct nvmet_req *req); u16 nvmet_bdev_zns_parse_io_cmd(struct nvmet_req *req); @@ -896,7 +898,7 @@ void nvmet_execute_auth_receive(struct nvmet_req *req); int nvmet_auth_set_key(struct nvmet_host *host, const char *secret, bool set_ctrl); int nvmet_auth_set_host_hash(struct nvmet_host *host, const char *hash); -u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq); +u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, bool reset); void nvmet_auth_sq_init(struct nvmet_sq *sq); void nvmet_destroy_auth(struct nvmet_ctrl *ctrl); void nvmet_auth_sq_free(struct nvmet_sq *sq); @@ -913,11 +915,11 @@ static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq) int nvmet_auth_ctrl_exponential(struct nvmet_req *req, u8 *buf, int buf_size); int nvmet_auth_ctrl_sesskey(struct nvmet_req *req, - u8 *buf, int buf_size); + const u8 *pkey, int pkey_size); void nvmet_auth_insert_psk(struct nvmet_sq *sq); #else static inline u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, - struct nvmet_sq *sq) + struct nvmet_sq *sq, bool reset) { return 0; } diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index acc71a26733f..4b8b02341ddc 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -2225,7 +2225,7 @@ static int __init nvmet_tcp_init(void) int ret; nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", - WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU, 0); if (!nvmet_tcp_wq) return -ENOMEM; diff --git a/drivers/scsi/scsi_bsg.c b/drivers/scsi/scsi_bsg.c index a9a9ec086a7e..e80dec53174e 100644 --- a/drivers/scsi/scsi_bsg.c +++ b/drivers/scsi/scsi_bsg.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include 
#include #include @@ -9,6 +10,178 @@ #define uptr64(val) ((void __user *)(uintptr_t)(val)) +/* + * Per-command BSG SCSI PDU stored in io_uring_cmd.pdu[32]. + * Holds temporary state between submission, completion and task_work. + */ +struct scsi_bsg_uring_cmd_pdu { + struct bio *bio; /* mapped user buffer, unmap in task work */ + struct request *req; /* block request, freed in task work */ + u64 response_addr; /* user space response buffer address */ +}; +static_assert(sizeof(struct scsi_bsg_uring_cmd_pdu) <= sizeof_field(struct io_uring_cmd, pdu)); + +static inline struct scsi_bsg_uring_cmd_pdu *scsi_bsg_uring_cmd_pdu( + struct io_uring_cmd *ioucmd) +{ + return io_uring_cmd_to_pdu(ioucmd, struct scsi_bsg_uring_cmd_pdu); +} + +/* Task work: build res2 (layout in uapi/linux/bsg.h) and copy sense to user. */ +static void scsi_bsg_uring_task_cb(struct io_tw_req tw_req, io_tw_token_t tw) +{ + struct io_uring_cmd *ioucmd = io_uring_cmd_from_tw(tw_req); + struct scsi_bsg_uring_cmd_pdu *pdu = scsi_bsg_uring_cmd_pdu(ioucmd); + struct request *rq = pdu->req; + struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); + u64 res2; + int ret = 0; + u8 driver_status = 0; + u8 sense_len_wr = 0; + + if (pdu->bio) + blk_rq_unmap_user(pdu->bio); + + if (scsi_status_is_check_condition(scmd->result)) { + driver_status = DRIVER_SENSE; + if (pdu->response_addr) + sense_len_wr = min_t(u8, scmd->sense_len, + SCSI_SENSE_BUFFERSIZE); + } + + if (sense_len_wr) { + if (copy_to_user(uptr64(pdu->response_addr), scmd->sense_buffer, + sense_len_wr)) + ret = -EFAULT; + } + + res2 = bsg_scsi_res2_build(status_byte(scmd->result), driver_status, + host_byte(scmd->result), sense_len_wr, + scmd->resid_len); + + blk_mq_free_request(rq); + io_uring_cmd_done32(ioucmd, ret, res2, + IO_URING_CMD_TASK_WORK_ISSUE_FLAGS); +} + +static enum rq_end_io_ret scsi_bsg_uring_cmd_done(struct request *req, + blk_status_t status, + const struct io_comp_batch *iocb) +{ + struct io_uring_cmd *ioucmd = req->end_io_data; + + 
io_uring_cmd_do_in_task_lazy(ioucmd, scsi_bsg_uring_task_cb); + return RQ_END_IO_NONE; +} + +static int scsi_bsg_map_user_buffer(struct request *req, + struct io_uring_cmd *ioucmd, + unsigned int issue_flags, gfp_t gfp_mask) +{ + const struct bsg_uring_cmd *cmd = io_uring_sqe128_cmd(ioucmd->sqe, struct bsg_uring_cmd); + bool is_write = cmd->dout_xfer_len > 0; + u64 buf_addr = is_write ? cmd->dout_xferp : cmd->din_xferp; + unsigned long buf_len = is_write ? cmd->dout_xfer_len : cmd->din_xfer_len; + struct iov_iter iter; + int ret; + + if (ioucmd->flags & IORING_URING_CMD_FIXED) { + ret = io_uring_cmd_import_fixed(buf_addr, buf_len, + is_write ? WRITE : READ, + &iter, ioucmd, issue_flags); + if (ret < 0) + return ret; + ret = blk_rq_map_user_iov(req->q, req, NULL, &iter, gfp_mask); + } else { + ret = blk_rq_map_user(req->q, req, NULL, uptr64(buf_addr), + buf_len, gfp_mask); + } + + return ret; +} + +static int scsi_bsg_uring_cmd(struct request_queue *q, struct io_uring_cmd *ioucmd, + unsigned int issue_flags, bool open_for_write) +{ + struct scsi_bsg_uring_cmd_pdu *pdu = scsi_bsg_uring_cmd_pdu(ioucmd); + const struct bsg_uring_cmd *cmd = io_uring_sqe128_cmd(ioucmd->sqe, struct bsg_uring_cmd); + struct scsi_cmnd *scmd; + struct request *req; + blk_mq_req_flags_t blk_flags = 0; + gfp_t gfp_mask = GFP_KERNEL; + int ret; + + if (cmd->protocol != BSG_PROTOCOL_SCSI || + cmd->subprotocol != BSG_SUB_PROTOCOL_SCSI_CMD) + return -EINVAL; + + if (!cmd->request || cmd->request_len == 0) + return -EINVAL; + + if (cmd->dout_xfer_len && cmd->din_xfer_len) { + pr_warn_once("BIDI support in bsg has been removed.\n"); + return -EOPNOTSUPP; + } + + if (cmd->dout_iovec_count > 0 || cmd->din_iovec_count > 0) + return -EOPNOTSUPP; + + if (issue_flags & IO_URING_F_NONBLOCK) { + blk_flags = BLK_MQ_REQ_NOWAIT; + gfp_mask = GFP_NOWAIT; + } + + req = scsi_alloc_request(q, cmd->dout_xfer_len ? 
+ REQ_OP_DRV_OUT : REQ_OP_DRV_IN, blk_flags); + if (IS_ERR(req)) + return PTR_ERR(req); + + scmd = blk_mq_rq_to_pdu(req); + if (cmd->request_len > sizeof(scmd->cmnd)) { + ret = -EINVAL; + goto out_free_req; + } + scmd->cmd_len = cmd->request_len; + scmd->allowed = SG_DEFAULT_RETRIES; + + if (copy_from_user(scmd->cmnd, uptr64(cmd->request), cmd->request_len)) { + ret = -EFAULT; + goto out_free_req; + } + + if (!scsi_cmd_allowed(scmd->cmnd, open_for_write)) { + ret = -EPERM; + goto out_free_req; + } + + pdu->response_addr = cmd->response; + scmd->sense_len = cmd->max_response_len ? + min(cmd->max_response_len, SCSI_SENSE_BUFFERSIZE) : SCSI_SENSE_BUFFERSIZE; + + if (cmd->dout_xfer_len || cmd->din_xfer_len) { + ret = scsi_bsg_map_user_buffer(req, ioucmd, issue_flags, gfp_mask); + if (ret) + goto out_free_req; + pdu->bio = req->bio; + } else { + pdu->bio = NULL; + } + + req->timeout = cmd->timeout_ms ? + msecs_to_jiffies(cmd->timeout_ms) : BLK_DEFAULT_SG_TIMEOUT; + + req->end_io = scsi_bsg_uring_cmd_done; + req->end_io_data = ioucmd; + pdu->req = req; + + blk_execute_rq_nowait(req, false); + return -EIOCBQUEUED; + +out_free_req: + blk_mq_free_request(req); + return ret; +} + static int scsi_bsg_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, bool open_for_write, unsigned int timeout) { @@ -99,5 +272,6 @@ out_put_request: struct bsg_device *scsi_bsg_register_queue(struct scsi_device *sdev) { return bsg_register_queue(sdev->request_queue, &sdev->sdev_gendev, - dev_name(&sdev->sdev_gendev), scsi_bsg_sg_io_fn); + dev_name(&sdev->sdev_gendev), scsi_bsg_sg_io_fn, + scsi_bsg_uring_cmd); } diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 3d593af30aa5..62ced9f5102f 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -173,7 +173,7 @@ static int fd_configure_device(struct se_device *dev) */ dev->dev_attrib.max_write_same_len = 0xFFFF; - if (bdev_nonrot(bdev)) + if (!bdev_rot(bdev)) 
dev->dev_attrib.is_nonrot = 1; } else { if (!(fd_dev->fbd_flags & FBDF_HAS_SIZE)) { diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 3c92f94497b4..1087d1d17c36 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -148,7 +148,7 @@ static int iblock_configure_device(struct se_device *dev) else dev->dev_attrib.max_write_same_len = 0xFFFF; - if (bdev_nonrot(bd)) + if (!bdev_rot(bd)) dev->dev_attrib.is_nonrot = 1; target_configure_write_atomic_from_bdev(&dev->dev_attrib, bd); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6b8e810a35ce..0af16946dcda 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -694,7 +694,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - if (!bdev_nonrot(file_bdev(bdev_file))) + if (bdev_rot(file_bdev(bdev_file))) fs_devices->rotating = true; if (bdev_max_discard_sectors(file_bdev(bdev_file))) @@ -2919,7 +2919,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path atomic64_add(device->total_bytes, &fs_info->free_chunk_space); - if (!bdev_nonrot(device->bdev)) + if (bdev_rot(device->bdev)) fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index 6f5bfbb0e8a4..c75b91ae0cf0 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -73,7 +73,7 @@ static int mbt_mb_init(struct super_block *sb) ext4_fsblk_t block; int ret; - /* needed by ext4_mb_init->bdev_nonrot(sb->s_bdev) */ + /* needed by ext4_mb_init->bdev_rot(sb->s_bdev) */ sb->s_bdev = kzalloc_obj(*sb->s_bdev); if (sb->s_bdev == NULL) return -ENOMEM; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index ea0c1c27cb8c..a44514a3f5fe 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3840,7 +3840,7 @@ int ext4_mb_init(struct super_block *sb) 
spin_lock_init(&lg->lg_prealloc_lock); } - if (bdev_nonrot(sb->s_bdev)) + if (!bdev_rot(sb->s_bdev)) sbi->s_mb_max_linear_groups = 0; else sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index 309f70098524..b2626a482563 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -670,7 +670,6 @@ xfs_zone_gc_start_chunk( struct xfs_inode *ip; struct bio *bio; xfs_daddr_t daddr; - unsigned int len; bool is_seq; if (xfs_is_shutdown(mp)) @@ -685,15 +684,16 @@ xfs_zone_gc_start_chunk( return false; } - len = XFS_FSB_TO_B(mp, irec.rm_blockcount); - bio = bio_alloc_bioset(bdev, - min(howmany(len, XFS_GC_BUF_SIZE) + 1, XFS_GC_NR_BUFS), - REQ_OP_READ, GFP_NOFS, &data->bio_set); - + /* + * Scratch allocation can wrap around to the same buffer again, + * provision an extra bvec for that case. + */ + bio = bio_alloc_bioset(bdev, XFS_GC_NR_BUFS + 1, REQ_OP_READ, GFP_NOFS, + &data->bio_set); chunk = container_of(bio, struct xfs_gc_bio, bio); chunk->ip = ip; chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset); - chunk->len = len; + chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount); chunk->old_startblock = xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock); chunk->new_daddr = daddr; @@ -707,8 +707,9 @@ xfs_zone_gc_start_chunk( bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock); bio->bi_end_io = xfs_zone_gc_end_io; xfs_zone_gc_add_data(chunk); - data->scratch_head = (data->scratch_head + len) % data->scratch_size; - data->scratch_available -= len; + data->scratch_head = + (data->scratch_head + chunk->len) % data->scratch_size; + data->scratch_available -= chunk->len; XFS_STATS_INC(mp, xs_gc_read_calls); @@ -899,9 +900,10 @@ out: static void xfs_submit_zone_reset_bio( - struct xfs_rtgroup *rtg, - struct bio *bio) + struct bio *bio, + void *priv) { + struct xfs_rtgroup *rtg = priv; struct xfs_mount *mp = rtg_mount(rtg); trace_xfs_zone_reset(rtg); @@ -933,26 +935,16 @@ xfs_submit_zone_reset_bio( 
submit_bio(bio); } -static void xfs_bio_wait_endio(struct bio *bio) -{ - complete(bio->bi_private); -} - int xfs_zone_gc_reset_sync( struct xfs_rtgroup *rtg) { - DECLARE_COMPLETION_ONSTACK(done); struct bio bio; int error; bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0, REQ_OP_ZONE_RESET | REQ_SYNC); - bio.bi_private = &done; - bio.bi_end_io = xfs_bio_wait_endio; - xfs_submit_zone_reset_bio(rtg, &bio); - wait_for_completion_io(&done); - + bio_await(&bio, rtg, xfs_submit_zone_reset_bio); error = blk_status_to_errno(bio.bi_status); bio_uninit(&bio); return error; @@ -989,7 +981,7 @@ xfs_zone_gc_reset_zones( chunk->data = data; WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); list_add_tail(&chunk->entry, &data->resetting); - xfs_submit_zone_reset_bio(rtg, bio); + xfs_submit_zone_reset_bio(bio, rtg); } while (next); } diff --git a/include/crypto/hkdf.h b/include/crypto/hkdf.h deleted file mode 100644 index 6a9678f508f5..000000000000 --- a/include/crypto/hkdf.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * HKDF: HMAC-based Key Derivation Function (HKDF), RFC 5869 - * - * Extracted from fs/crypto/hkdf.c, which has - * Copyright 2019 Google LLC - */ - -#ifndef _CRYPTO_HKDF_H -#define _CRYPTO_HKDF_H - -#include - -int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, - unsigned int ikmlen, const u8 *salt, unsigned int saltlen, - u8 *prk); -int hkdf_expand(struct crypto_shash *hmac_tfm, - const u8 *info, unsigned int infolen, - u8 *okm, unsigned int okmlen); -#endif diff --git a/include/linux/bio.h b/include/linux/bio.h index 9693a0d6fefe..97d747320b35 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -350,8 +350,7 @@ extern void bioset_exit(struct bio_set *); extern int biovec_init_pool(mempool_t *pool, int pool_entries); struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, - blk_opf_t opf, gfp_t gfp_mask, - struct bio_set *bs); + blk_opf_t opf, gfp_t gfp, struct bio_set *bs); struct 
bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask); extern void bio_put(struct bio *); @@ -433,6 +432,8 @@ extern void bio_uninit(struct bio *); void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf); void bio_reuse(struct bio *bio, blk_opf_t opf); void bio_chain(struct bio *, struct bio *); +void bio_await(struct bio *bio, void *priv, + void (*submit)(struct bio *bio, void *priv)); int __must_check bio_add_page(struct bio *bio, struct page *page, unsigned len, unsigned off); diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index ea6d7d322ae3..b1b530613c34 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -14,6 +14,7 @@ enum blk_integrity_flags { BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, BLK_INTEGRITY_REF_TAG = 1 << 3, BLK_INTEGRITY_STACKED = 1 << 4, + BLK_SPLIT_INTERVAL_CAPABLE = 1 << 5, }; const char *blk_integrity_profile_name(struct blk_integrity *bi); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b5e2f3c5e5c0..890128cdea1c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -201,10 +202,14 @@ struct gendisk { u8 __rcu *zones_cond; unsigned int zone_wplugs_hash_bits; atomic_t nr_zone_wplugs; - spinlock_t zone_wplugs_lock; + spinlock_t zone_wplugs_hash_lock; struct mempool *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; + spinlock_t zone_wplugs_list_lock; + struct list_head zone_wplugs_list; + struct task_struct *zone_wplugs_worker; + struct completion zone_wplugs_worker_bio_done; #endif /* CONFIG_BLK_DEV_ZONED */ #if IS_ENABLED(CONFIG_CDROM) @@ -503,7 +508,7 @@ struct request_queue { /* hw dispatch queues */ unsigned int nr_hw_queues; - struct blk_mq_hw_ctx * __rcu *queue_hw_ctx; + struct blk_mq_hw_ctx * __rcu *queue_hw_ctx __counted_by_ptr(nr_hw_queues); struct percpu_ref q_usage_counter; struct lock_class_key 
io_lock_cls_key; @@ -669,6 +674,7 @@ enum { QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */ QUEUE_FLAG_QOS_ENABLED, /* qos is enabled */ QUEUE_FLAG_BIO_ISSUE_TIME, /* record bio->issue_time_ns */ + QUEUE_FLAG_ZONED_QD1_WRITES, /* Limit zoned devices writes to QD=1 */ QUEUE_FLAG_MAX }; @@ -708,6 +714,8 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); test_bit(QUEUE_FLAG_DISABLE_WBT_DEF, &(q)->queue_flags) #define blk_queue_no_elv_switch(q) \ test_bit(QUEUE_FLAG_NO_ELV_SWITCH, &(q)->queue_flags) +#define blk_queue_zoned_qd1_writes(q) \ + test_bit(QUEUE_FLAG_ZONED_QD1_WRITES, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); @@ -1468,11 +1476,6 @@ static inline bool bdev_rot(struct block_device *bdev) return blk_queue_rot(bdev_get_queue(bdev)); } -static inline bool bdev_nonrot(struct block_device *bdev) -{ - return !bdev_rot(bdev); -} - static inline bool bdev_synchronous(struct block_device *bdev) { return bdev->bd_disk->queue->limits.features & BLK_FEAT_SYNCHRONOUS; diff --git a/include/linux/bsg.h b/include/linux/bsg.h index ee2df73edf83..162730bfc2d8 100644 --- a/include/linux/bsg.h +++ b/include/linux/bsg.h @@ -7,13 +7,17 @@ struct bsg_device; struct device; struct request_queue; +struct io_uring_cmd; typedef int (bsg_sg_io_fn)(struct request_queue *, struct sg_io_v4 *hdr, bool open_for_write, unsigned int timeout); +typedef int (bsg_uring_cmd_fn)(struct request_queue *q, struct io_uring_cmd *ioucmd, + unsigned int issue_flags, bool open_for_write); + struct bsg_device *bsg_register_queue(struct request_queue *q, struct device *parent, const char *name, - bsg_sg_io_fn *sg_io_fn); + bsg_sg_io_fn *sg_io_fn, bsg_uring_cmd_fn *uring_cmd_fn); void bsg_unregister_queue(struct bsg_device *bcd); #endif /* _LINUX_BSG_H */ diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 06fb60471aaf..d36dd476feda 100644 --- a/include/linux/bvec.h +++ 
b/include/linux/bvec.h @@ -203,15 +203,6 @@ static inline void bvec_iter_advance_single(const struct bio_vec *bv, ((bvl = mp_bvec_iter_bvec((bio_vec), (iter))), 1); \ bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len)) -/* for iterating one bio from start to end */ -#define BVEC_ITER_ALL_INIT (struct bvec_iter) \ -{ \ - .bi_sector = 0, \ - .bi_size = UINT_MAX, \ - .bi_idx = 0, \ - .bi_bvec_done = 0, \ -} - static inline struct bio_vec *bvec_init_iter_all(struct bvec_iter_all *iter_all) { iter_all->done = 0; diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index 53f44b8cd75f..f53c534aba0c 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -87,7 +87,7 @@ */ GENL_struct(DRBD_NLA_CFG_REPLY, 1, drbd_cfg_reply, /* "arbitrary" size strings, nla_policy.len = 0 */ - __str_field(1, DRBD_GENLA_F_MANDATORY, info_text, 0) + __str_field(1, 0, info_text, 0) ) /* Configuration requests typically need a context to operate on. @@ -96,10 +96,10 @@ GENL_struct(DRBD_NLA_CFG_REPLY, 1, drbd_cfg_reply, * and/or the replication group (aka resource) name, * and the volume id within the resource. 
*/ GENL_struct(DRBD_NLA_CFG_CONTEXT, 2, drbd_cfg_context, - __u32_field(1, DRBD_GENLA_F_MANDATORY, ctx_volume) - __str_field(2, DRBD_GENLA_F_MANDATORY, ctx_resource_name, 128) - __bin_field(3, DRBD_GENLA_F_MANDATORY, ctx_my_addr, 128) - __bin_field(4, DRBD_GENLA_F_MANDATORY, ctx_peer_addr, 128) + __u32_field(1, 0, ctx_volume) + __str_field(2, 0, ctx_resource_name, 128) + __bin_field(3, 0, ctx_my_addr, 128) + __bin_field(4, 0, ctx_peer_addr, 128) ) GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf, @@ -108,86 +108,86 @@ GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf, __s32_field(3, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev_idx) /* use the resize command to try and change the disk_size */ - __u64_field(4, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, disk_size) + __u64_field(4, DRBD_F_INVARIANT, disk_size) /* we could change the max_bio_bvecs, * but it won't propagate through the stack */ - __u32_field(5, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, max_bio_bvecs) + __u32_field(5, DRBD_F_INVARIANT, max_bio_bvecs) - __u32_field_def(6, DRBD_GENLA_F_MANDATORY, on_io_error, DRBD_ON_IO_ERROR_DEF) - __u32_field_def(7, DRBD_GENLA_F_MANDATORY, fencing, DRBD_FENCING_DEF) + __u32_field_def(6, 0, on_io_error, DRBD_ON_IO_ERROR_DEF) + __u32_field_def(7, 0, fencing, DRBD_FENCING_DEF) - __u32_field_def(8, DRBD_GENLA_F_MANDATORY, resync_rate, DRBD_RESYNC_RATE_DEF) - __s32_field_def(9, DRBD_GENLA_F_MANDATORY, resync_after, DRBD_MINOR_NUMBER_DEF) - __u32_field_def(10, DRBD_GENLA_F_MANDATORY, al_extents, DRBD_AL_EXTENTS_DEF) - __u32_field_def(11, DRBD_GENLA_F_MANDATORY, c_plan_ahead, DRBD_C_PLAN_AHEAD_DEF) - __u32_field_def(12, DRBD_GENLA_F_MANDATORY, c_delay_target, DRBD_C_DELAY_TARGET_DEF) - __u32_field_def(13, DRBD_GENLA_F_MANDATORY, c_fill_target, DRBD_C_FILL_TARGET_DEF) - __u32_field_def(14, DRBD_GENLA_F_MANDATORY, c_max_rate, DRBD_C_MAX_RATE_DEF) - __u32_field_def(15, DRBD_GENLA_F_MANDATORY, c_min_rate, DRBD_C_MIN_RATE_DEF) - __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, 
DRBD_DISK_TIMEOUT_DEF) + __u32_field_def(8, 0, resync_rate, DRBD_RESYNC_RATE_DEF) + __s32_field_def(9, 0, resync_after, DRBD_MINOR_NUMBER_DEF) + __u32_field_def(10, 0, al_extents, DRBD_AL_EXTENTS_DEF) + __u32_field_def(11, 0, c_plan_ahead, DRBD_C_PLAN_AHEAD_DEF) + __u32_field_def(12, 0, c_delay_target, DRBD_C_DELAY_TARGET_DEF) + __u32_field_def(13, 0, c_fill_target, DRBD_C_FILL_TARGET_DEF) + __u32_field_def(14, 0, c_max_rate, DRBD_C_MAX_RATE_DEF) + __u32_field_def(15, 0, c_min_rate, DRBD_C_MIN_RATE_DEF) + __u32_field_def(20, 0, disk_timeout, DRBD_DISK_TIMEOUT_DEF) __u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF) __u32_field_def(25, 0 /* OPTIONAL */, rs_discard_granularity, DRBD_RS_DISCARD_GRANULARITY_DEF) - __flg_field_def(16, DRBD_GENLA_F_MANDATORY, disk_barrier, DRBD_DISK_BARRIER_DEF) - __flg_field_def(17, DRBD_GENLA_F_MANDATORY, disk_flushes, DRBD_DISK_FLUSHES_DEF) - __flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF) - __flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF) + __flg_field_def(16, 0, disk_barrier, DRBD_DISK_BARRIER_DEF) + __flg_field_def(17, 0, disk_flushes, DRBD_DISK_FLUSHES_DEF) + __flg_field_def(18, 0, disk_drain, DRBD_DISK_DRAIN_DEF) + __flg_field_def(19, 0, md_flushes, DRBD_MD_FLUSHES_DEF) __flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF) __flg_field_def(24, 0 /* OPTIONAL */, discard_zeroes_if_aligned, DRBD_DISCARD_ZEROES_IF_ALIGNED_DEF) __flg_field_def(26, 0 /* OPTIONAL */, disable_write_same, DRBD_DISABLE_WRITE_SAME_DEF) ) GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts, - __str_field_def(1, DRBD_GENLA_F_MANDATORY, cpu_mask, DRBD_CPU_MASK_SIZE) - __u32_field_def(2, DRBD_GENLA_F_MANDATORY, on_no_data, DRBD_ON_NO_DATA_DEF) + __str_field_def(1, 0, cpu_mask, DRBD_CPU_MASK_SIZE) + __u32_field_def(2, 0, on_no_data, DRBD_ON_NO_DATA_DEF) ) GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf, - __str_field_def(1, DRBD_GENLA_F_MANDATORY | DRBD_F_SENSITIVE, + 
__str_field_def(1, DRBD_F_SENSITIVE, shared_secret, SHARED_SECRET_MAX) - __str_field_def(2, DRBD_GENLA_F_MANDATORY, cram_hmac_alg, SHARED_SECRET_MAX) - __str_field_def(3, DRBD_GENLA_F_MANDATORY, integrity_alg, SHARED_SECRET_MAX) - __str_field_def(4, DRBD_GENLA_F_MANDATORY, verify_alg, SHARED_SECRET_MAX) - __str_field_def(5, DRBD_GENLA_F_MANDATORY, csums_alg, SHARED_SECRET_MAX) - __u32_field_def(6, DRBD_GENLA_F_MANDATORY, wire_protocol, DRBD_PROTOCOL_DEF) - __u32_field_def(7, DRBD_GENLA_F_MANDATORY, connect_int, DRBD_CONNECT_INT_DEF) - __u32_field_def(8, DRBD_GENLA_F_MANDATORY, timeout, DRBD_TIMEOUT_DEF) - __u32_field_def(9, DRBD_GENLA_F_MANDATORY, ping_int, DRBD_PING_INT_DEF) - __u32_field_def(10, DRBD_GENLA_F_MANDATORY, ping_timeo, DRBD_PING_TIMEO_DEF) - __u32_field_def(11, DRBD_GENLA_F_MANDATORY, sndbuf_size, DRBD_SNDBUF_SIZE_DEF) - __u32_field_def(12, DRBD_GENLA_F_MANDATORY, rcvbuf_size, DRBD_RCVBUF_SIZE_DEF) - __u32_field_def(13, DRBD_GENLA_F_MANDATORY, ko_count, DRBD_KO_COUNT_DEF) - __u32_field_def(14, DRBD_GENLA_F_MANDATORY, max_buffers, DRBD_MAX_BUFFERS_DEF) - __u32_field_def(15, DRBD_GENLA_F_MANDATORY, max_epoch_size, DRBD_MAX_EPOCH_SIZE_DEF) - __u32_field_def(16, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) - __u32_field_def(17, DRBD_GENLA_F_MANDATORY, after_sb_0p, DRBD_AFTER_SB_0P_DEF) - __u32_field_def(18, DRBD_GENLA_F_MANDATORY, after_sb_1p, DRBD_AFTER_SB_1P_DEF) - __u32_field_def(19, DRBD_GENLA_F_MANDATORY, after_sb_2p, DRBD_AFTER_SB_2P_DEF) - __u32_field_def(20, DRBD_GENLA_F_MANDATORY, rr_conflict, DRBD_RR_CONFLICT_DEF) - __u32_field_def(21, DRBD_GENLA_F_MANDATORY, on_congestion, DRBD_ON_CONGESTION_DEF) - __u32_field_def(22, DRBD_GENLA_F_MANDATORY, cong_fill, DRBD_CONG_FILL_DEF) - __u32_field_def(23, DRBD_GENLA_F_MANDATORY, cong_extents, DRBD_CONG_EXTENTS_DEF) - __flg_field_def(24, DRBD_GENLA_F_MANDATORY, two_primaries, DRBD_ALLOW_TWO_PRIMARIES_DEF) - __flg_field(25, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, discard_my_data) - 
__flg_field_def(26, DRBD_GENLA_F_MANDATORY, tcp_cork, DRBD_TCP_CORK_DEF) - __flg_field_def(27, DRBD_GENLA_F_MANDATORY, always_asbp, DRBD_ALWAYS_ASBP_DEF) - __flg_field(28, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, tentative) - __flg_field_def(29, DRBD_GENLA_F_MANDATORY, use_rle, DRBD_USE_RLE_DEF) - /* 9: __u32_field_def(30, DRBD_GENLA_F_MANDATORY, fencing_policy, DRBD_FENCING_DEF) */ - /* 9: __str_field_def(31, DRBD_GENLA_F_MANDATORY, name, SHARED_SECRET_MAX) */ + __str_field_def(2, 0, cram_hmac_alg, SHARED_SECRET_MAX) + __str_field_def(3, 0, integrity_alg, SHARED_SECRET_MAX) + __str_field_def(4, 0, verify_alg, SHARED_SECRET_MAX) + __str_field_def(5, 0, csums_alg, SHARED_SECRET_MAX) + __u32_field_def(6, 0, wire_protocol, DRBD_PROTOCOL_DEF) + __u32_field_def(7, 0, connect_int, DRBD_CONNECT_INT_DEF) + __u32_field_def(8, 0, timeout, DRBD_TIMEOUT_DEF) + __u32_field_def(9, 0, ping_int, DRBD_PING_INT_DEF) + __u32_field_def(10, 0, ping_timeo, DRBD_PING_TIMEO_DEF) + __u32_field_def(11, 0, sndbuf_size, DRBD_SNDBUF_SIZE_DEF) + __u32_field_def(12, 0, rcvbuf_size, DRBD_RCVBUF_SIZE_DEF) + __u32_field_def(13, 0, ko_count, DRBD_KO_COUNT_DEF) + __u32_field_def(14, 0, max_buffers, DRBD_MAX_BUFFERS_DEF) + __u32_field_def(15, 0, max_epoch_size, DRBD_MAX_EPOCH_SIZE_DEF) + __u32_field_def(16, 0, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) + __u32_field_def(17, 0, after_sb_0p, DRBD_AFTER_SB_0P_DEF) + __u32_field_def(18, 0, after_sb_1p, DRBD_AFTER_SB_1P_DEF) + __u32_field_def(19, 0, after_sb_2p, DRBD_AFTER_SB_2P_DEF) + __u32_field_def(20, 0, rr_conflict, DRBD_RR_CONFLICT_DEF) + __u32_field_def(21, 0, on_congestion, DRBD_ON_CONGESTION_DEF) + __u32_field_def(22, 0, cong_fill, DRBD_CONG_FILL_DEF) + __u32_field_def(23, 0, cong_extents, DRBD_CONG_EXTENTS_DEF) + __flg_field_def(24, 0, two_primaries, DRBD_ALLOW_TWO_PRIMARIES_DEF) + __flg_field(25, DRBD_F_INVARIANT, discard_my_data) + __flg_field_def(26, 0, tcp_cork, DRBD_TCP_CORK_DEF) + __flg_field_def(27, 0, always_asbp, 
DRBD_ALWAYS_ASBP_DEF) + __flg_field(28, DRBD_F_INVARIANT, tentative) + __flg_field_def(29, 0, use_rle, DRBD_USE_RLE_DEF) + /* 9: __u32_field_def(30, 0, fencing_policy, DRBD_FENCING_DEF) */ + /* 9: __str_field_def(31, 0, name, SHARED_SECRET_MAX) */ /* 9: __u32_field(32, DRBD_F_REQUIRED | DRBD_F_INVARIANT, peer_node_id) */ __flg_field_def(33, 0 /* OPTIONAL */, csums_after_crash_only, DRBD_CSUMS_AFTER_CRASH_ONLY_DEF) __u32_field_def(34, 0 /* OPTIONAL */, sock_check_timeo, DRBD_SOCKET_CHECK_TIMEO_DEF) ) GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms, - __flg_field(1, DRBD_GENLA_F_MANDATORY, assume_uptodate) + __flg_field(1, 0, assume_uptodate) ) GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms, - __u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size) - __flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force) - __flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync) + __u64_field(1, 0, resize_size) + __flg_field(2, 0, resize_force) + __flg_field(3, 0, no_resync) __u32_field_def(4, 0 /* OPTIONAL */, al_stripes, DRBD_AL_STRIPES_DEF) __u32_field_def(5, 0 /* OPTIONAL */, al_stripe_size, DRBD_AL_STRIPE_SIZE_DEF) ) @@ -195,31 +195,31 @@ GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms, GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info, /* the reason of the broadcast, * if this is an event triggered broadcast. */ - __u32_field(1, DRBD_GENLA_F_MANDATORY, sib_reason) + __u32_field(1, 0, sib_reason) __u32_field(2, DRBD_F_REQUIRED, current_state) - __u64_field(3, DRBD_GENLA_F_MANDATORY, capacity) - __u64_field(4, DRBD_GENLA_F_MANDATORY, ed_uuid) + __u64_field(3, 0, capacity) + __u64_field(4, 0, ed_uuid) /* These are for broadcast from after state change work. * prev_state and new_state are from the moment the state change took * place, new_state is not neccessarily the same as current_state, * there may have been more state changes since. Which will be * broadcasted soon, in their respective after state change work. 
*/ - __u32_field(5, DRBD_GENLA_F_MANDATORY, prev_state) - __u32_field(6, DRBD_GENLA_F_MANDATORY, new_state) + __u32_field(5, 0, prev_state) + __u32_field(6, 0, new_state) /* if we have a local disk: */ - __bin_field(7, DRBD_GENLA_F_MANDATORY, uuids, (UI_SIZE*sizeof(__u64))) - __u32_field(8, DRBD_GENLA_F_MANDATORY, disk_flags) - __u64_field(9, DRBD_GENLA_F_MANDATORY, bits_total) - __u64_field(10, DRBD_GENLA_F_MANDATORY, bits_oos) + __bin_field(7, 0, uuids, (UI_SIZE*sizeof(__u64))) + __u32_field(8, 0, disk_flags) + __u64_field(9, 0, bits_total) + __u64_field(10, 0, bits_oos) /* and in case resync or online verify is active */ - __u64_field(11, DRBD_GENLA_F_MANDATORY, bits_rs_total) - __u64_field(12, DRBD_GENLA_F_MANDATORY, bits_rs_failed) + __u64_field(11, 0, bits_rs_total) + __u64_field(12, 0, bits_rs_failed) /* for pre and post notifications of helper execution */ - __str_field(13, DRBD_GENLA_F_MANDATORY, helper, 32) - __u32_field(14, DRBD_GENLA_F_MANDATORY, helper_exit_code) + __str_field(13, 0, helper, 32) + __u32_field(14, 0, helper_exit_code) __u64_field(15, 0, send_cnt) __u64_field(16, 0, recv_cnt) @@ -233,12 +233,12 @@ GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info, ) GENL_struct(DRBD_NLA_START_OV_PARMS, 9, start_ov_parms, - __u64_field(1, DRBD_GENLA_F_MANDATORY, ov_start_sector) - __u64_field(2, DRBD_GENLA_F_MANDATORY, ov_stop_sector) + __u64_field(1, 0, ov_start_sector) + __u64_field(2, 0, ov_stop_sector) ) GENL_struct(DRBD_NLA_NEW_C_UUID_PARMS, 10, new_c_uuid_parms, - __flg_field(1, DRBD_GENLA_F_MANDATORY, clear_bm) + __flg_field(1, 0, clear_bm) ) GENL_struct(DRBD_NLA_TIMEOUT_PARMS, 11, timeout_parms, @@ -246,11 +246,11 @@ GENL_struct(DRBD_NLA_TIMEOUT_PARMS, 11, timeout_parms, ) GENL_struct(DRBD_NLA_DISCONNECT_PARMS, 12, disconnect_parms, - __flg_field(1, DRBD_GENLA_F_MANDATORY, force_disconnect) + __flg_field(1, 0, force_disconnect) ) GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms, - __flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach) + 
__flg_field(1, 0, force_detach) ) GENL_struct(DRBD_NLA_RESOURCE_INFO, 15, resource_info, @@ -315,12 +315,12 @@ GENL_struct(DRBD_NLA_PEER_DEVICE_STATISTICS, 22, peer_device_statistics, ) GENL_struct(DRBD_NLA_NOTIFICATION_HEADER, 23, drbd_notification_header, - __u32_field(1, DRBD_GENLA_F_MANDATORY, nh_type) + __u32_field(1, 0, nh_type) ) GENL_struct(DRBD_NLA_HELPER, 24, drbd_helper_info, - __str_field(1, DRBD_GENLA_F_MANDATORY, helper_name, 32) - __u32_field(2, DRBD_GENLA_F_MANDATORY, helper_status) + __str_field(1, 0, helper_name, 32) + __u32_field(2, 0, helper_status) ) /* @@ -333,9 +333,9 @@ GENL_notification( DRBD_EVENT, 1, events, GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) GENL_tla_expected(DRBD_NLA_STATE_INFO, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_SYNCER_CONF, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_NET_CONF, 0) + GENL_tla_expected(DRBD_NLA_DISK_CONF, 0) + GENL_tla_expected(DRBD_NLA_SYNCER_CONF, 0) ) /* query kernel for specific or all info */ @@ -349,7 +349,7 @@ GENL_op( ), /* To select the object .doit. * Or a subset of objects in .dumpit. 
*/ - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, 0) ) /* add DRBD minor devices as volumes to resources */ @@ -367,7 +367,7 @@ GENL_op(DRBD_ADM_DEL_RESOURCE, 8, GENL_doit(drbd_adm_del_resource), GENL_op(DRBD_ADM_RESOURCE_OPTS, 9, GENL_doit(drbd_adm_resource_opts), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_RESOURCE_OPTS, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_RESOURCE_OPTS, 0) ) GENL_op( @@ -403,7 +403,7 @@ GENL_op( DRBD_ADM_RESIZE, 13, GENL_doit(drbd_adm_resize), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_RESIZE_PARMS, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_RESIZE_PARMS, 0) ) GENL_op( @@ -424,18 +424,18 @@ GENL_op( DRBD_ADM_NEW_C_UUID, 16, GENL_doit(drbd_adm_new_c_uuid), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_NEW_C_UUID_PARMS, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_NEW_C_UUID_PARMS, 0) ) GENL_op( DRBD_ADM_START_OV, 17, GENL_doit(drbd_adm_start_ov), - GENL_tla_expected(DRBD_NLA_START_OV_PARMS, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_START_OV_PARMS, 0) ) GENL_op(DRBD_ADM_DETACH, 18, GENL_doit(drbd_adm_detach), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_DETACH_PARMS, DRBD_GENLA_F_MANDATORY)) + GENL_tla_expected(DRBD_NLA_DETACH_PARMS, 0)) GENL_op(DRBD_ADM_INVALIDATE, 19, GENL_doit(drbd_adm_invalidate), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) @@ -460,36 +460,36 @@ GENL_op(DRBD_ADM_GET_RESOURCES, 30, GENL_op_init( .dumpit = drbd_adm_dump_resources, ), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, 0) + GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, 0) + 
GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, 0)) GENL_op(DRBD_ADM_GET_DEVICES, 31, GENL_op_init( .dumpit = drbd_adm_dump_devices, .done = drbd_adm_dump_devices_done, ), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, 0) + GENL_tla_expected(DRBD_NLA_DEVICE_INFO, 0) + GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, 0)) GENL_op(DRBD_ADM_GET_CONNECTIONS, 32, GENL_op_init( .dumpit = drbd_adm_dump_connections, .done = drbd_adm_dump_connections_done, ), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_GENLA_F_MANDATORY)) + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, 0) + GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, 0) + GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, 0)) GENL_op(DRBD_ADM_GET_PEER_DEVICES, 33, GENL_op_init( .dumpit = drbd_adm_dump_peer_devices, .done = drbd_adm_dump_peer_devices_done, ), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, 0) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, 0) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, 0)) GENL_notification( DRBD_RESOURCE_STATE, 34, events, @@ -524,7 +524,7 @@ GENL_op( GENL_op_init( .dumpit = drbd_adm_get_initial_state, ), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)) + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, 0)) GENL_notification( DRBD_HELPER, 40, events, diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index d4da060b7532..a7d36c9ea924 100644 --- a/include/linux/genl_magic_func.h +++ 
b/include/linux/genl_magic_func.h @@ -149,7 +149,8 @@ static int __ ## s_name ## _from_attrs(struct s_name *s, \ if (!tla) \ return -ENOMSG; \ DPRINT_TLA(#s_name, "<=-", #tag_name); \ - err = drbd_nla_parse_nested(ntb, maxtype, tla, s_name ## _nl_policy); \ + err = nla_parse_nested_deprecated(ntb, maxtype, tla, \ + s_name ## _nl_policy, NULL); \ if (err) \ return err; \ \ @@ -292,6 +293,10 @@ static struct genl_family ZZZ_genl_family __ro_after_init = { #endif .maxattr = ARRAY_SIZE(CONCATENATE(GENL_MAGIC_FAMILY, _tla_nl_policy))-1, .policy = CONCATENATE(GENL_MAGIC_FAMILY, _tla_nl_policy), +#ifdef GENL_MAGIC_FAMILY_PRE_DOIT + .pre_doit = GENL_MAGIC_FAMILY_PRE_DOIT, + .post_doit = GENL_MAGIC_FAMILY_POST_DOIT, +#endif .ops = ZZZ_genl_ops, .n_ops = ARRAY_SIZE(ZZZ_genl_ops), .mcgrps = ZZZ_genl_mcgrps, diff --git a/include/linux/genl_magic_struct.h b/include/linux/genl_magic_struct.h index 621b87a87d74..2200cedd160a 100644 --- a/include/linux/genl_magic_struct.h +++ b/include/linux/genl_magic_struct.h @@ -25,16 +25,6 @@ extern void CONCATENATE(GENL_MAGIC_FAMILY, _genl_unregister)(void); * Extension of genl attribute validation policies {{{2 */ -/* - * @DRBD_GENLA_F_MANDATORY: By default, netlink ignores attributes it does not - * know about. This flag can be set in nlattr->nla_type to indicate that this - * attribute must not be ignored. - * - * We check and remove this flag in drbd_nla_check_mandatory() before - * validating the attribute types and lengths via nla_parse_nested(). 
- */ -#define DRBD_GENLA_F_MANDATORY (1 << 14) - /* * Flags specific to drbd and not visible at the netlink layer, used in * _from_attrs and _to_skb: @@ -52,7 +42,6 @@ extern void CONCATENATE(GENL_MAGIC_FAMILY, _genl_unregister)(void); #define DRBD_F_SENSITIVE (1 << 1) #define DRBD_F_INVARIANT (1 << 2) -#define __nla_type(x) ((__u16)((x) & NLA_TYPE_MASK & ~DRBD_GENLA_F_MANDATORY)) /* }}}1 * MAGIC @@ -158,12 +147,12 @@ enum { \ #undef __field #define __field(attr_nr, attr_flag, name, nla_type, type, \ __get, __put, __is_signed) \ - T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)), + T_ ## name = (__u16)(attr_nr), #undef __array #define __array(attr_nr, attr_flag, name, nla_type, type, \ maxlen, __get, __put, __is_signed) \ - T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)), + T_ ## name = (__u16)(attr_nr), #include GENL_MAGIC_INCLUDE_FILE diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index e75c29c51464..682f81046345 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -7,6 +7,7 @@ #define _NVME_AUTH_H #include +#include struct nvme_dhchap_key { size_t len; @@ -20,32 +21,44 @@ const char *nvme_auth_dhgroup_kpp(u8 dhgroup_id); u8 nvme_auth_dhgroup_id(const char *dhgroup_name); const char *nvme_auth_hmac_name(u8 hmac_id); -const char *nvme_auth_digest_name(u8 hmac_id); size_t nvme_auth_hmac_hash_len(u8 hmac_id); u8 nvme_auth_hmac_id(const char *hmac_name); +struct nvme_auth_hmac_ctx { + u8 hmac_id; + union { + struct hmac_sha256_ctx sha256; + struct hmac_sha384_ctx sha384; + struct hmac_sha512_ctx sha512; + }; +}; +int nvme_auth_hmac_init(struct nvme_auth_hmac_ctx *hmac, u8 hmac_id, + const u8 *key, size_t key_len); +void nvme_auth_hmac_update(struct nvme_auth_hmac_ctx *hmac, const u8 *data, + size_t data_len); +void nvme_auth_hmac_final(struct nvme_auth_hmac_ctx *hmac, u8 *out); u32 nvme_auth_key_struct_size(u32 key_len); -struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char 
*secret, - u8 key_hash); +struct nvme_dhchap_key *nvme_auth_extract_key(const char *secret, u8 key_hash); void nvme_auth_free_key(struct nvme_dhchap_key *key); struct nvme_dhchap_key *nvme_auth_alloc_key(u32 len, u8 hash); struct nvme_dhchap_key *nvme_auth_transform_key( - struct nvme_dhchap_key *key, char *nqn); -int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key); -int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len, - u8 *challenge, u8 *aug, size_t hlen); + const struct nvme_dhchap_key *key, const char *nqn); +int nvme_auth_parse_key(const char *secret, struct nvme_dhchap_key **ret_key); +int nvme_auth_augmented_challenge(u8 hmac_id, const u8 *skey, size_t skey_len, + const u8 *challenge, u8 *aug, size_t hlen); int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid); int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm, u8 *host_key, size_t host_key_len); int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, - u8 *ctrl_key, size_t ctrl_key_len, + const u8 *ctrl_key, size_t ctrl_key_len, u8 *sess_key, size_t sess_key_len); -int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len, - u8 *c1, u8 *c2, size_t hash_len, +int nvme_auth_generate_psk(u8 hmac_id, const u8 *skey, size_t skey_len, + const u8 *c1, const u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len); -int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len, - char *subsysnqn, char *hostnqn, u8 **ret_digest); -int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len, - u8 *psk_digest, u8 **ret_psk); +int nvme_auth_generate_digest(u8 hmac_id, const u8 *psk, size_t psk_len, + const char *subsysnqn, const char *hostnqn, + char **ret_digest); +int nvme_auth_derive_tls_psk(int hmac_id, const u8 *psk, size_t psk_len, + const char *psk_digest, u8 **ret_psk); #endif /* _NVME_AUTH_H */ diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 655d194f8e72..041f30931a90 100644 --- a/include/linux/nvme.h +++ 
b/include/linux/nvme.h @@ -513,9 +513,16 @@ struct nvme_id_ns_nvm { __u8 pic; __u8 rsvd9[3]; __le32 elbaf[64]; - __u8 rsvd268[3828]; + __le32 npdgl; + __le32 nprg; + __le32 npra; + __le32 nors; + __le32 npdal; + __u8 rsvd288[3808]; }; +static_assert(sizeof(struct nvme_id_ns_nvm) == 4096); + enum { NVME_ID_NS_NVM_STS_MASK = 0x7f, NVME_ID_NS_NVM_GUARD_SHIFT = 7, @@ -590,7 +597,11 @@ enum { enum { NVME_NS_FEAT_THIN = 1 << 0, NVME_NS_FEAT_ATOMICS = 1 << 1, - NVME_NS_FEAT_IO_OPT = 1 << 4, + NVME_NS_FEAT_OPTPERF_SHIFT = 4, + /* In NVMe version 2.0 and below, OPTPERF is only bit 4 of NSFEAT */ + NVME_NS_FEAT_OPTPERF_MASK = 0x1, + /* Since version 2.1, OPTPERF is bits 4 and 5 of NSFEAT */ + NVME_NS_FEAT_OPTPERF_MASK_2_1 = 0x3, NVME_NS_ATTR_RO = 1 << 0, NVME_NS_FLBAS_LBA_MASK = 0xf, NVME_NS_FLBAS_LBA_UMASK = 0x60, @@ -1837,6 +1848,11 @@ enum { NVME_AUTH_HASH_INVALID = 0xff, }; +/* Maximum digest size for any NVME_AUTH_HASH_* value */ +enum { + NVME_AUTH_MAX_DIGEST_SIZE = 64, +}; + /* Defined Diffie-Hellman group identifiers for DH-HMAC-CHAP authentication */ enum { NVME_AUTH_DHGROUP_NULL = 0x00, @@ -2332,4 +2348,8 @@ enum nvme_pr_change_ptpl { #define NVME_PR_IGNORE_KEY (1 << 3) +/* Section 8.3.4.5.2 of the NVMe 2.1 */ +#define NVME_AUTH_DHCHAP_MAX_HASH_IDS 30 +#define NVME_AUTH_DHCHAP_MAX_DH_IDS 30 + #endif /* _LINUX_NVME_H */ diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 80f33a93f944..0630430cc01a 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -53,6 +53,11 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_DISCOVERY: case IOC_OPAL_REVERT_LSP: case IOC_OPAL_SET_SID_PW: + case IOC_OPAL_REACTIVATE_LSP: + case IOC_OPAL_LR_SET_START_LEN: + case IOC_OPAL_ENABLE_DISABLE_LR: + case IOC_OPAL_GET_SUM_STATUS: + case IOC_OPAL_STACK_RESET: return true; } return false; diff --git a/include/uapi/linux/bsg.h b/include/uapi/linux/bsg.h index cd6302def5ed..6cff77f5b857 100644 --- a/include/uapi/linux/bsg.h +++ 
b/include/uapi/linux/bsg.h @@ -2,6 +2,9 @@ #ifndef _UAPIBSG_H #define _UAPIBSG_H +#ifdef __KERNEL__ +#include +#endif /* __KERNEL__ */ #include #define BSG_PROTOCOL_SCSI 0 @@ -63,5 +66,77 @@ struct sg_io_v4 { __u32 padding; }; +struct bsg_uring_cmd { + __u64 request; /* [i], [*i] command descriptor address */ + __u32 request_len; /* [i] command descriptor length in bytes */ + __u32 protocol; /* [i] protocol type (BSG_PROTOCOL_*) */ + __u32 subprotocol; /* [i] subprotocol type (BSG_SUB_PROTOCOL_*) */ + __u32 max_response_len; /* [i] response buffer size in bytes */ + + __u64 response; /* [i], [*o] response data address */ + __u64 dout_xferp; /* [i], [*i] */ + __u32 dout_xfer_len; /* [i] bytes to be transferred to device */ + __u32 dout_iovec_count; /* [i] 0 -> "flat" dout transfer else + * dout_xferp points to array of iovec + */ + __u64 din_xferp; /* [i], [*o] */ + __u32 din_xfer_len; /* [i] bytes to be transferred from device */ + __u32 din_iovec_count; /* [i] 0 -> "flat" din transfer */ + + __u32 timeout_ms; /* [i] timeout in milliseconds */ + __u8 reserved[12]; /* reserved for future extension */ +}; + +#ifdef __KERNEL__ +/* Must match IORING_OP_URING_CMD payload size (e.g. SQE128). */ +static_assert(sizeof(struct bsg_uring_cmd) == 80); +#endif /* __KERNEL__ */ + + +/* + * SCSI BSG io_uring completion (res2, 64-bit) + * + * When using BSG_PROTOCOL_SCSI + BSG_SUB_PROTOCOL_SCSI_CMD with + * IORING_OP_URING_CMD, the completion queue entry (CQE) contains: + * - result: errno (0 on success) + * - res2: packed SCSI status + * + * res2 bit layout: + * [0..7] device_status (SCSI status byte, e.g. CHECK_CONDITION) + * [8..15] driver_status (e.g. DRIVER_SENSE when sense data is valid) + * [16..23] host_status (e.g. 
DID_OK, DID_TIME_OUT) + * [24..31] sense_len_wr (bytes of sense data written to response buffer) + * [32..63] resid_len (residual transfer length) + */ +static inline __u8 bsg_scsi_res2_device_status(__u64 res2) +{ + return res2 & 0xff; +} +static inline __u8 bsg_scsi_res2_driver_status(__u64 res2) +{ + return res2 >> 8; +} +static inline __u8 bsg_scsi_res2_host_status(__u64 res2) +{ + return res2 >> 16; +} +static inline __u8 bsg_scsi_res2_sense_len(__u64 res2) +{ + return res2 >> 24; +} +static inline __u32 bsg_scsi_res2_resid_len(__u64 res2) +{ + return res2 >> 32; +} +static inline __u64 bsg_scsi_res2_build(__u8 device_status, __u8 driver_status, + __u8 host_status, __u8 sense_len_wr, + __u32 resid_len) +{ + return ((__u64)(__u32)(resid_len) << 32) | + ((__u64)sense_len_wr << 24) | + ((__u64)host_status << 16) | + ((__u64)driver_status << 8) | + (__u64)device_status; +} #endif /* _UAPIBSG_H */ diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 9025dd5a4f0f..ef4d3be6ca7f 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -74,6 +74,19 @@ struct opal_lr_act { __u8 align[2]; /* Align to 8 byte boundary */ }; +struct opal_lr_react { + struct opal_key key; + struct opal_key new_admin_key; /* Set new Admin1 PIN if key_len is > 0 */ + __u8 num_lrs; /* + * Configure selected ranges (from lr[]) in SUM. + * If num_lrs > 0 the 'entire_table' must be 0 + */ + __u8 lr[OPAL_MAX_LRS]; + __u8 range_policy; /* Set RangeStartRangeLengthPolicy parameter */ + __u8 entire_table; /* Set all locking objects in SUM */ + __u8 align[4]; /* Align to 8 byte boundary */ +}; + struct opal_session_info { __u32 sum; __u32 who; @@ -98,6 +111,18 @@ struct opal_lr_status { __u8 align[4]; }; +struct opal_sum_ranges { + /* + * Initiate Admin1 session if key_len > 0, + * use Anybody session otherwise. 
+ */ + struct opal_key key; + __u8 num_lrs; + __u8 lr[OPAL_MAX_LRS]; + __u8 range_policy; + __u8 align[5]; /* Align to 8 byte boundary */ +}; + struct opal_lock_unlock { struct opal_session_info session; __u32 l_state; @@ -216,5 +241,10 @@ struct opal_revert_lsp { #define IOC_OPAL_DISCOVERY _IOW('p', 239, struct opal_discovery) #define IOC_OPAL_REVERT_LSP _IOW('p', 240, struct opal_revert_lsp) #define IOC_OPAL_SET_SID_PW _IOW('p', 241, struct opal_new_pw) +#define IOC_OPAL_REACTIVATE_LSP _IOW('p', 242, struct opal_lr_react) +#define IOC_OPAL_LR_SET_START_LEN _IOW('p', 243, struct opal_user_lr_setup) +#define IOC_OPAL_ENABLE_DISABLE_LR _IOW('p', 244, struct opal_user_lr_setup) +#define IOC_OPAL_GET_SUM_STATUS _IOW('p', 245, struct opal_sum_ranges) +#define IOC_OPAL_STACK_RESET _IO('p', 246) #endif /* _UAPI_SED_OPAL_H */ diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index a88876756805..6991370a72ce 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -57,6 +57,45 @@ _IOWR('u', 0x16, struct ublksrv_ctrl_cmd) #define UBLK_U_CMD_TRY_STOP_DEV \ _IOWR('u', 0x17, struct ublksrv_ctrl_cmd) +/* + * Register a shared memory buffer for zero-copy I/O. + * Input: ctrl_cmd.addr points to struct ublk_shmem_buf_reg (buffer VA + size) + * ctrl_cmd.len = sizeof(struct ublk_shmem_buf_reg) + * Result: >= 0 is the assigned buffer index, < 0 is error + * + * The kernel pins pages from the calling process's address space + * and inserts PFN ranges into a per-device maple tree. When a block + * request's pages match registered pages, the driver sets + * UBLK_IO_F_SHMEM_ZC and encodes the buffer index + offset in addr, + * allowing the server to access the data via its own mapping of the + * same shared memory — true zero copy. + * + * The memory can be backed by memfd, hugetlbfs, or any GUP-compatible + * shared mapping. Queue freeze is handled internally. 
+ * + * The buffer VA and size are passed via a user buffer (not inline in + * ctrl_cmd) so that unprivileged devices can prepend the device path + * to ctrl_cmd.addr without corrupting the VA. + */ +#define UBLK_U_CMD_REG_BUF \ + _IOWR('u', 0x18, struct ublksrv_ctrl_cmd) +/* + * Unregister a shared memory buffer. + * Input: ctrl_cmd.data[0] = buffer index + */ +#define UBLK_U_CMD_UNREG_BUF \ + _IOWR('u', 0x19, struct ublksrv_ctrl_cmd) + +/* Parameter buffer for UBLK_U_CMD_REG_BUF, pointed to by ctrl_cmd.addr */ +struct ublk_shmem_buf_reg { + __u64 addr; /* userspace virtual address of shared memory */ + __u64 len; /* buffer size in bytes, page-aligned, default max 4GB */ + __u32 flags; + __u32 reserved; +}; + +/* Pin pages without FOLL_WRITE; usable with write-sealed memfd */ +#define UBLK_SHMEM_BUF_READ_ONLY (1U << 0) /* * 64bits are enough now, and it should be easy to extend in case of * running out of feature flags @@ -370,6 +409,14 @@ /* Disable automatic partition scanning when device is started */ #define UBLK_F_NO_AUTO_PART_SCAN (1ULL << 18) +/* + * Enable shared memory zero copy. When enabled, the server can register + * shared memory buffers via UBLK_U_CMD_REG_BUF. If a block request's + * pages match a registered buffer, UBLK_IO_F_SHMEM_ZC is set and addr + * encodes the buffer index + offset instead of a userspace buffer address. + */ +#define UBLK_F_SHMEM_ZC (1ULL << 19) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 @@ -469,6 +516,12 @@ struct ublksrv_ctrl_dev_info { #define UBLK_IO_F_NEED_REG_BUF (1U << 17) /* Request has an integrity data buffer */ #define UBLK_IO_F_INTEGRITY (1UL << 18) +/* + * I/O buffer is in a registered shared memory buffer. When set, the addr + * field in ublksrv_io_desc encodes buffer index and byte offset instead + * of a userspace virtual address. 
+ */ +#define UBLK_IO_F_SHMEM_ZC (1U << 19) /* * io cmd is described by this structure, and stored in share memory, indexed @@ -743,4 +796,31 @@ struct ublk_params { struct ublk_param_integrity integrity; }; +/* + * Shared memory zero-copy addr encoding for UBLK_IO_F_SHMEM_ZC. + * + * When UBLK_IO_F_SHMEM_ZC is set, ublksrv_io_desc.addr is encoded as: + * bits [0:31] = byte offset within the buffer (up to 4GB) + * bits [32:47] = buffer index (up to 65536) + * bits [48:63] = reserved (must be zero) + */ +#define UBLK_SHMEM_ZC_OFF_MASK 0xffffffffULL +#define UBLK_SHMEM_ZC_IDX_OFF 32 +#define UBLK_SHMEM_ZC_IDX_MASK 0xffffULL + +static inline __u64 ublk_shmem_zc_addr(__u16 index, __u32 offset) +{ + return ((__u64)index << UBLK_SHMEM_ZC_IDX_OFF) | offset; +} + +static inline __u16 ublk_shmem_zc_index(__u64 addr) +{ + return (addr >> UBLK_SHMEM_ZC_IDX_OFF) & UBLK_SHMEM_ZC_IDX_MASK; +} + +static inline __u32 ublk_shmem_zc_offset(__u64 addr) +{ + return (__u32)(addr & UBLK_SHMEM_ZC_OFF_MASK); +} + #endif diff --git a/mm/swapfile.c b/mm/swapfile.c index 94af29d1de88..60e21414624b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3460,7 +3460,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (si->bdev && bdev_synchronous(si->bdev)) si->flags |= SWP_SYNCHRONOUS_IO; - if (si->bdev && bdev_nonrot(si->bdev)) { + if (si->bdev && !bdev_rot(si->bdev)) { si->flags |= SWP_SOLIDSTATE; } else { atomic_inc(&nr_rotate_swap); diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 8ac2d4a682a1..ec6a8ce83d38 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -18,6 +18,7 @@ TEST_PROGS += test_generic_10.sh TEST_PROGS += test_generic_12.sh TEST_PROGS += test_generic_13.sh TEST_PROGS += test_generic_16.sh +TEST_PROGS += test_generic_17.sh TEST_PROGS += test_batch_01.sh TEST_PROGS += test_batch_02.sh @@ -51,6 +52,11 @@ TEST_PROGS += test_stripe_06.sh TEST_PROGS += 
test_part_01.sh TEST_PROGS += test_part_02.sh +TEST_PROGS += test_shmemzc_01.sh +TEST_PROGS += test_shmemzc_02.sh +TEST_PROGS += test_shmemzc_03.sh +TEST_PROGS += test_shmemzc_04.sh + TEST_PROGS += test_stress_01.sh TEST_PROGS += test_stress_02.sh TEST_PROGS += test_stress_03.sh diff --git a/tools/testing/selftests/ublk/fault_inject.c b/tools/testing/selftests/ublk/fault_inject.c index 3b897f69c014..150896e02ff8 100644 --- a/tools/testing/selftests/ublk/fault_inject.c +++ b/tools/testing/selftests/ublk/fault_inject.c @@ -10,11 +10,17 @@ #include "kublk.h" +struct fi_opts { + long long delay_ns; + bool die_during_fetch; +}; + static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) { const struct ublksrv_ctrl_dev_info *info = &dev->dev_info; unsigned long dev_size = 250UL << 30; + struct fi_opts *opts = NULL; if (ctx->auto_zc_fallback) { ublk_err("%s: not support auto_zc_fallback\n", __func__); @@ -35,17 +41,52 @@ static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx, }; ublk_set_integrity_params(ctx, &dev->tgt.params); - dev->private_data = (void *)(unsigned long)(ctx->fault_inject.delay_us * 1000); + opts = calloc(1, sizeof(*opts)); + if (!opts) { + ublk_err("%s: couldn't allocate memory for opts\n", __func__); + return -ENOMEM; + } + + opts->delay_ns = ctx->fault_inject.delay_us * 1000; + opts->die_during_fetch = ctx->fault_inject.die_during_fetch; + dev->private_data = opts; + return 0; } +static void ublk_fault_inject_pre_fetch_io(struct ublk_thread *t, + struct ublk_queue *q, int tag, + bool batch) +{ + struct fi_opts *opts = q->dev->private_data; + + if (!opts->die_during_fetch) + return; + + /* + * Each queue fetches its IOs in increasing order of tags, so + * dying just before we're about to fetch tag 1 (regardless of + * what queue we're on) guarantees that we've fetched a nonempty + * proper subset of the tags on that queue. 
+ */ + if (tag == 1) { + /* + * Ensure our commands are actually live in the kernel + * before we die. + */ + io_uring_submit(&t->ring); + raise(SIGKILL); + } +} + static int ublk_fault_inject_queue_io(struct ublk_thread *t, struct ublk_queue *q, int tag) { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); struct io_uring_sqe *sqe; + struct fi_opts *opts = q->dev->private_data; struct __kernel_timespec ts = { - .tv_nsec = (long long)q->dev->private_data, + .tv_nsec = opts->delay_ns, }; ublk_io_alloc_sqes(t, &sqe, 1); @@ -77,29 +118,34 @@ static void ublk_fault_inject_cmd_line(struct dev_ctx *ctx, int argc, char *argv { static const struct option longopts[] = { { "delay_us", 1, NULL, 0 }, + { "die_during_fetch", 1, NULL, 0 }, { 0, 0, 0, 0 } }; int option_idx, opt; ctx->fault_inject.delay_us = 0; + ctx->fault_inject.die_during_fetch = false; while ((opt = getopt_long(argc, argv, "", longopts, &option_idx)) != -1) { switch (opt) { case 0: if (!strcmp(longopts[option_idx].name, "delay_us")) ctx->fault_inject.delay_us = strtoll(optarg, NULL, 10); + if (!strcmp(longopts[option_idx].name, "die_during_fetch")) + ctx->fault_inject.die_during_fetch = strtoll(optarg, NULL, 10); } } } static void ublk_fault_inject_usage(const struct ublk_tgt_ops *ops) { - printf("\tfault_inject: [--delay_us us (default 0)]\n"); + printf("\tfault_inject: [--delay_us us (default 0)] [--die_during_fetch 1]\n"); } const struct ublk_tgt_ops fault_inject_tgt_ops = { .name = "fault_inject", .init_tgt = ublk_fault_inject_tgt_init, + .pre_fetch_io = ublk_fault_inject_pre_fetch_io, .queue_io = ublk_fault_inject_queue_io, .tgt_io_done = ublk_fault_inject_tgt_io_done, .parse_cmd_line = ublk_fault_inject_cmd_line, diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index 228af2580ac6..d28da98f917a 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -27,6 +27,40 @@ static int 
loop_queue_flush_io(struct ublk_thread *t, struct ublk_queue *q, return 1; } +/* + * Shared memory zero-copy I/O: when UBLK_IO_F_SHMEM_ZC is set, the + * request's data lives in a registered shared memory buffer. Decode + * index + offset from iod->addr and use the server's mmap of that + * buffer as the I/O buffer for the backing file. + */ +static int loop_queue_shmem_zc_io(struct ublk_thread *t, struct ublk_queue *q, + const struct ublksrv_io_desc *iod, int tag) +{ + unsigned ublk_op = ublksrv_get_op(iod); + enum io_uring_op op = ublk_to_uring_op(iod, 0); + __u64 file_offset = iod->start_sector << 9; + __u32 len = iod->nr_sectors << 9; + __u32 shmem_idx = ublk_shmem_zc_index(iod->addr); + __u32 shmem_off = ublk_shmem_zc_offset(iod->addr); + struct io_uring_sqe *sqe[1]; + void *addr; + + if (shmem_idx >= UBLK_BUF_MAX || !shmem_table[shmem_idx].mmap_base) + return -EINVAL; + + addr = shmem_table[shmem_idx].mmap_base + shmem_off; + + ublk_io_alloc_sqes(t, sqe, 1); + if (!sqe[0]) + return -ENOMEM; + + io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 1), + addr, len, file_offset); + io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); + sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); + return 1; +} + static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) { @@ -41,6 +75,10 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, void *addr = io->buf_addr; unsigned short buf_index = ublk_io_buf_idx(t, q, tag); + /* shared memory zero-copy path */ + if (iod->op_flags & UBLK_IO_F_SHMEM_ZC) + return loop_queue_shmem_zc_io(t, q, iod, tag); + if (iod->op_flags & UBLK_IO_F_INTEGRITY) { ublk_io_alloc_sqes(t, sqe, 1); /* Use second backing file for integrity data */ diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index e1c3b3c55e56..fbd9b1e7342a 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ 
b/tools/testing/selftests/ublk/kublk.c @@ -4,6 +4,7 @@ */ #include +#include #include "kublk.h" #define MAX_NR_TGT_ARG 64 @@ -796,6 +797,8 @@ static void ublk_submit_fetch_commands(struct ublk_thread *t) q = &t->dev->q[q_id]; io = &q->ios[tag]; io->buf_index = j++; + if (q->tgt_ops->pre_fetch_io) + q->tgt_ops->pre_fetch_io(t, q, tag, false); ublk_queue_io_cmd(t, io); } } else { @@ -807,6 +810,8 @@ static void ublk_submit_fetch_commands(struct ublk_thread *t) for (i = 0; i < q->q_depth; i++) { io = &q->ios[i]; io->buf_index = i; + if (q->tgt_ops->pre_fetch_io) + q->tgt_ops->pre_fetch_io(t, q, i, false); ublk_queue_io_cmd(t, io); } } @@ -983,6 +988,9 @@ static void ublk_batch_setup_queues(struct ublk_thread *t) if (t->q_map[i] == 0) continue; + if (q->tgt_ops->pre_fetch_io) + q->tgt_ops->pre_fetch_io(t, q, 0, true); + ret = ublk_batch_queue_prep_io_cmds(t, q); ublk_assert(ret >= 0); } @@ -1085,13 +1093,316 @@ static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev, } +/* + * Shared memory registration socket listener. + * + * The parent daemon context listens on a per-device unix socket at + * /run/ublk/ublkb.sock for shared memory registration requests + * from clients. Clients send a memfd via SCM_RIGHTS; the server + * registers it with the kernel, mmaps it, and returns the assigned index. 
+ */ +#define UBLK_SHMEM_SOCK_DIR "/run/ublk" + +/* defined in kublk.h, shared with file_backed.c (loop target) */ +struct ublk_shmem_entry shmem_table[UBLK_BUF_MAX]; +int shmem_count; + +static void ublk_shmem_sock_path(int dev_id, char *buf, size_t len) +{ + snprintf(buf, len, "%s/ublkb%d.sock", UBLK_SHMEM_SOCK_DIR, dev_id); +} + +static int ublk_shmem_sock_create(int dev_id) +{ + struct sockaddr_un addr = { .sun_family = AF_UNIX }; + char path[108]; + int fd; + + mkdir(UBLK_SHMEM_SOCK_DIR, 0755); + ublk_shmem_sock_path(dev_id, path, sizeof(path)); + unlink(path); + + fd = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0); + if (fd < 0) + return -1; + + snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", path); + if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + close(fd); + return -1; + } + + listen(fd, 4); + ublk_dbg(UBLK_DBG_DEV, "shmem socket created: %s\n", path); + return fd; +} + +static void ublk_shmem_sock_destroy(int dev_id, int sock_fd) +{ + char path[108]; + + if (sock_fd >= 0) + close(sock_fd); + ublk_shmem_sock_path(dev_id, path, sizeof(path)); + unlink(path); +} + +/* Receive a memfd from a client via SCM_RIGHTS */ +static int ublk_shmem_recv_fd(int client_fd) +{ + char buf[1]; + struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) }; + union { + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + struct cmsghdr align; + } u; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = u.cmsg_buf, + .msg_controllen = sizeof(u.cmsg_buf), + }; + struct cmsghdr *cmsg; + + if (recvmsg(client_fd, &msg, 0) <= 0) + return -1; + + cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg || cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_RIGHTS) + return -1; + + return *(int *)CMSG_DATA(cmsg); +} + +/* Register a shared memory buffer: store fd, mmap it, return index */ +static int ublk_shmem_register(int shmem_fd) +{ + off_t size; + void *base; + int idx; + + if (shmem_count >= UBLK_BUF_MAX) + return -1; + + size = lseek(shmem_fd, 0, 
SEEK_END); + if (size <= 0) + return -1; + + base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, + shmem_fd, 0); + if (base == MAP_FAILED) + return -1; + + idx = shmem_count++; + shmem_table[idx].fd = shmem_fd; + shmem_table[idx].mmap_base = base; + shmem_table[idx].size = size; + + ublk_dbg(UBLK_DBG_DEV, "shmem registered: index=%d fd=%d size=%zu\n", + idx, shmem_fd, (size_t)size); + return idx; +} + +static void ublk_shmem_unregister_all(void) +{ + int i; + + for (i = 0; i < shmem_count; i++) { + if (shmem_table[i].mmap_base) { + munmap(shmem_table[i].mmap_base, + shmem_table[i].size); + close(shmem_table[i].fd); + shmem_table[i].mmap_base = NULL; + } + } + shmem_count = 0; +} + +static int ublk_ctrl_reg_buf(struct ublk_dev *dev, void *addr, size_t size, + __u32 flags) +{ + struct ublk_shmem_buf_reg buf_reg = { + .addr = (unsigned long)addr, + .len = size, + .flags = flags, + }; + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_REG_BUF, + .flags = CTRL_CMD_HAS_BUF, + .addr = (unsigned long)&buf_reg, + .len = sizeof(buf_reg), + }; + + return __ublk_ctrl_cmd(dev, &data); +} + +/* + * Handle one client connection: receive memfd, mmap it, register + * the VA range with kernel, send back the assigned index. 
+ */
+static void ublk_shmem_handle_client(int sock_fd, struct ublk_dev *dev)
+{
+	int client_fd, memfd, idx, ret;
+	int32_t reply;
+	off_t size;
+	void *base;
+
+	client_fd = accept(sock_fd, NULL, NULL);
+	if (client_fd < 0)
+		return;
+
+	memfd = ublk_shmem_recv_fd(client_fd);
+	if (memfd < 0) {
+		reply = -1;
+		goto out;
+	}
+
+	/*
+	 * mmap the memfd in the server address space and store it in the
+	 * shmem table. ublk_shmem_register() owns the fd and the mapping
+	 * from here on; mapping the memfd a second time here would leak
+	 * one of the two mappings.
+	 */
+	idx = ublk_shmem_register(memfd);
+	if (idx < 0) {
+		reply = -1;
+		close(memfd);
+		goto out;
+	}
+	base = shmem_table[idx].mmap_base;
+	size = shmem_table[idx].size;
+
+	/* Register server's VA range with kernel for PFN matching */
+	ret = ublk_ctrl_reg_buf(dev, base, size, 0);
+	if (ret < 0) {
+		ublk_dbg(UBLK_DBG_DEV,
+			 "shmem_zc: kernel reg failed %d\n", ret);
+		/* Roll back the table slot taken above */
+		munmap(base, size);
+		close(memfd);
+		shmem_table[idx].mmap_base = NULL;
+		shmem_count--;
+		reply = ret;
+		goto out;
+	}
+	reply = idx;
+out:
+	send(client_fd, &reply, sizeof(reply), 0);
+	close(client_fd);
+}
+
+struct shmem_listener_info {
+	int dev_id;
+	int stop_efd;	/* eventfd to signal listener to stop */
+	int sock_fd;	/* listener socket fd (output) */
+	struct ublk_dev *dev;
+};
+
+/*
+ * Socket listener thread: runs in the parent daemon context alongside
+ * the I/O threads. Accepts shared memory registration requests from
+ * clients via SCM_RIGHTS. Exits when stop_efd is signaled.
+ */ +static void *ublk_shmem_listener_fn(void *data) +{ + struct shmem_listener_info *info = data; + struct pollfd pfds[2]; + + info->sock_fd = ublk_shmem_sock_create(info->dev_id); + if (info->sock_fd < 0) + return NULL; + + pfds[0].fd = info->sock_fd; + pfds[0].events = POLLIN; + pfds[1].fd = info->stop_efd; + pfds[1].events = POLLIN; + + while (1) { + int ret = poll(pfds, 2, -1); + + if (ret < 0) + break; + + /* Stop signal from parent */ + if (pfds[1].revents & POLLIN) + break; + + /* Client connection */ + if (pfds[0].revents & POLLIN) + ublk_shmem_handle_client(info->sock_fd, info->dev); + } + + return NULL; +} + +static int ublk_shmem_htlb_setup(const struct dev_ctx *ctx, + struct ublk_dev *dev) +{ + int fd, idx, ret; + struct stat st; + void *base; + + fd = open(ctx->htlb_path, O_RDWR); + if (fd < 0) { + ublk_err("htlb: can't open %s\n", ctx->htlb_path); + return -errno; + } + + if (fstat(fd, &st) < 0 || st.st_size <= 0) { + ublk_err("htlb: invalid file size\n"); + close(fd); + return -EINVAL; + } + + base = mmap(NULL, st.st_size, + ctx->rdonly_shmem_buf ? PROT_READ : PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + if (base == MAP_FAILED) { + ublk_err("htlb: mmap failed\n"); + close(fd); + return -ENOMEM; + } + + ret = ublk_ctrl_reg_buf(dev, base, st.st_size, + ctx->rdonly_shmem_buf ? 
UBLK_SHMEM_BUF_READ_ONLY : 0); + if (ret < 0) { + ublk_err("htlb: reg_buf failed: %d\n", ret); + munmap(base, st.st_size); + close(fd); + return ret; + } + + if (shmem_count >= UBLK_BUF_MAX) { + munmap(base, st.st_size); + close(fd); + return -ENOMEM; + } + + idx = shmem_count++; + shmem_table[idx].fd = fd; + shmem_table[idx].mmap_base = base; + shmem_table[idx].size = st.st_size; + + ublk_dbg(UBLK_DBG_DEV, "htlb registered: index=%d size=%zu\n", + idx, (size_t)st.st_size); + return 0; +} + static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) { const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info; + struct shmem_listener_info linfo = {}; struct ublk_thread_info *tinfo; unsigned long long extra_flags = 0; cpu_set_t *affinity_buf; unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL; + uint64_t stop_val = 1; + pthread_t listener; void *thread_ret; sem_t ready; int ret, i; @@ -1180,15 +1491,44 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) goto fail_start; } + if (ctx->htlb_path) { + ret = ublk_shmem_htlb_setup(ctx, dev); + if (ret < 0) { + ublk_err("htlb setup failed: %d\n", ret); + ublk_ctrl_stop_dev(dev); + goto fail_start; + } + } + ublk_ctrl_get_info(dev); if (ctx->fg) ublk_ctrl_dump(dev); else ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id); fail_start: - /* wait until we are terminated */ - for (i = 0; i < dev->nthreads; i++) + /* + * Wait for I/O threads to exit. While waiting, a listener + * thread accepts shared memory registration requests from + * clients via a per-device unix socket (SCM_RIGHTS fd passing). 
+ */ + linfo.dev_id = dinfo->dev_id; + linfo.dev = dev; + linfo.stop_efd = eventfd(0, 0); + if (linfo.stop_efd >= 0) + pthread_create(&listener, NULL, + ublk_shmem_listener_fn, &linfo); + + for (i = 0; i < (int)dev->nthreads; i++) pthread_join(tinfo[i].thread, &thread_ret); + + /* Signal listener thread to stop and wait for it */ + if (linfo.stop_efd >= 0) { + write(linfo.stop_efd, &stop_val, sizeof(stop_val)); + pthread_join(listener, NULL); + close(linfo.stop_efd); + ublk_shmem_sock_destroy(dinfo->dev_id, linfo.sock_fd); + } + ublk_shmem_unregister_all(); free(tinfo); fail: for (i = 0; i < dinfo->nr_hw_queues; i++) @@ -1618,6 +1958,7 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_SAFE_STOP_DEV), FEAT_NAME(UBLK_F_BATCH_IO), FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN), + FEAT_NAME(UBLK_F_SHMEM_ZC), }; struct ublk_dev *dev; __u64 features = 0; @@ -1790,6 +2131,9 @@ int main(int argc, char *argv[]) { "safe", 0, NULL, 0 }, { "batch", 0, NULL, 'b'}, { "no_auto_part_scan", 0, NULL, 0 }, + { "shmem_zc", 0, NULL, 0 }, + { "htlb", 1, NULL, 0 }, + { "rdonly_shmem_buf", 0, NULL, 0 }, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1905,6 +2249,12 @@ int main(int argc, char *argv[]) ctx.safe_stop = 1; if (!strcmp(longopts[option_idx].name, "no_auto_part_scan")) ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN; + if (!strcmp(longopts[option_idx].name, "shmem_zc")) + ctx.flags |= UBLK_F_SHMEM_ZC; + if (!strcmp(longopts[option_idx].name, "htlb")) + ctx.htlb_path = strdup(optarg); + if (!strcmp(longopts[option_idx].name, "rdonly_shmem_buf")) + ctx.rdonly_shmem_buf = 1; break; case '?': /* diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 02f0c55d006b..742c41d77df1 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -60,6 +60,7 @@ struct stripe_ctx { struct fault_inject_ctx { /* fault_inject */ unsigned long delay_us; + bool die_during_fetch; }; struct dev_ctx { @@ -80,6 +81,7 @@ struct 
dev_ctx {
 	unsigned int no_ublk_fixed_fd:1;
 	unsigned int safe_stop:1;
 	unsigned int no_auto_part_scan:1;
+	unsigned int rdonly_shmem_buf:1;
 	__u32 integrity_flags;
 	__u8 metadata_size;
 	__u8 pi_offset;
@@ -95,6 +97,8 @@ struct dev_ctx {
 	/* for 'update_size' command */
 	unsigned long long size;
 
+	char *htlb_path;
+
 	union {
 		struct stripe_ctx stripe;
 		struct fault_inject_ctx fault_inject;
@@ -138,6 +142,8 @@ struct ublk_tgt_ops {
 	int (*init_tgt)(const struct dev_ctx *ctx, struct ublk_dev *);
 	void (*deinit_tgt)(struct ublk_dev *);
+	void (*pre_fetch_io)(struct ublk_thread *t, struct ublk_queue *q,
+			     int tag, bool batch);
 	int (*queue_io)(struct ublk_thread *, struct ublk_queue *, int tag);
 	void (*tgt_io_done)(struct ublk_thread *, struct ublk_queue *,
 			    const struct io_uring_cqe *);
@@ -599,6 +605,18 @@ static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *
 	}
 }
 
+/* shared memory zero-copy support */
+#define UBLK_BUF_MAX 256
+
+struct ublk_shmem_entry {
+	int fd;
+	void *mmap_base;
+	size_t size;
+};
+
+extern struct ublk_shmem_entry shmem_table[UBLK_BUF_MAX];
+extern int shmem_count;
+
 extern const struct ublk_tgt_ops null_tgt_ops;
 extern const struct ublk_tgt_ops loop_tgt_ops;
 extern const struct ublk_tgt_ops stripe_tgt_ops;
diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh
index 163a40007910..af2ea4fa1111 100755
--- a/tools/testing/selftests/ublk/test_common.sh
+++ b/tools/testing/selftests/ublk/test_common.sh
@@ -88,6 +88,7 @@ _remove_tmp_dir() {
 _mkfs_mount_test()
 {
 	local dev=$1
+	shift
 	local err_code=0
 	local mnt_dir;
@@ -99,12 +100,18 @@ _mkfs_mount_test()
 	fi
 
 	mount -t ext4 "$dev" "$mnt_dir" > /dev/null 2>&1
-	umount "$dev"
-	err_code=$?
-	_remove_tmp_dir "$mnt_dir"
-	if [ $err_code -ne 0 ]; then
-		return $err_code
+	if [ $# -gt 0 ]; then
+		cd "$mnt_dir" && "$@"
+		err_code=$?
+		cd - > /dev/null
 	fi
+	umount "$dev"
+	umount_rc=$?
+	if [ $err_code -eq 0 ]; then
+		err_code=$umount_rc
+ fi + _remove_tmp_dir "$mnt_dir" + return $err_code } _check_root() { @@ -132,6 +138,7 @@ _prep_test() { local base_dir=${TMPDIR:-./ublktest-dir} mkdir -p "$base_dir" UBLK_TEST_DIR=$(mktemp -d ${base_dir}/${TID}.XXXXXX) + UBLK_TEST_DIR=$(realpath ${UBLK_TEST_DIR}) UBLK_TMP=$(mktemp ${UBLK_TEST_DIR}/ublk_test_XXXXX) [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*" echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg diff --git a/tools/testing/selftests/ublk/test_generic_17.sh b/tools/testing/selftests/ublk/test_generic_17.sh new file mode 100755 index 000000000000..2278b5fc9dba --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_17.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_prep_test "fault_inject" "teardown after incomplete recovery" + +# First start and stop a ublk server with device configured for recovery +dev_id=$(_add_ublk_dev -t fault_inject -r 1) +_check_add_dev $TID $? +state=$(__ublk_kill_daemon "${dev_id}" "QUIESCED") +if [ "$state" != "QUIESCED" ]; then + echo "device isn't quiesced($state) after $action" + ERR_CODE=255 +fi + +# Then recover the device, but use --die_during_fetch to have the ublk +# server die while a queue has some (but not all) I/Os fetched +${UBLK_PROG} recover -n "${dev_id}" --foreground -t fault_inject --die_during_fetch 1 +RECOVER_RES=$? +# 137 is the result when dying of SIGKILL +if (( RECOVER_RES != 137 )); then + echo "recover command exited with unexpected code ${RECOVER_RES}!" + ERR_CODE=255 +fi + +# Clean up the device. This can only succeed once teardown of the above +# exited ublk server completes. 
So if teardown never completes, we will +# time out here +_ublk_del_dev "${dev_id}" + +_cleanup_test "fault_inject" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_shmemzc_01.sh b/tools/testing/selftests/ublk/test_shmemzc_01.sh new file mode 100755 index 000000000000..47210af2aa20 --- /dev/null +++ b/tools/testing/selftests/ublk/test_shmemzc_01.sh @@ -0,0 +1,72 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Test: shmem_zc with hugetlbfs buffer on null target +# +# kublk and fio both mmap the same hugetlbfs file (MAP_SHARED), +# so they share physical pages. The kernel PFN match enables +# zero-copy I/O without socket-based fd passing. + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_prep_test "shmem_zc" "null target hugetlbfs shmem zero-copy test" + +if ! _have_program fio; then + echo "SKIP: fio not available" + exit "$UBLK_SKIP_CODE" +fi + +if ! grep -q hugetlbfs /proc/filesystems; then + echo "SKIP: hugetlbfs not supported" + exit "$UBLK_SKIP_CODE" +fi + +# Allocate hugepages +OLD_NR_HP=$(cat /proc/sys/vm/nr_hugepages) +echo 10 > /proc/sys/vm/nr_hugepages +NR_HP=$(cat /proc/sys/vm/nr_hugepages) +if [ "$NR_HP" -lt 2 ]; then + echo "SKIP: cannot allocate hugepages" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +# Mount hugetlbfs +HTLB_MNT=$(mktemp -d "${UBLK_TEST_DIR}/htlb_mnt_XXXXXX") +if ! mount -t hugetlbfs none "$HTLB_MNT"; then + echo "SKIP: cannot mount hugetlbfs" + rmdir "$HTLB_MNT" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +HTLB_FILE="$HTLB_MNT/ublk_buf" +fallocate -l 4M "$HTLB_FILE" + +dev_id=$(_add_ublk_dev -t null --shmem_zc --htlb "$HTLB_FILE") +_check_add_dev $TID $? + +fio --name=htlb_zc \ + --filename=/dev/ublkb"${dev_id}" \ + --ioengine=io_uring \ + --rw=randwrite \ + --direct=1 \ + --bs=4k \ + --size=4M \ + --iodepth=32 \ + --mem=mmaphuge:"$HTLB_FILE" \ + > /dev/null 2>&1 +ERR_CODE=$? 
+ +# Delete device first so daemon releases the htlb mmap +_ublk_del_dev "${dev_id}" + +rm -f "$HTLB_FILE" +umount "$HTLB_MNT" +rmdir "$HTLB_MNT" +echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + +_cleanup_test "shmem_zc" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_shmemzc_02.sh b/tools/testing/selftests/ublk/test_shmemzc_02.sh new file mode 100755 index 000000000000..aed9262494e9 --- /dev/null +++ b/tools/testing/selftests/ublk/test_shmemzc_02.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Test: shmem_zc with hugetlbfs buffer on loop target +# +# kublk and fio both mmap the same hugetlbfs file (MAP_SHARED), +# so they share physical pages. The kernel PFN match enables +# zero-copy I/O without socket-based fd passing. + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_prep_test "shmem_zc" "loop target hugetlbfs shmem zero-copy test" + +if ! _have_program fio; then + echo "SKIP: fio not available" + exit "$UBLK_SKIP_CODE" +fi + +if ! grep -q hugetlbfs /proc/filesystems; then + echo "SKIP: hugetlbfs not supported" + exit "$UBLK_SKIP_CODE" +fi + +# Allocate hugepages +OLD_NR_HP=$(cat /proc/sys/vm/nr_hugepages) +echo 10 > /proc/sys/vm/nr_hugepages +NR_HP=$(cat /proc/sys/vm/nr_hugepages) +if [ "$NR_HP" -lt 2 ]; then + echo "SKIP: cannot allocate hugepages" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +# Mount hugetlbfs +HTLB_MNT=$(mktemp -d "${UBLK_TEST_DIR}/htlb_mnt_XXXXXX") +if ! mount -t hugetlbfs none "$HTLB_MNT"; then + echo "SKIP: cannot mount hugetlbfs" + rmdir "$HTLB_MNT" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +HTLB_FILE="$HTLB_MNT/ublk_buf" +fallocate -l 4M "$HTLB_FILE" + +_create_backfile 0 128M +BACKFILE="${UBLK_BACKFILES[0]}" + +dev_id=$(_add_ublk_dev -t loop --shmem_zc --htlb "$HTLB_FILE" "$BACKFILE") +_check_add_dev $TID $? 
+ +_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" \ + --size=128M \ + --mem=mmaphuge:"$HTLB_FILE" +ERR_CODE=$? + +# Delete device first so daemon releases the htlb mmap +_ublk_del_dev "${dev_id}" + +rm -f "$HTLB_FILE" +umount "$HTLB_MNT" +rmdir "$HTLB_MNT" +echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + +_cleanup_test "shmem_zc" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_shmemzc_03.sh b/tools/testing/selftests/ublk/test_shmemzc_03.sh new file mode 100755 index 000000000000..db967a9ffe81 --- /dev/null +++ b/tools/testing/selftests/ublk/test_shmemzc_03.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Test: shmem_zc with fio verify over filesystem on loop target +# +# mkfs + mount ext4 on the ublk device, then run fio verify on a +# file inside that filesystem. Exercises the full stack: +# filesystem -> block layer -> ublk shmem_zc -> loop target backing file. + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_prep_test "shmem_zc" "loop target hugetlbfs shmem zero-copy fs verify test" + +if ! _have_program fio; then + echo "SKIP: fio not available" + exit "$UBLK_SKIP_CODE" +fi + +if ! grep -q hugetlbfs /proc/filesystems; then + echo "SKIP: hugetlbfs not supported" + exit "$UBLK_SKIP_CODE" +fi + +# Allocate hugepages +OLD_NR_HP=$(cat /proc/sys/vm/nr_hugepages) +echo 10 > /proc/sys/vm/nr_hugepages +NR_HP=$(cat /proc/sys/vm/nr_hugepages) +if [ "$NR_HP" -lt 2 ]; then + echo "SKIP: cannot allocate hugepages" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +# Mount hugetlbfs +HTLB_MNT=$(mktemp -d "${UBLK_TEST_DIR}/htlb_mnt_XXXXXX") +if ! 
mount -t hugetlbfs none "$HTLB_MNT"; then + echo "SKIP: cannot mount hugetlbfs" + rmdir "$HTLB_MNT" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +HTLB_FILE="$HTLB_MNT/ublk_buf" +fallocate -l 4M "$HTLB_FILE" + +_create_backfile 0 256M +BACKFILE="${UBLK_BACKFILES[0]}" + +dev_id=$(_add_ublk_dev -t loop --shmem_zc --htlb "$HTLB_FILE" "$BACKFILE") +_check_add_dev $TID $? + +_mkfs_mount_test /dev/ublkb"${dev_id}" \ + _run_fio_verify_io --filename=testfile \ + --size=128M \ + --mem=mmaphuge:"$HTLB_FILE" +ERR_CODE=$? + +# Delete device first so daemon releases the htlb mmap +_ublk_del_dev "${dev_id}" + +rm -f "$HTLB_FILE" +umount "$HTLB_MNT" +rmdir "$HTLB_MNT" +echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + +_cleanup_test "shmem_zc" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_shmemzc_04.sh b/tools/testing/selftests/ublk/test_shmemzc_04.sh new file mode 100755 index 000000000000..899de088ece4 --- /dev/null +++ b/tools/testing/selftests/ublk/test_shmemzc_04.sh @@ -0,0 +1,72 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Test: shmem_zc with read-only buffer registration on null target +# +# Same as test_shmemzc_01 but with --rdonly_shmem_buf: pages are pinned +# without FOLL_WRITE (UBLK_BUF_F_READ). Write I/O works because +# the server only reads from the shared buffer. + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_prep_test "shmem_zc" "null target hugetlbfs shmem zero-copy rdonly_buf test" + +if ! _have_program fio; then + echo "SKIP: fio not available" + exit "$UBLK_SKIP_CODE" +fi + +if ! 
grep -q hugetlbfs /proc/filesystems; then + echo "SKIP: hugetlbfs not supported" + exit "$UBLK_SKIP_CODE" +fi + +# Allocate hugepages +OLD_NR_HP=$(cat /proc/sys/vm/nr_hugepages) +echo 10 > /proc/sys/vm/nr_hugepages +NR_HP=$(cat /proc/sys/vm/nr_hugepages) +if [ "$NR_HP" -lt 2 ]; then + echo "SKIP: cannot allocate hugepages" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +# Mount hugetlbfs +HTLB_MNT=$(mktemp -d "${UBLK_TEST_DIR}/htlb_mnt_XXXXXX") +if ! mount -t hugetlbfs none "$HTLB_MNT"; then + echo "SKIP: cannot mount hugetlbfs" + rmdir "$HTLB_MNT" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +HTLB_FILE="$HTLB_MNT/ublk_buf" +fallocate -l 4M "$HTLB_FILE" + +dev_id=$(_add_ublk_dev -t null --shmem_zc --htlb "$HTLB_FILE" --rdonly_shmem_buf) +_check_add_dev $TID $? + +fio --name=htlb_zc_rdonly \ + --filename=/dev/ublkb"${dev_id}" \ + --ioengine=io_uring \ + --rw=randwrite \ + --direct=1 \ + --bs=4k \ + --size=4M \ + --iodepth=32 \ + --mem=mmaphuge:"$HTLB_FILE" \ + > /dev/null 2>&1 +ERR_CODE=$? + +# Delete device first so daemon releases the htlb mmap +_ublk_del_dev "${dev_id}" + +rm -f "$HTLB_FILE" +umount "$HTLB_MNT" +rmdir "$HTLB_MNT" +echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + +_cleanup_test "shmem_zc" + +_show_result $TID $ERR_CODE